toil 7.0.0__py3-none-any.whl → 8.1.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197)
  1. toil/__init__.py +124 -86
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +137 -77
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
  5. toil/batchSystems/awsBatch.py +237 -128
  6. toil/batchSystems/cleanup_support.py +22 -16
  7. toil/batchSystems/contained_executor.py +30 -26
  8. toil/batchSystems/gridengine.py +85 -49
  9. toil/batchSystems/htcondor.py +164 -87
  10. toil/batchSystems/kubernetes.py +622 -386
  11. toil/batchSystems/local_support.py +17 -12
  12. toil/batchSystems/lsf.py +132 -79
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +288 -149
  16. toil/batchSystems/mesos/executor.py +77 -49
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +39 -29
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +293 -123
  21. toil/batchSystems/slurm.py +651 -155
  22. toil/batchSystems/torque.py +46 -32
  23. toil/bus.py +141 -73
  24. toil/common.py +784 -397
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1137 -534
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +62 -41
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +88 -57
  32. toil/fileStores/cachingFileStore.py +711 -247
  33. toil/fileStores/nonCachingFileStore.py +113 -75
  34. toil/job.py +1031 -349
  35. toil/jobStores/abstractJobStore.py +387 -243
  36. toil/jobStores/aws/jobStore.py +772 -412
  37. toil/jobStores/aws/utils.py +161 -109
  38. toil/jobStores/conftest.py +1 -0
  39. toil/jobStores/fileJobStore.py +289 -151
  40. toil/jobStores/googleJobStore.py +137 -70
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +614 -269
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +55 -28
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +204 -58
  49. toil/lib/aws/utils.py +290 -213
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +83 -49
  53. toil/lib/docker.py +131 -103
  54. toil/lib/dockstore.py +379 -0
  55. toil/lib/ec2.py +322 -209
  56. toil/lib/ec2nodes.py +174 -105
  57. toil/lib/encryption/_dummy.py +5 -3
  58. toil/lib/encryption/_nacl.py +10 -6
  59. toil/lib/encryption/conftest.py +1 -0
  60. toil/lib/exceptions.py +26 -7
  61. toil/lib/expando.py +4 -2
  62. toil/lib/ftp_utils.py +217 -0
  63. toil/lib/generatedEC2Lists.py +127 -19
  64. toil/lib/history.py +1271 -0
  65. toil/lib/history_submission.py +681 -0
  66. toil/lib/humanize.py +6 -2
  67. toil/lib/io.py +121 -12
  68. toil/lib/iterables.py +4 -2
  69. toil/lib/memoize.py +12 -8
  70. toil/lib/misc.py +83 -18
  71. toil/lib/objects.py +2 -2
  72. toil/lib/resources.py +19 -7
  73. toil/lib/retry.py +125 -87
  74. toil/lib/threading.py +282 -80
  75. toil/lib/throttle.py +15 -14
  76. toil/lib/trs.py +390 -0
  77. toil/lib/web.py +38 -0
  78. toil/options/common.py +850 -402
  79. toil/options/cwl.py +185 -90
  80. toil/options/runner.py +50 -0
  81. toil/options/wdl.py +70 -19
  82. toil/provisioners/__init__.py +111 -46
  83. toil/provisioners/abstractProvisioner.py +322 -157
  84. toil/provisioners/aws/__init__.py +62 -30
  85. toil/provisioners/aws/awsProvisioner.py +980 -627
  86. toil/provisioners/clusterScaler.py +541 -279
  87. toil/provisioners/gceProvisioner.py +283 -180
  88. toil/provisioners/node.py +147 -79
  89. toil/realtimeLogger.py +34 -22
  90. toil/resource.py +137 -75
  91. toil/server/app.py +127 -61
  92. toil/server/celery_app.py +3 -1
  93. toil/server/cli/wes_cwl_runner.py +84 -55
  94. toil/server/utils.py +56 -31
  95. toil/server/wes/abstract_backend.py +64 -26
  96. toil/server/wes/amazon_wes_utils.py +21 -15
  97. toil/server/wes/tasks.py +121 -63
  98. toil/server/wes/toil_backend.py +142 -107
  99. toil/server/wsgi_app.py +4 -3
  100. toil/serviceManager.py +58 -22
  101. toil/statsAndLogging.py +183 -65
  102. toil/test/__init__.py +263 -179
  103. toil/test/batchSystems/batchSystemTest.py +438 -195
  104. toil/test/batchSystems/batch_system_plugin_test.py +18 -7
  105. toil/test/batchSystems/test_gridengine.py +173 -0
  106. toil/test/batchSystems/test_lsf_helper.py +67 -58
  107. toil/test/batchSystems/test_slurm.py +265 -49
  108. toil/test/cactus/test_cactus_integration.py +20 -22
  109. toil/test/cwl/conftest.py +39 -0
  110. toil/test/cwl/cwlTest.py +375 -72
  111. toil/test/cwl/measure_default_memory.cwl +12 -0
  112. toil/test/cwl/not_run_required_input.cwl +29 -0
  113. toil/test/cwl/optional-file.cwl +18 -0
  114. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  115. toil/test/docs/scriptsTest.py +60 -34
  116. toil/test/jobStores/jobStoreTest.py +412 -235
  117. toil/test/lib/aws/test_iam.py +116 -48
  118. toil/test/lib/aws/test_s3.py +16 -9
  119. toil/test/lib/aws/test_utils.py +5 -6
  120. toil/test/lib/dockerTest.py +118 -141
  121. toil/test/lib/test_conversions.py +113 -115
  122. toil/test/lib/test_ec2.py +57 -49
  123. toil/test/lib/test_history.py +212 -0
  124. toil/test/lib/test_misc.py +12 -5
  125. toil/test/lib/test_trs.py +161 -0
  126. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  127. toil/test/mesos/helloWorld.py +7 -6
  128. toil/test/mesos/stress.py +25 -20
  129. toil/test/options/options.py +7 -2
  130. toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
  131. toil/test/provisioners/clusterScalerTest.py +440 -250
  132. toil/test/provisioners/clusterTest.py +81 -42
  133. toil/test/provisioners/gceProvisionerTest.py +174 -100
  134. toil/test/provisioners/provisionerTest.py +25 -13
  135. toil/test/provisioners/restartScript.py +5 -4
  136. toil/test/server/serverTest.py +188 -141
  137. toil/test/sort/restart_sort.py +137 -68
  138. toil/test/sort/sort.py +134 -66
  139. toil/test/sort/sortTest.py +91 -49
  140. toil/test/src/autoDeploymentTest.py +140 -100
  141. toil/test/src/busTest.py +20 -18
  142. toil/test/src/checkpointTest.py +8 -2
  143. toil/test/src/deferredFunctionTest.py +49 -35
  144. toil/test/src/dockerCheckTest.py +33 -26
  145. toil/test/src/environmentTest.py +20 -10
  146. toil/test/src/fileStoreTest.py +538 -271
  147. toil/test/src/helloWorldTest.py +7 -4
  148. toil/test/src/importExportFileTest.py +61 -31
  149. toil/test/src/jobDescriptionTest.py +32 -17
  150. toil/test/src/jobEncapsulationTest.py +2 -0
  151. toil/test/src/jobFileStoreTest.py +74 -50
  152. toil/test/src/jobServiceTest.py +187 -73
  153. toil/test/src/jobTest.py +120 -70
  154. toil/test/src/miscTests.py +19 -18
  155. toil/test/src/promisedRequirementTest.py +82 -36
  156. toil/test/src/promisesTest.py +7 -6
  157. toil/test/src/realtimeLoggerTest.py +6 -6
  158. toil/test/src/regularLogTest.py +71 -37
  159. toil/test/src/resourceTest.py +80 -49
  160. toil/test/src/restartDAGTest.py +36 -22
  161. toil/test/src/resumabilityTest.py +9 -2
  162. toil/test/src/retainTempDirTest.py +45 -14
  163. toil/test/src/systemTest.py +12 -8
  164. toil/test/src/threadingTest.py +44 -25
  165. toil/test/src/toilContextManagerTest.py +10 -7
  166. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  167. toil/test/src/workerTest.py +33 -16
  168. toil/test/utils/toilDebugTest.py +70 -58
  169. toil/test/utils/toilKillTest.py +4 -5
  170. toil/test/utils/utilsTest.py +239 -102
  171. toil/test/wdl/wdltoil_test.py +789 -148
  172. toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
  173. toil/toilState.py +52 -26
  174. toil/utils/toilConfig.py +13 -4
  175. toil/utils/toilDebugFile.py +44 -27
  176. toil/utils/toilDebugJob.py +85 -25
  177. toil/utils/toilDestroyCluster.py +11 -6
  178. toil/utils/toilKill.py +8 -3
  179. toil/utils/toilLaunchCluster.py +251 -145
  180. toil/utils/toilMain.py +37 -16
  181. toil/utils/toilRsyncCluster.py +27 -14
  182. toil/utils/toilSshCluster.py +45 -22
  183. toil/utils/toilStats.py +75 -36
  184. toil/utils/toilStatus.py +226 -119
  185. toil/utils/toilUpdateEC2Instances.py +3 -1
  186. toil/version.py +6 -6
  187. toil/wdl/utils.py +5 -5
  188. toil/wdl/wdltoil.py +3528 -1053
  189. toil/worker.py +370 -149
  190. toil-8.1.0b1.dist-info/METADATA +178 -0
  191. toil-8.1.0b1.dist-info/RECORD +259 -0
  192. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/WHEEL +1 -1
  193. toil-7.0.0.dist-info/METADATA +0 -158
  194. toil-7.0.0.dist-info/RECORD +0 -244
  195. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/LICENSE +0 -0
  196. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/entry_points.txt +0 -0
  197. {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/top_level.txt +0 -0
toil/leader.py CHANGED
@@ -21,32 +21,36 @@ import os
  import pickle
  import sys
  import time
- from typing import Any, Dict, List, Optional, Set, Union
+ from typing import Any, Optional, Union
 
  import enlighten
 
  from toil import resolveEntryPoint
  from toil.batchSystems import DeadlockException
- from toil.batchSystems.abstractBatchSystem import (AbstractBatchSystem,
- BatchJobExitReason,
- EXIT_STATUS_UNAVAILABLE_VALUE)
- from toil.bus import (JobCompletedMessage,
- JobFailedMessage,
- JobIssuedMessage,
- JobMissingMessage,
- JobUpdatedMessage,
- QueueSizeMessage,
- gen_message_bus_path,
- get_job_kind)
+ from toil.batchSystems.abstractBatchSystem import (
+ EXIT_STATUS_UNAVAILABLE_VALUE,
+ AbstractBatchSystem,
+ BatchJobExitReason,
+ )
+ from toil.bus import (
+ JobCompletedMessage,
+ JobFailedMessage,
+ JobIssuedMessage,
+ JobMissingMessage,
+ JobUpdatedMessage,
+ QueueSizeMessage,
+ get_job_kind,
+ )
  from toil.common import Config, ToilMetrics
  from toil.cwl.utils import CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
  from toil.exceptions import FailedJobsException
- from toil.job import (CheckpointJobDescription,
- JobDescription,
- ServiceJobDescription,
- TemporaryID)
- from toil.jobStores.abstractJobStore import (AbstractJobStore,
- NoSuchJobException)
+ from toil.job import (
+ CheckpointJobDescription,
+ JobDescription,
+ ServiceJobDescription,
+ TemporaryID,
+ )
+ from toil.jobStores.abstractJobStore import AbstractJobStore, NoSuchJobException
  from toil.lib.throttle import LocalThrottle
  from toil.provisioners.abstractProvisioner import AbstractProvisioner
  from toil.provisioners.clusterScaler import ScalerThread
@@ -80,13 +84,15 @@ class Leader:
  consulting the job store, and issuing them in the batch system.
  """
 
- def __init__(self,
- config: Config,
- batchSystem: AbstractBatchSystem,
- provisioner: Optional[AbstractProvisioner],
- jobStore: AbstractJobStore,
- rootJob: JobDescription,
- jobCache: Optional[Dict[Union[str, TemporaryID], JobDescription]] = None) -> None:
+ def __init__(
+ self,
+ config: Config,
+ batchSystem: AbstractBatchSystem,
+ provisioner: Optional[AbstractProvisioner],
+ jobStore: AbstractJobStore,
+ rootJob: JobDescription,
+ jobCache: Optional[dict[Union[str, TemporaryID], JobDescription]] = None,
+ ) -> None:
  """
  Create a Toil Leader object.
 
@@ -116,19 +122,11 @@
  # state change information about jobs.
  self.toilState = ToilState(self.jobStore)
 
- if self.config.write_messages is None:
- # The user hasn't specified a place for the message bus so we
- # should make one.
- # pass in coordination_dir for toil-cwl-runner; we want to obey --tmpdir-prefix
- # from cwltool and we change the coordination_dir when detected. we don't want
- # to make another config attribute so put the message bus in the already prefixed dir
- # if a coordination_dir is provided normally, we can still put the bus in there
- # as the coordination dir should serve a similar purpose to the tmp directory
- self.config.write_messages = gen_message_bus_path(config.coordination_dir)
-
  # Message bus messages need to go to the given file.
  # Keep a reference to the return value so the listener stays alive.
- self._message_subscription = self.toilState.bus.connect_output_file(self.config.write_messages)
+ self._message_subscription = self.toilState.bus.connect_output_file(
+ self.config.write_messages
+ )
 
  # Connect to the message bus, so we will get all the messages of these
  # types in an inbox.
@@ -143,17 +141,22 @@
  # this, somehow, so they can also see messages from this?
  self.toilState.load_workflow(rootJob, jobCache=jobCache)
 
- logger.debug("Found %s jobs to start and %i jobs with successors to run",
- self._messages.count(JobUpdatedMessage), len(self.toilState.successorCounts))
+ logger.debug(
+ "Found %s jobs to start and %i jobs with successors to run",
+ self._messages.count(JobUpdatedMessage),
+ len(self.toilState.successorCounts),
+ )
 
  # Batch system
  self.batchSystem = batchSystem
  if len(self.batchSystem.getIssuedBatchJobIDs()) != 0:
- raise RuntimeError("The initialized batchsystem did not start with 0 active jobs.")
+ raise RuntimeError(
+ "The initialized batchsystem did not start with 0 active jobs."
+ )
  logger.debug("Checked batch system has no running jobs and no updated jobs")
 
  # Map of batch system IDs to job store IDs
- self.issued_jobs_by_batch_system_id: Dict[int, str] = {}
+ self.issued_jobs_by_batch_system_id: dict[int, str] = {}
 
  # Number of preemptible jobs currently being run by batch system
  self.preemptibleJobsIssued = 0
@@ -161,10 +164,12 @@
  # Tracking the number service jobs issued,
  # this is used limit the number of services issued to the batch system
  self.serviceJobsIssued = 0
- self.serviceJobsToBeIssued: List[str] = [] # A queue of IDs of service jobs that await scheduling
+ self.serviceJobsToBeIssued: list[str] = (
+ []
+ ) # A queue of IDs of service jobs that await scheduling
  # Equivalents for service jobs to be run on preemptible nodes
  self.preemptibleServiceJobsIssued = 0
- self.preemptibleServiceJobsToBeIssued: List[str] = []
+ self.preemptibleServiceJobsToBeIssued: list[str] = []
 
  # Timing of the rescuing method
  self.timeSinceJobsLastRescued = None
@@ -172,7 +177,7 @@
  # For each issued job's batch system ID, how many times did we not see
  # it when we should have? If this hits a threshold, the job is declared
  # missing and killed and possibly retried.
- self.reissueMissingJobs_missingHash: Dict[int, int] = {}
+ self.reissueMissingJobs_missingHash: dict[int, int] = {}
 
  # Class used to create/destroy nodes in the cluster, may be None if
  # using a statically defined cluster
@@ -190,7 +195,7 @@
  self.statsAndLogging = StatsAndLogging(self.jobStore, self.config)
 
  # Set used to monitor deadlocked jobs
- self.potentialDeadlockedJobs: Set[str] = set()
+ self.potentialDeadlockedJobs: set[str] = set()
  self.potentialDeadlockTime = 0
 
  # A dashboard that runs on the leader node in AWS clusters to track the state
@@ -198,8 +203,13 @@
  self.toilMetrics: Optional[ToilMetrics] = None
 
  # internal jobs we should not expose at top level debugging
- self.debugJobNames = ("CWLJob", "CWLWorkflow", "CWLScatter", "CWLGather",
- "ResolveIndirect")
+ self.debugJobNames = (
+ "CWLJob",
+ "CWLWorkflow",
+ "CWLScatter",
+ "CWLGather",
+ "ResolveIndirect",
+ )
 
  self.deadlockThrottler = LocalThrottle(self.config.deadlockCheckInterval)
 
@@ -217,8 +227,10 @@
  self.GOOD_COLOR = (0, 60, 108)
  self.BAD_COLOR = (253, 199, 0)
  # And set a format that shows failures
- self.PROGRESS_BAR_FORMAT = ('{desc}{desc_pad}{percentage:3.0f}%|{bar}| {count:{len_total}d}/{total:d} '
- '({count_1:d} failures) [{elapsed}<{eta}, {rate:.2f}{unit_pad}{unit}/s]')
+ self.PROGRESS_BAR_FORMAT = (
+ "{desc}{desc_pad}{percentage:3.0f}%|{bar}| {count:{len_total}d}/{total:d} "
+ "({count_1:d} failures) [{elapsed}<{eta}, {rate:.2f}{unit_pad}{unit}/s]"
+ )
  # TODO: No way to set background color on the terminal for the bar.
 
  # What exit code should the process use if the workflow failed?
@@ -236,16 +248,25 @@
  """
  self.jobStore.write_kill_flag(kill=False)
 
- with enlighten.get_manager(stream=sys.stderr, enabled=not self.config.disableProgress) as manager:
+ with enlighten.get_manager(
+ stream=sys.stderr, enabled=not self.config.disableProgress
+ ) as manager:
  # Set up the fancy console UI if desirable
- self.progress_overall = manager.counter(total=0, desc='Workflow Progress', unit='jobs',
- color=self.GOOD_COLOR, bar_format=self.PROGRESS_BAR_FORMAT)
+ self.progress_overall = manager.counter(
+ total=0,
+ desc="Workflow Progress",
+ unit="jobs",
+ color=self.GOOD_COLOR,
+ bar_format=self.PROGRESS_BAR_FORMAT,
+ )
  self.progress_failed = self.progress_overall.add_subcounter(self.BAD_COLOR)
 
  # Start the stats/logging aggregation thread
  self.statsAndLogging.start()
  if self.config.metrics:
- self.toilMetrics = ToilMetrics(self.toilState.bus, provisioner=self.provisioner)
+ self.toilMetrics = ToilMetrics(
+ self.toilState.bus, provisioner=self.provisioner
+ )
 
  try:
 
@@ -262,10 +283,13 @@
  self.innerLoop()
  finally:
  if self.clusterScaler is not None:
- logger.debug('Waiting for workers to shutdown.')
+ logger.debug("Waiting for workers to shutdown.")
  startTime = time.time()
  self.clusterScaler.shutdown()
- logger.debug('Worker shutdown complete in %s seconds.', time.time() - startTime)
+ logger.debug(
+ "Worker shutdown complete in %s seconds.",
+ time.time() - startTime,
+ )
 
  finally:
  # Ensure service manager thread is properly shutdown
@@ -278,16 +302,25 @@
  self.toilMetrics.shutdown()
 
  # Filter the failed jobs
- self.toilState.totalFailedJobs = [j for j in self.toilState.totalFailedJobs if self.toilState.job_exists(j)]
+ self.toilState.totalFailedJobs = [
+ j
+ for j in self.toilState.totalFailedJobs
+ if self.toilState.job_exists(j)
+ ]
 
  try:
  self.create_status_sentinel_file(self.toilState.totalFailedJobs)
  except OSError as e:
- logger.debug(f'Error from importFile with hardlink=True: {e}')
+ logger.debug(f"Error from importFile with hardlink=True: {e}")
 
- logger.info("Finished toil run %s" %
- ("successfully." if not self.toilState.totalFailedJobs \
- else ("with %s failed jobs." % len(self.toilState.totalFailedJobs))))
+ logger.info(
+ "Finished toil run %s"
+ % (
+ "successfully."
+ if not self.toilState.totalFailedJobs
+ else ("with %s failed jobs." % len(self.toilState.totalFailedJobs))
+ )
+ )
 
  if len(self.toilState.totalFailedJobs):
  failed_jobs = []
@@ -300,19 +333,28 @@
  # Job actually finished and was removed
  pass
 
- logger.info("Failed jobs at end of the run: %s", ' '.join(str(j) for j in failed_jobs))
- raise FailedJobsException(self.jobStore, failed_jobs, exit_code=self.recommended_fail_exit_code)
+ logger.info(
+ "Failed jobs at end of the run: %s",
+ " ".join(str(j) for j in failed_jobs),
+ )
+ raise FailedJobsException(
+ self.jobStore,
+ failed_jobs,
+ exit_code=self.recommended_fail_exit_code,
+ )
 
  return self.jobStore.get_root_job_return_value()
 
  def create_status_sentinel_file(self, fail: bool) -> None:
  """Create a file in the jobstore indicating failure or success."""
- logName = 'failed.log' if fail else 'succeeded.log'
+ logName = "failed.log" if fail else "succeeded.log"
  localLog = os.path.join(os.getcwd(), logName)
- open(localLog, 'w').close()
- self.jobStore.import_file('file://' + localLog, logName, hardlink=True)
+ open(localLog, "w").close()
+ self.jobStore.import_file("file://" + localLog, logName, hardlink=True)
 
- if os.path.exists(localLog): # Bandaid for Jenkins tests failing stochastically and unexplainably.
+ if os.path.exists(
+ localLog
+ ): # Bandaid for Jenkins tests failing stochastically and unexplainably.
  os.remove(localLog)
 
  def _handledFailedSuccessor(self, successor_id: str, predecessor_id: str) -> bool:
@@ -324,8 +366,11 @@
  :returns: True if there are still active successors.
  False if all successors have failed and the job is queued to run to handle the failed successors.
  """
- logger.debug("Successor job: %s of job: %s has failed """
- "predecessors", self.toilState.get_job(successor_id), self.toilState.get_job(predecessor_id))
+ logger.debug(
+ "Successor job: %s of job: %s has failed " "" "predecessors",
+ self.toilState.get_job(successor_id),
+ self.toilState.get_job(predecessor_id),
+ )
 
  # Add the job to the set having failed successors
  self.toilState.hasFailedSuccessors.add(predecessor_id)
@@ -339,9 +384,12 @@
  # If the job now has no active successors, add to active jobs
  # so it can be processed as a job with failed successors.
  if self.toilState.count_pending_successors(predecessor_id) == 0:
- logger.debug("Job: %s has no successors to run "
- "and some are failed, adding to list of jobs "
- "with failed successors", self.toilState.get_job(predecessor_id))
+ logger.debug(
+ "Job: %s has no successors to run "
+ "and some are failed, adding to list of jobs "
+ "with failed successors",
+ self.toilState.get_job(predecessor_id),
+ )
  self._messages.publish(JobUpdatedMessage(predecessor_id, 0))
  # Report no successors are running
  return False
@@ -349,7 +397,9 @@
  # Some successors are still active
  return True
 
- def _checkSuccessorReadyToRunMultiplePredecessors(self, successor_id: str, predecessor_id: str) -> bool:
+ def _checkSuccessorReadyToRunMultiplePredecessors(
+ self, successor_id: str, predecessor_id: str
+ ) -> bool:
  """
  Check if a successor job is ready to run when there are multiple predecessors.
 
@@ -370,8 +420,11 @@
  # Grab the predecessor for reporting
  predecessor = self.toilState.get_job(predecessor_id)
 
- logger.debug("Successor job: %s of job: %s has multiple "
- "predecessors", successor, predecessor)
+ logger.debug(
+ "Successor job: %s of job: %s has multiple " "predecessors",
+ successor,
+ predecessor,
+ )
 
  # Add the predecessor as a finished predecessor to the successor
  successor.predecessorsFinished.add(predecessor_id)
@@ -390,13 +443,17 @@
  if len(successor.predecessorsFinished) == successor.predecessorNumber:
  # All the successor's predecessors are done now.
  # Remove the successor job from the set of waiting multi-predecessor jobs.
- self.toilState.jobsToBeScheduledWithMultiplePredecessors.remove(successor_id)
+ self.toilState.jobsToBeScheduledWithMultiplePredecessors.remove(
+ successor_id
+ )
  return True
  else:
  # The job is not ready to run
  return False
 
- def _makeJobSuccessorReadyToRun(self, successor_id: str, predecessor_id: str) -> bool:
+ def _makeJobSuccessorReadyToRun(
+ self, successor_id: str, predecessor_id: str
+ ) -> bool:
  """
  Make a successor job ready to run if possible.
 
@@ -404,7 +461,7 @@
  :param predecessor_id: The job which the successor comes after.
  :returns: False if the successor job should not yet be run or True otherwise.
  """
- #Build map from successor to predecessors.
+ # Build map from successor to predecessors.
  if successor_id not in self.toilState.successor_to_predecessors:
  self.toilState.successor_to_predecessors[successor_id] = set()
  if not isinstance(successor_id, str):
@@ -415,9 +472,15 @@
 
  # Grab the successor
  successor = self.toilState.get_job(successor_id)
- logger.debug("Added job %s as coming after job %s", successor, self.toilState.get_job(predecessor_id))
+ logger.debug(
+ "Added job %s as coming after job %s",
+ successor,
+ self.toilState.get_job(predecessor_id),
+ )
  if successor.predecessorNumber > 1:
- return self._checkSuccessorReadyToRunMultiplePredecessors(successor_id, predecessor_id)
+ return self._checkSuccessorReadyToRunMultiplePredecessors(
+ successor_id, predecessor_id
+ )
  else:
  return True
 
@@ -436,13 +499,20 @@
  next_successors = predecessor.nextSuccessors()
 
  if next_successors is None or len(next_successors) == 0:
- raise RuntimeError(f"Job {self} trying to run successors, but it doesn't have any")
- logger.debug("Job: %s has %i successors to schedule",
- predecessor_id, len(next_successors))
- #Record the number of successors that must be completed before
- #the job can be considered again
+ raise RuntimeError(
+ f"Job {self} trying to run successors, but it doesn't have any"
+ )
+ logger.debug(
+ "Job: %s has %i successors to schedule",
+ predecessor_id,
+ len(next_successors),
+ )
+ # Record the number of successors that must be completed before
+ # the job can be considered again
  if self.toilState.count_pending_successors(predecessor_id) != 0:
- raise RuntimeError('Attempted to schedule successors of the same job twice!')
+ raise RuntimeError(
+ "Attempted to schedule successors of the same job twice!"
+ )
  self.toilState.successors_pending(predecessor_id, len(next_successors))
 
  # For each successor schedule if all predecessors have been completed
@@ -453,7 +523,11 @@
  except NoSuchJobException:
  # Job already done and gone, but probably shouldn't be. Or maybe isn't visible yet.
  # TODO: Shouldn't this be an error?
- logger.warning("Job %s is a successor of %s but is already done and gone.", successor_id, predecessor_id)
+ logger.warning(
+ "Job %s is a successor of %s but is already done and gone.",
+ successor_id,
+ predecessor_id,
+ )
  # Don't try and run it
  continue
  if self._makeJobSuccessorReadyToRun(successor_id, predecessor_id):
@@ -475,46 +549,62 @@
  # The job has services running; signal for them to be killed.
  # Once they are killed, then the job will be updated again and then
  # scheduled to be removed.
- logger.warning("Telling job %s to terminate its services due to successor failure",
- predecessor)
- self.serviceManager.kill_services(self.toilState.servicesIssued[predecessor_id],
- error=True)
+ logger.warning(
+ "Telling job %s to terminate its services due to successor failure",
+ predecessor,
+ )
+ self.serviceManager.kill_services(
+ self.toilState.servicesIssued[predecessor_id], error=True
+ )
  elif self.toilState.count_pending_successors(predecessor_id) > 0:
  # The job has non-service jobs running; wait for them to finish.
  # the job will be re-added to the updated jobs when these jobs
  # are done
- logger.debug("Job %s with ID: %s with failed successors still has successor jobs running",
- predecessor, predecessor_id)
- elif (isinstance(predecessor, CheckpointJobDescription) and
- predecessor.checkpoint is not None and
- predecessor.remainingTryCount > 1):
+ logger.debug(
+ "Job %s with ID: %s with failed successors still has successor jobs running",
+ predecessor,
+ predecessor_id,
+ )
+ elif (
+ isinstance(predecessor, CheckpointJobDescription)
+ and predecessor.checkpoint is not None
+ and predecessor.remainingTryCount > 1
+ ):
  # If the job is a checkpoint and has remaining retries...
  # The logic behind using > 1 rather than > 0 here: Since this job has
  # been tried once (without decreasing its try count as the job
  # itself was successful), and its subtree failed, it shouldn't be retried
  # unless it has more than 1 try.
  if predecessor_id in self.toilState.jobs_issued:
- logger.debug('Checkpoint job %s was updated while issued', predecessor_id)
+ logger.debug(
+ "Checkpoint job %s was updated while issued", predecessor_id
+ )
  else:
  # It hasn't already been reissued.
  # This check lets us be robust against repeated job update
  # messages (such as from services starting *and* failing), by
  # making sure that we don't stay in a state that where we
  # reissue the job every time we get one.
- logger.warning('Job: %s is being restarted as a checkpoint after the total '
- 'failure of jobs in its subtree.', predecessor_id)
+ logger.warning(
+ "Job: %s is being restarted as a checkpoint after the total "
+ "failure of jobs in its subtree.",
+ predecessor_id,
+ )
  self.issueJob(predecessor)
  else:
  # Mark it totally failed
- logger.debug("Job %s is being processed as completely failed", predecessor_id)
+ logger.debug(
+ "Job %s is being processed as completely failed", predecessor_id
+ )
  self.processTotallyFailedJob(predecessor_id)
 
  def _processReadyJob(self, job_id: str, result_status: int):
  # We operate on the JobDescription mostly.
  readyJob = self.toilState.get_job(job_id)
 
- logger.debug('Updating status of job %s with result status: %s',
- readyJob, result_status)
+ logger.debug(
+ "Updating status of job %s with result status: %s", readyJob, result_status
+ )
 
  # TODO: Filter out nonexistent successors/services now, so we can tell
  # if they are all done and the job needs deleting?
@@ -527,8 +617,11 @@
  # want to act on it; we want to wait until it gets the update it
  # gets when the service manager is done trying to start its
  # services.
- logger.debug("Got a job to update which is still owned by the service "
- "manager: %s", readyJob.jobStoreID)
+ logger.debug(
+ "Got a job to update which is still owned by the service "
+ "manager: %s",
+ readyJob.jobStoreID,
+ )
  elif readyJob.jobStoreID in self.toilState.hasFailedSuccessors:
  self._processFailedSuccessors(job_id)
  elif readyJob.has_body() or result_status != 0:
@@ -542,8 +635,9 @@
 
  # If the job has run out of tries or is a service job whose error flag has
  # been indicated, fail the job.
- if (readyJob.remainingTryCount == 0 or
- (isServiceJob and not self.jobStore.file_exists(readyJob.errorJobStoreID))):
+ if readyJob.remainingTryCount == 0 or (
+ isServiceJob and not self.jobStore.file_exists(readyJob.errorJobStoreID)
+ ):
  self.processTotallyFailedJob(job_id)
  logger.warning("Job %s is completely failed", readyJob)
  else:
@@ -554,29 +648,39 @@
  # Build a map from the service jobs to the job and a map
  # of the services created for the job
  if readyJob.jobStoreID in self.toilState.servicesIssued:
- raise RuntimeError(f"The ready job: {readyJob.jobStoreID} was already issued.")
+ raise RuntimeError(
+ f"The ready job: {readyJob.jobStoreID} was already issued."
+ )
  self.toilState.servicesIssued[readyJob.jobStoreID] = set()
  for serviceJobList in readyJob.serviceHostIDsInBatches():
  for serviceID in serviceJobList:
  if serviceID in self.toilState.service_to_client:
- raise RuntimeError(f"The ready service ID: {serviceID} was already added.")
+ raise RuntimeError(
+ f"The ready service ID: {serviceID} was already added."
+ )
  # TODO: Why do we refresh here?
  self.toilState.reset_job(serviceID)
  serviceHost = self.toilState.get_job(serviceID)
  self.toilState.service_to_client[serviceID] = readyJob.jobStoreID
  self.toilState.servicesIssued[readyJob.jobStoreID].add(serviceID)
 
- logger.debug("Giving job: %s to service manager to schedule its jobs", readyJob)
+ logger.debug(
+ "Giving job: %s to service manager to schedule its jobs", readyJob
+ )
  # Use the service manager to start the services
  self.serviceManager.put_client(job_id)
  elif readyJob.nextSuccessors() is not None:
  # There are successors to run
  self._runJobSuccessors(job_id)
  elif readyJob.jobStoreID in self.toilState.servicesIssued:
- logger.debug("Telling job: %s to terminate its services due to the "
- "successful completion of its successor jobs",
- readyJob)
- self.serviceManager.kill_services(self.toilState.servicesIssued[readyJob.jobStoreID], error=False)
+ logger.debug(
+ "Telling job: %s to terminate its services due to the "
+ "successful completion of its successor jobs",
+ readyJob,
+ )
+ self.serviceManager.kill_services(
+ self.toilState.servicesIssued[readyJob.jobStoreID], error=False
+ )
  else:
  # There are no remaining tasks to schedule within the job.
  #
@@ -605,7 +709,10 @@
  try:
  self.toilState.delete_job(readyJob.jobStoreID)
  except Exception as e:
- logger.exception("Re-processing success for job we could not remove: %s", readyJob)
+ logger.exception(
+ "Re-processing success for job we could not remove: %s",
+ readyJob,
+ )
  # Kick it back to being handled as succeeded again. We
  # don't want to have a failure here cause a Toil-level
  # retry which causes more actual jobs to try to run.
@@ -617,12 +724,18 @@
  self.processRemovedJob(readyJob, 0)
  else:
  self.processTotallyFailedJob(job_id)
- logger.error("Job: %s is empty but completely failed - something is very wrong", readyJob.jobStoreID)
+ logger.error(
+ "Job: %s is empty but completely failed - something is very wrong",
+ readyJob.jobStoreID,
+ )
 
  def _processReadyJobs(self):
  """Process jobs that are ready to be scheduled/have successors to schedule."""
- logger.debug('Built the jobs list, currently have %i jobs to update and %i jobs issued',
- self._messages.count(JobUpdatedMessage), self.getNumberOfJobsIssued())
+ logger.debug(
+ "Built the jobs list, currently have %i jobs to update and %i jobs issued",
+ self._messages.count(JobUpdatedMessage),
+ self.getNumberOfJobsIssued(),
+ )
 
  # Now go through and, for each job that has updated this tick, process it.
 
@@ -637,9 +750,13 @@
  if message.job_id in handled_with_status:
  if handled_with_status[message.job_id] == message.result_status:
  # This is a harmless duplicate
- logger.debug("Job %s already updated this tick with status %s and "
- "we've received duplicate message %s", message.job_id,
- handled_with_status[message.job_id], message)
+ logger.debug(
+ "Job %s already updated this tick with status %s and "
+ "we've received duplicate message %s",
+ message.job_id,
+ handled_with_status[message.job_id],
+ message,
+ )
  else:
  # This is a conflicting update. We may have already treated
  # a job as succeeding but now we've heard it's failed, or
@@ -647,9 +764,13 @@
  # This probably shouldn't happen, but does because the
  # scheduler is not correct somehow and hasn't been for a
  # long time. Complain about it.
- logger.warning("Job %s already updated this tick with status %s "
- "but we've now received %s", message.job_id,
- handled_with_status[message.job_id], message)
+ logger.warning(
+ "Job %s already updated this tick with status %s "
+ "but we've now received %s",
+ message.job_id,
+ handled_with_status[message.job_id],
+ message,
+ )
  # Either way, we only want to handle one update per tick, like
  # the old dict-based implementation.
  continue
@@ -667,16 +788,21 @@
  if service_id is None:
  break
 
- logger.debug('Launching service job: %s', self.toilState.get_job(service_id))
+ logger.debug(
+ "Launching service job: %s", self.toilState.get_job(service_id)
+ )
  self.issueServiceJob(service_id)
 
  def _processJobsWithRunningServices(self):
  """Get jobs whose services have started."""
  while True:
  client_id = self.serviceManager.get_ready_client(0)
- if client_id is None: # Stop trying to get jobs when function returns None
+ if client_id is None:  # Stop trying to get jobs when function returns None
  break
- logger.debug('Job: %s has established its services; all services are running', client_id)
+ logger.debug(
+ "Job: %s has established its services; all services are running",
+ client_id,
+ )
 
  # Grab the client job description
  client = self.toilState.get_job(client_id)
@@ -689,9 +815,9 @@
  """Get jobs whose services have failed to start."""
  while True:
  client_id = self.serviceManager.get_unservable_client(0)
- if client_id is None: # Stop trying to get jobs when function returns None
+ if client_id is None:  # Stop trying to get jobs when function returns None
  break
- logger.debug('Job: %s has failed to establish its services.', client_id)
+ logger.debug("Job: %s has failed to establish its services.", client_id)
 
  # Grab the client job description
  client = self.toilState.get_job(client_id)
@@ -706,30 +832,56 @@
  def _gatherUpdatedJobs(self, updatedJobTuple):
  """Gather any new, updated JobDescriptions from the batch system."""
  bsID, exitStatus, exitReason, wallTime = (
- updatedJobTuple.jobID, updatedJobTuple.exitStatus, updatedJobTuple.exitReason,
- updatedJobTuple.wallTime)
+ updatedJobTuple.jobID,
+ updatedJobTuple.exitStatus,
+ updatedJobTuple.exitReason,
+ updatedJobTuple.wallTime,
+ )
  # easy, track different state
  try:
- updatedJob = self.toilState.get_job(self.issued_jobs_by_batch_system_id[bsID])
+ updatedJob = self.toilState.get_job(
+ self.issued_jobs_by_batch_system_id[bsID]
+ )
  except KeyError:
- logger.warning("A result seems to already have been processed for job %s", bsID)
+ logger.warning(
+ "A result seems to already have been processed for job %s", bsID
+ )
  else:
  if exitStatus == 0:
- logger.debug('Job ended: %s', updatedJob)
+ logger.debug("Job ended: %s", updatedJob)
  else:
- status_string = str(exitStatus) if exitStatus != EXIT_STATUS_UNAVAILABLE_VALUE else "<UNAVAILABLE>"
- logger.warning(f'Job failed with exit value {status_string}: {updatedJob}\n'
- f'Exit reason: {BatchJobExitReason.to_string(exitReason)}')
+ status_string = (
+ str(exitStatus)
+ if exitStatus != EXIT_STATUS_UNAVAILABLE_VALUE
+ else "<UNAVAILABLE>"
+ )
+ logger.warning(
+ f"Job failed with exit value {status_string}: {updatedJob}\n"
+ f"Exit reason: {BatchJobExitReason.to_string(exitReason)}"
+ )
+ # This logic is undefined for which of the failing jobs will send its exit code
+ # when there are multiple failing jobs with different exit statuses
+ self.recommended_fail_exit_code = exitStatus
  if exitStatus == CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE:
  # This is a CWL job informing us that the workflow is
  # asking things of us that Toil can't do. When we raise an
  # exception because of this, make sure to forward along
  # this exit code.
  logger.warning("This indicates an unsupported CWL requirement!")
- self.recommended_fail_exit_code = CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
+ self.recommended_fail_exit_code = (
+ CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
+ )
  # Tell everyone it stopped running.
- self._messages.publish(JobCompletedMessage(get_job_kind(updatedJob.get_names()), updatedJob.jobStoreID, exitStatus))
- self.process_finished_job(bsID, exitStatus, wall_time=wallTime, exit_reason=exitReason)
+ self._messages.publish(
+ JobCompletedMessage(
+ get_job_kind(updatedJob.get_names()),
+ updatedJob.jobStoreID,
+ exitStatus,
+ )
+ )
+ self.process_finished_job(
+ bsID, exitStatus, wall_time=wallTime, exit_reason=exitReason
+ )
 
  def _processLostJobs(self):
  """Process jobs that have gone awry."""
@@ -737,7 +889,9 @@ class Leader:
737
889
  # gather for rescueJobsFrequency seconds) check if there are any jobs
738
890
  # that have run too long (see self.reissueOverLongJobs) or which have
739
891
  # gone missing from the batch system (see self.reissueMissingJobs)
740
- if ((time.time() - self.timeSinceJobsLastRescued) >= self.config.rescueJobsFrequency):
892
+ if (
893
+ time.time() - self.timeSinceJobsLastRescued
894
+ ) >= self.config.rescueJobsFrequency:
741
895
  # We only rescue jobs every N seconds, and when we have apparently
742
896
  # exhausted the current job supply
743
897
  self.reissueOverLongJobs()
@@ -757,9 +911,11 @@ class Leader:
757
911
  """
758
912
  self.timeSinceJobsLastRescued = time.time()
759
913
 
760
- while self._messages.count(JobUpdatedMessage) > 0 or \
761
- self.getNumberOfJobsIssued() or \
762
- self.serviceManager.get_job_count():
914
+ while (
915
+ self._messages.count(JobUpdatedMessage) > 0
916
+ or self.getNumberOfJobsIssued()
917
+ or self.serviceManager.get_job_count()
918
+ ):
763
919
 
764
920
  if self._messages.count(JobUpdatedMessage) > 0:
765
921
  self._processReadyJobs()
@@ -811,13 +967,21 @@ class Leader:
811
967
  if not self._messages.empty():
812
968
  raise RuntimeError(f"Pending messages at shutdown: {self._messages}")
813
969
  if self.toilState.successorCounts != {}:
814
- raise RuntimeError(f"Jobs waiting on successors at shutdown: {self.toilState.successorCounts}")
970
+ raise RuntimeError(
971
+ f"Jobs waiting on successors at shutdown: {self.toilState.successorCounts}"
972
+ )
815
973
  if self.toilState.successor_to_predecessors != {}:
816
- raise RuntimeError(f"Successors pending for their predecessors at shutdown: {self.toilState.successor_to_predecessors}")
974
+ raise RuntimeError(
975
+ f"Successors pending for their predecessors at shutdown: {self.toilState.successor_to_predecessors}"
976
+ )
817
977
  if self.toilState.service_to_client != {}:
818
- raise RuntimeError(f"Services pending for their clients at shutdown: {self.toilState.service_to_client}")
978
+ raise RuntimeError(
979
+ f"Services pending for their clients at shutdown: {self.toilState.service_to_client}"
980
+ )
819
981
  if self.toilState.servicesIssued != {}:
820
- raise RuntimeError(f"Services running at shutdown: {self.toilState.servicesIssued}")
982
+ raise RuntimeError(
983
+ f"Services running at shutdown: {self.toilState.servicesIssued}"
984
+ )
821
985
 
822
986
  def checkForDeadlocks(self):
823
987
  """Check if the system is deadlocked running service jobs."""
@@ -827,18 +991,22 @@ class Leader:
827
991
  # If there are no updated jobs and at least some jobs running
828
992
  if totalServicesIssued >= totalRunningJobs and totalRunningJobs > 0:
829
993
  # Collect all running service job store IDs into a set to compare with the deadlock set
830
- running_service_ids: Set[str] = set()
994
+ running_service_ids: set[str] = set()
831
995
  for js_id in self.issued_jobs_by_batch_system_id.values():
832
996
  job = self.toilState.get_job(js_id)
833
- if isinstance(job, ServiceJobDescription) and self.serviceManager.is_running(js_id):
997
+ if isinstance(
998
+ job, ServiceJobDescription
999
+ ) and self.serviceManager.is_running(js_id):
834
1000
  running_service_ids.add(js_id)
835
1001
 
836
1002
  if len(running_service_ids) > totalRunningJobs:
837
1003
  # This is too many services.
838
1004
  # TODO: couldn't more jobs have started since we polled the
839
1005
  # running job count?
840
- raise RuntimeError(f"Supposedly running {len(running_service_ids)} services, which is"
841
- f"more than the {totalRunningJobs} currently running jobs overall.")
1006
+ raise RuntimeError(
1007
+ f"Supposedly running {len(running_service_ids)} services, which is"
1008
+ f"more than the {totalRunningJobs} currently running jobs overall."
1009
+ )
842
1010
 
843
1011
  # If all the running jobs are active services then we have a potential deadlock
844
1012
  if len(running_service_ids) == totalRunningJobs:
@@ -852,27 +1020,49 @@ class Leader:
852
1020
  # Use a generic message if none is available
853
1021
  message = "Cluster may be too small."
854
1022
 
855
-
856
1023
  # See if this is a new potential deadlock
857
1024
  if self.potentialDeadlockedJobs != running_service_ids:
858
- logger.warning(("Potential deadlock detected! All %s running jobs are service jobs, "
859
- "with no normal jobs to use them! %s"), totalRunningJobs, message)
1025
+ logger.warning(
1026
+ (
1027
+ "Potential deadlock detected! All %s running jobs are service jobs, "
1028
+ "with no normal jobs to use them! %s"
1029
+ ),
1030
+ totalRunningJobs,
1031
+ message,
1032
+ )
860
1033
  self.potentialDeadlockedJobs = running_service_ids
861
1034
  self.potentialDeadlockTime = time.time()
862
1035
  else:
863
1036
  # We wait self.config.deadlockWait seconds before declaring the system deadlocked
864
1037
  stuckFor = time.time() - self.potentialDeadlockTime
865
1038
  if stuckFor >= self.config.deadlockWait:
866
- logger.error("We have been deadlocked since %s on these service jobs: %s",
867
- self.potentialDeadlockTime, self.potentialDeadlockedJobs)
868
- raise DeadlockException(("The workflow is service deadlocked - all %d running jobs "
869
- "have been the same active services for at least %s seconds") % (totalRunningJobs, self.config.deadlockWait))
1039
+ logger.error(
1040
+ "We have been deadlocked since %s on these service jobs: %s",
1041
+ self.potentialDeadlockTime,
1042
+ self.potentialDeadlockedJobs,
1043
+ )
1044
+ raise DeadlockException(
1045
+ (
1046
+ "The workflow is service deadlocked - all %d running jobs "
1047
+ "have been the same active services for at least %s seconds"
1048
+ )
1049
+ % (totalRunningJobs, self.config.deadlockWait)
1050
+ )
870
1051
  else:
871
1052
  # Complain that we are still stuck.
872
- waitingNormalJobs = self.getNumberOfJobsIssued() - totalServicesIssued
873
- logger.warning(("Potentially deadlocked for %.0f seconds. Waiting at most %.0f more seconds "
874
- "for any of %d issued non-service jobs to schedule and start. %s"),
875
- stuckFor, self.config.deadlockWait - stuckFor, waitingNormalJobs, message)
1053
+ waitingNormalJobs = (
1054
+ self.getNumberOfJobsIssued() - totalServicesIssued
1055
+ )
1056
+ logger.warning(
1057
+ (
1058
+ "Potentially deadlocked for %.0f seconds. Waiting at most %.0f more seconds "
1059
+ "for any of %d issued non-service jobs to schedule and start. %s"
1060
+ ),
1061
+ stuckFor,
1062
+ self.config.deadlockWait - stuckFor,
1063
+ waitingNormalJobs,
1064
+ message,
1065
+ )
876
1066
  else:
877
1067
  # We have observed non-service jobs running, so reset the potential deadlock
878
1068
  self.feed_deadlock_watchdog()
@@ -893,29 +1083,38 @@ class Leader:
893
1083
  """Add a job to the queue of jobs currently trying to run."""
894
1084
  # Never issue the same job multiple times simultaneously
895
1085
  if jobNode.jobStoreID in self.toilState.jobs_issued:
896
- raise RuntimeError(f"Attempted to issue {jobNode} multiple times simultaneously!")
1086
+ raise RuntimeError(
1087
+ f"Attempted to issue {jobNode} multiple times simultaneously!"
1088
+ )
897
1089
 
898
- workerCommand = [resolveEntryPoint('_toil_worker'),
899
- jobNode.jobName,
900
- self.jobStoreLocator,
901
- jobNode.jobStoreID]
1090
+ workerCommand = [
1091
+ resolveEntryPoint("_toil_worker"),
1092
+ jobNode.jobName,
1093
+ self.jobStoreLocator,
1094
+ jobNode.jobStoreID,
1095
+ ]
902
1096
 
903
1097
  for context in self.batchSystem.getWorkerContexts():
904
1098
  # For each context manager hook the batch system wants to run in
905
1099
  # the worker, serialize and send it.
906
- workerCommand.append('--context')
907
- workerCommand.append(base64.b64encode(pickle.dumps(context)).decode('utf-8'))
1100
+ workerCommand.append("--context")
1101
+ workerCommand.append(
1102
+ base64.b64encode(pickle.dumps(context)).decode("utf-8")
1103
+ )
908
1104
 
909
- omp_threads = os.environ.get('OMP_NUM_THREADS') \
910
- or str(max(1, int(jobNode.cores))) # make sure OMP_NUM_THREADS is a positive integer
1105
+ omp_threads = os.environ.get("OMP_NUM_THREADS") or str(
1106
+ max(1, int(jobNode.cores))
1107
+ ) # make sure OMP_NUM_THREADS is a positive integer
911
1108
 
912
1109
  job_environment = {
913
1110
  # Set the number of cores used by OpenMP applications
914
- 'OMP_NUM_THREADS': omp_threads,
1111
+ "OMP_NUM_THREADS": omp_threads,
915
1112
  }
916
1113
 
917
1114
  # jobBatchSystemID is an int for each job
918
- jobBatchSystemID = self.batchSystem.issueBatchJob(' '.join(workerCommand), jobNode, job_environment=job_environment)
1115
+ jobBatchSystemID = self.batchSystem.issueBatchJob(
1116
+ " ".join(workerCommand), jobNode, job_environment=job_environment
1117
+ )
919
1118
  # Record the job by the ID the batch system will use to talk about it with us
920
1119
  self.issued_jobs_by_batch_system_id[jobBatchSystemID] = jobNode.jobStoreID
921
1120
  # Record that this job is issued right now and shouldn't e.g. be issued again.
@@ -925,11 +1124,18 @@ class Leader:
925
1124
  # so increment this value after the job is added to the issuedJob dict
926
1125
  self.preemptibleJobsIssued += 1
927
1126
  cur_logger = logger.debug if jobNode.local else logger.info
928
- cur_logger("Issued job %s with job batch system ID: "
929
- "%s and %s",
930
- jobNode, str(jobBatchSystemID), jobNode.requirements_string())
1127
+ cur_logger(
1128
+ "Issued job %s with job batch system ID: " "%s and %s",
1129
+ jobNode,
1130
+ str(jobBatchSystemID),
1131
+ jobNode.requirements_string(),
1132
+ )
931
1133
  # Tell everyone it is issued and the queue size changed
932
- self._messages.publish(JobIssuedMessage(get_job_kind(jobNode.get_names()), jobNode.jobStoreID, jobBatchSystemID))
1134
+ self._messages.publish(
1135
+ JobIssuedMessage(
1136
+ get_job_kind(jobNode.get_names()), jobNode.jobStoreID, jobBatchSystemID
1137
+ )
1138
+ )
933
1139
  self._messages.publish(QueueSizeMessage(self.getNumberOfJobsIssued()))
934
1140
  # Tell the user there's another job to do
935
1141
  self.progress_overall.total += 1
@@ -949,7 +1155,9 @@ class Leader:
949
1155
  # Grab the service job description
950
1156
  service = self.toilState.get_job(service_id)
951
1157
  if not isinstance(service, ServiceJobDescription):
952
- raise RuntimeError("The grabbed service job description is not the right type.")
1158
+ raise RuntimeError(
1159
+ "The grabbed service job description is not the right type."
1160
+ )
953
1161
 
954
1162
  if service.preemptible:
955
1163
  self.preemptibleServiceJobsToBeIssued.append(service_id)
@@ -959,14 +1167,23 @@ class Leader:
959
1167
 
960
1168
  def issueQueingServiceJobs(self):
961
1169
  """Issues any queuing service jobs up to the limit of the maximum allowed."""
962
- while len(self.serviceJobsToBeIssued) > 0 and self.serviceJobsIssued < self.config.maxServiceJobs:
1170
+ while (
1171
+ len(self.serviceJobsToBeIssued) > 0
1172
+ and self.serviceJobsIssued < self.config.maxServiceJobs
1173
+ ):
963
1174
  self.issueJob(self.toilState.get_job(self.serviceJobsToBeIssued.pop()))
964
1175
  self.serviceJobsIssued += 1
965
- while len(self.preemptibleServiceJobsToBeIssued) > 0 and self.preemptibleServiceJobsIssued < self.config.maxPreemptibleServiceJobs:
966
- self.issueJob(self.toilState.get_job(self.preemptibleServiceJobsToBeIssued.pop()))
1176
+ while (
1177
+ len(self.preemptibleServiceJobsToBeIssued) > 0
1178
+ and self.preemptibleServiceJobsIssued
1179
+ < self.config.maxPreemptibleServiceJobs
1180
+ ):
1181
+ self.issueJob(
1182
+ self.toilState.get_job(self.preemptibleServiceJobsToBeIssued.pop())
1183
+ )
967
1184
  self.preemptibleServiceJobsIssued += 1
968
1185
 
969
- def getNumberOfJobsIssued(self, preemptible: Optional[bool]=None) -> int:
1186
+ def getNumberOfJobsIssued(self, preemptible: Optional[bool] = None) -> int:
970
1187
  """
971
1188
  Get number of jobs that have been added by issueJob(s) and not removed by removeJob.
972
1189
 
@@ -1016,12 +1233,16 @@ class Leader:
1016
1233
  """
1017
1234
  if jobBatchSystemID not in self.issued_jobs_by_batch_system_id:
1018
1235
  raise RuntimeError("Job was already removed or was never issued.")
1019
- issuedDesc = self.toilState.get_job(self.issued_jobs_by_batch_system_id[jobBatchSystemID])
1236
+ issuedDesc = self.toilState.get_job(
1237
+ self.issued_jobs_by_batch_system_id[jobBatchSystemID]
1238
+ )
1020
1239
  if issuedDesc.preemptible:
1021
1240
  # len(issued_jobs_by_batch_system_id) should always be greater than or equal to preemptibleJobsIssued,
1022
1241
  # so decrement this value before removing the job from the issuedJob map
1023
1242
  if self.preemptibleJobsIssued <= 0:
1024
- raise RuntimeError("The number of preemptive issued jobs cannot be negative.")
1243
+ raise RuntimeError(
1244
+ "The number of preemptive issued jobs cannot be negative."
1245
+ )
1025
1246
  self.preemptibleJobsIssued -= 1
1026
1247
  # It's not issued anymore.
1027
1248
  del self.issued_jobs_by_batch_system_id[jobBatchSystemID]
@@ -1041,19 +1262,24 @@ class Leader:
1041
1262
 
1042
1263
  return issuedDesc
1043
1264
 
1044
- def getJobs(self, preemptible: Optional[bool] = None) -> List[JobDescription]:
1265
+ def getJobs(self, preemptible: Optional[bool] = None) -> list[JobDescription]:
1045
1266
  """
1046
1267
  Get all issued jobs.
1047
1268
 
1048
1269
  :param preemptible: If specified, select only preemptible or only non-preemptible jobs.
1049
1270
  """
1050
1271
 
1051
- jobs = [self.toilState.get_job(job_store_id) for job_store_id in self.issued_jobs_by_batch_system_id.values()]
1272
+ jobs = [
1273
+ self.toilState.get_job(job_store_id)
1274
+ for job_store_id in self.issued_jobs_by_batch_system_id.values()
1275
+ ]
1052
1276
  if preemptible is not None:
1053
1277
  jobs = [job for job in jobs if job.preemptible == preemptible]
1054
1278
  return jobs
1055
1279
 
1056
- def killJobs(self, jobsToKill, exit_reason: BatchJobExitReason = BatchJobExitReason.KILLED):
1280
+ def killJobs(
1281
+ self, jobsToKill, exit_reason: BatchJobExitReason = BatchJobExitReason.KILLED
1282
+ ):
1057
1283
  """
1058
1284
  Kills the given set of jobs and then sends them for processing.
1059
1285
 
@@ -1067,7 +1293,9 @@ class Leader:
1067
1293
  self.batchSystem.killBatchJobs(jobsToKill)
1068
1294
  for jobBatchSystemID in jobsToKill:
1069
1295
  # Reissue immediately, noting that we killed the job
1070
- willRerun = self.process_finished_job(jobBatchSystemID, 1, exit_reason=exit_reason)
1296
+ willRerun = self.process_finished_job(
1297
+ jobBatchSystemID, 1, exit_reason=exit_reason
1298
+ )
1071
1299
 
1072
1300
  if willRerun:
1073
1301
  # Compose a list of all the jobs that will run again
@@ -1075,8 +1303,7 @@ class Leader:
1075
1303
 
1076
1304
  return jobsRerunning
1077
1305
 
1078
-
1079
- #Following functions handle error cases for when jobs have gone awry with the batch system.
1306
+ # Following functions handle error cases for when jobs have gone awry with the batch system.
1080
1307
 
1081
1308
  def reissueOverLongJobs(self) -> None:
1082
1309
  """
@@ -1087,20 +1314,30 @@ class Leader:
1087
1314
  """
1088
1315
  maxJobDuration = self.config.maxJobDuration
1089
1316
  jobsToKill = []
1090
- if maxJobDuration < 10000000: # We won't bother doing anything if rescue time > 16 weeks.
1317
+ if (
1318
+ maxJobDuration < 10000000
1319
+ ): # We won't bother doing anything if rescue time > 16 weeks.
1091
1320
  runningJobs = self.batchSystem.getRunningBatchJobIDs()
1092
1321
  for jobBatchSystemID in list(runningJobs.keys()):
1093
1322
  if runningJobs[jobBatchSystemID] > maxJobDuration:
1094
- logger.warning("The job: %s has been running for: %s seconds, more than the "
1095
- "max job duration: %s, we'll kill it",
1096
- self.issued_jobs_by_batch_system_id[jobBatchSystemID],
1097
- str(runningJobs[jobBatchSystemID]),
1098
- str(maxJobDuration))
1323
+ logger.warning(
1324
+ "The job: %s has been running for: %s seconds, more than the "
1325
+ "max job duration: %s, we'll kill it",
1326
+ self.issued_jobs_by_batch_system_id[jobBatchSystemID],
1327
+ str(runningJobs[jobBatchSystemID]),
1328
+ str(maxJobDuration),
1329
+ )
1099
1330
  jobsToKill.append(jobBatchSystemID)
1100
- reissued = self.killJobs(jobsToKill, exit_reason=BatchJobExitReason.MAXJOBDURATION)
1331
+ reissued = self.killJobs(
1332
+ jobsToKill, exit_reason=BatchJobExitReason.MAXJOBDURATION
1333
+ )
1101
1334
  if len(jobsToKill) > 0:
1102
1335
  # Summarize our actions
1103
- logger.info("Killed %d over long jobs and reissued %d of them", len(jobsToKill), len(reissued))
1336
+ logger.info(
1337
+ "Killed %d over long jobs and reissued %d of them",
1338
+ len(jobsToKill),
1339
+ len(reissued),
1340
+ )
1104
1341
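As a rough illustration of the cutoff reissueOverLongJobs applies: only maxJobDuration, the shape of getRunningBatchJobIDs() (batch system ID mapped to seconds running), and the 10000000-second "don't bother" threshold come from the code above; the helper name and constant below are hypothetical:

RESCUE_DISABLED_THRESHOLD = 10000000  # roughly 16 weeks; above this, skip the check

def find_over_long_jobs(running_seconds: dict[int, float], max_job_duration: float) -> list[int]:
    """Return the batch system IDs of jobs that have outrun max_job_duration."""
    if max_job_duration >= RESCUE_DISABLED_THRESHOLD:
        return []
    return [job_id for job_id, seconds in running_seconds.items() if seconds > max_job_duration]

# Example: with a 1-hour limit, only the 2-hour job is selected for killing.
assert find_over_long_jobs({1: 600.0, 2: 7200.0}, max_job_duration=3600.0) == [2]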
 
1105
1342
  def reissueMissingJobs(self, killAfterNTimesMissing=3):
1106
1343
  """
@@ -1112,11 +1349,13 @@ class Leader:
1112
1349
  """
1113
1350
  issuedJobs = set(self.batchSystem.getIssuedBatchJobIDs())
1114
1351
  jobBatchSystemIDsSet = set(list(self.issued_jobs_by_batch_system_id.keys()))
1115
- #Clean up the reissueMissingJobs_missingHash hash, getting rid of jobs that have turned up
1352
+ # Clean up the reissueMissingJobs_missingHash hash, getting rid of jobs that have turned up
1116
1353
  missingJobIDsSet = set(list(self.reissueMissingJobs_missingHash.keys()))
1117
1354
  for jobBatchSystemID in missingJobIDsSet.difference(jobBatchSystemIDsSet):
1118
1355
  self.reissueMissingJobs_missingHash.pop(jobBatchSystemID)
1119
- logger.warning("Batch system id: %s is no longer missing", str(jobBatchSystemID))
1356
+ logger.warning(
1357
+ "Batch system id: %s is no longer missing", str(jobBatchSystemID)
1358
+ )
1120
1359
  # checks we have no unexpected jobs running
1121
1360
  if not issuedJobs.issubset(jobBatchSystemIDsSet):
1122
1361
  raise RuntimeError("An unexpected job is still running.")
@@ -1128,24 +1367,33 @@ class Leader:
1128
1367
  else:
1129
1368
  self.reissueMissingJobs_missingHash[jobBatchSystemID] = 1
1130
1369
  timesMissing = self.reissueMissingJobs_missingHash[jobBatchSystemID]
1131
- logger.warning("Job store ID %s with batch system id %s is missing for the %i time",
1132
- jobStoreID, str(jobBatchSystemID), timesMissing)
1370
+ logger.warning(
1371
+ "Job store ID %s with batch system id %s is missing for the %i time",
1372
+ jobStoreID,
1373
+ str(jobBatchSystemID),
1374
+ timesMissing,
1375
+ )
1133
1376
  # Tell everyone it is missing
1134
1377
  self._messages.publish(JobMissingMessage(jobStoreID))
1135
1378
  if timesMissing == killAfterNTimesMissing:
1136
1379
  self.reissueMissingJobs_missingHash.pop(jobBatchSystemID)
1137
1380
  jobsToKill.append(jobBatchSystemID)
1138
1381
  self.killJobs(jobsToKill, exit_reason=BatchJobExitReason.MISSING)
1139
- return len( self.reissueMissingJobs_missingHash ) == 0 #We use this to inform
1140
- #if there are missing jobs
1382
+ return len(self.reissueMissingJobs_missingHash) == 0  # True if no jobs
1383
+ # are currently missing
1141
1384
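The per-job bookkeeping that reissueMissingJobs performs (count how many polls a job has been missing for, and kill it once the count reaches killAfterNTimesMissing) can be summarized by a small sketch; record_missing and missing_counts below are illustrative stand-ins for reissueMissingJobs_missingHash, not Toil API:

def record_missing(missing_counts: dict[int, int], batch_id: int, kill_after: int = 3) -> bool:
    """Bump the missing counter for batch_id; return True once it should be killed."""
    missing_counts[batch_id] = missing_counts.get(batch_id, 0) + 1
    if missing_counts[batch_id] == kill_after:
        # Forget the counter and tell the caller to kill/reissue the job.
        del missing_counts[batch_id]
        return True
    return False

# Example: the third consecutive "missing" observation triggers the kill.
counts: dict[int, int] = {}
assert [record_missing(counts, 7) for _ in range(3)] == [False, False, True]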
 
1142
1385
  def processRemovedJob(self, issuedJob, result_status):
1143
1386
  if result_status != 0:
1144
- logger.warning("Despite the batch system claiming failure the "
1145
- "job %s seems to have finished and been removed", issuedJob)
1387
+ logger.warning(
1388
+ "Despite the batch system claiming failure the "
1389
+ "job %s seems to have finished and been removed",
1390
+ issuedJob,
1391
+ )
1146
1392
  self._updatePredecessorStatus(issuedJob.jobStoreID)
1147
1393
 
1148
- def process_finished_job(self, batch_system_id, result_status, wall_time=None, exit_reason=None) -> bool:
1394
+ def process_finished_job(
1395
+ self, batch_system_id, result_status, wall_time=None, exit_reason=None
1396
+ ) -> bool:
1149
1397
  """
1150
1398
  Process a finished job.
1151
1399
 
@@ -1166,12 +1414,18 @@ class Leader:
1166
1414
  self.progress_failed.update(incr=1)
1167
1415
 
1168
1416
  # Delegate to the version that uses a JobDescription
1169
- return self.process_finished_job_description(issued_job, result_status, wall_time, exit_reason, batch_system_id)
1170
-
1171
- def process_finished_job_description(self, finished_job: JobDescription, result_status: int,
1172
- wall_time: Optional[float] = None,
1173
- exit_reason: Optional[BatchJobExitReason] = None,
1174
- batch_system_id: Optional[int] = None) -> bool:
1417
+ return self.process_finished_job_description(
1418
+ issued_job, result_status, wall_time, exit_reason, batch_system_id
1419
+ )
1420
+
1421
+ def process_finished_job_description(
1422
+ self,
1423
+ finished_job: JobDescription,
1424
+ result_status: int,
1425
+ wall_time: Optional[float] = None,
1426
+ exit_reason: Optional[BatchJobExitReason] = None,
1427
+ batch_system_id: Optional[int] = None,
1428
+ ) -> bool:
1175
1429
  """
1176
1430
  Process a finished JobDescription based upon its success or failure.
1177
1431
 
@@ -1193,7 +1447,9 @@ class Leader:
1193
1447
  # TODO: Use message bus?
1194
1448
  self.clusterScaler.addCompletedJob(finished_job, wall_time)
1195
1449
  if self.toilState.job_exists(job_store_id):
1196
- logger.debug("Job %s continues to exist (i.e. has more to do)", finished_job)
1450
+ logger.debug(
1451
+ "Job %s continues to exist (i.e. has more to do)", finished_job
1452
+ )
1197
1453
  try:
1198
1454
  # Reload the job as modified by the worker
1199
1455
  if finished_job.has_body():
@@ -1218,24 +1474,22 @@ class Leader:
1218
1474
  "batch system may have killed (or never started) "
1219
1475
  "the Toil worker."
1220
1476
  )
1221
- change_detected = self.toilState.reset_job_expecting_change(job_store_id, timeout)
1477
+ change_detected = self.toilState.reset_job_expecting_change(
1478
+ job_store_id, timeout
1479
+ )
1222
1480
  replacement_job = self.toilState.get_job(job_store_id)
1223
1481
 
1224
1482
  if not change_detected:
1225
- logger.warning(
1226
- 'Job %s %s',
1227
- replacement_job,
1228
- complaint
1229
- )
1483
+ logger.warning("Job %s %s", replacement_job, complaint)
1230
1484
  if result_status == 0:
1231
1485
  # Make the job fail because we ran it and it finished
1232
1486
  # and we never heard back.
1233
1487
  logger.error(
1234
- 'Marking ostensibly successful job %s that did '
1235
- 'not report in to the job store before '
1236
- '--jobStoreTimeout as having been partitioned '
1237
- 'from us.',
1238
- replacement_job
1488
+ "Marking ostensibly successful job %s that did "
1489
+ "not report in to the job store before "
1490
+ "--jobStoreTimeout as having been partitioned "
1491
+ "from us.",
1492
+ replacement_job,
1239
1493
  )
1240
1494
  result_status = EXIT_STATUS_UNAVAILABLE_VALUE
1241
1495
  exit_reason = BatchJobExitReason.PARTITION
@@ -1251,7 +1505,9 @@ class Leader:
1251
1505
  # read from e.g. a non-POSIX-compliant filesystem gave us a
1252
1506
  # false positive when we checked for its existence. Process the
1253
1507
  # job from here as any other job removed from the job store.
1254
- logger.debug("Job %s is actually complete upon closer inspection", finished_job)
1508
+ logger.debug(
1509
+ "Job %s is actually complete upon closer inspection", finished_job
1510
+ )
1255
1511
  self.processRemovedJob(finished_job, result_status)
1256
1512
  return False
1257
1513
  if replacement_job.logJobStoreFileID is not None:
@@ -1259,18 +1515,31 @@ class Leader:
1259
1515
  # more memory efficient than read().splitlines() while leaving off the
1260
1516
  # trailing \n left when using readlines()
1261
1517
  # http://stackoverflow.com/a/15233739
1262
- StatsAndLogging.logWithFormatting(f'Log from job "{job_store_id}"', log_stream, method=logger.warning,
1263
- message='The job seems to have left a log file, indicating failure: %s' % replacement_job)
1518
+ StatsAndLogging.logWithFormatting(
1519
+ f'Log from job "{job_store_id}"',
1520
+ log_stream,
1521
+ method=logger.warning,
1522
+ message="The job seems to have left a log file, indicating failure: %s"
1523
+ % replacement_job,
1524
+ )
1264
1525
  if self.config.writeLogs or self.config.writeLogsGzip:
1265
1526
  with replacement_job.getLogFileHandle(self.jobStore) as log_stream:
1266
1527
  # Send log data from the job store to each per-job log file involved.
1267
- StatsAndLogging.writeLogFiles([names.stats_name for names in replacement_job.get_chain()], log_stream, self.config, failed=True)
1528
+ StatsAndLogging.writeLogFiles(
1529
+ [names.stats_name for names in replacement_job.get_chain()],
1530
+ log_stream,
1531
+ self.config,
1532
+ failed=True,
1533
+ )
1268
1534
  if result_status != 0:
1269
1535
  # If the batch system returned a non-zero exit code then the worker
1270
1536
  # is assumed not to have captured the failure of the job, so we
1271
1537
  # reduce the try count here.
1272
1538
  if replacement_job.logJobStoreFileID is None:
1273
- logger.warning("No log file is present, despite job failing: %s", replacement_job)
1539
+ logger.warning(
1540
+ "No log file is present, despite job failing: %s",
1541
+ replacement_job,
1542
+ )
1274
1543
 
1275
1544
  if batch_system_id is not None:
1276
1545
  # Look for any standard output/error files created by the batch system.
@@ -1279,30 +1548,60 @@ class Leader:
1279
1548
  # --workDir / TOIL_WORKDIR is on a shared file system.
1280
1549
  # They live directly in the Toil work directory because that is
1281
1550
  # guaranteed to exist on the leader and workers.
1282
- file_list = glob.glob(self.batchSystem.format_std_out_err_glob(batch_system_id))
1551
+ file_list = glob.glob(
1552
+ self.batchSystem.format_std_out_err_glob(batch_system_id)
1553
+ )
1283
1554
  for log_file in file_list:
1284
1555
  try:
1285
- log_stream = open(log_file, 'rb')
1556
+ log_stream = open(log_file, "rb")
1286
1557
  except:
1287
- logger.warning('The batch system left a file %s, but it could not be opened' % log_file)
1558
+ logger.warning(
1559
+ "The batch system left a file %s, but it could not be opened"
1560
+ % log_file
1561
+ )
1288
1562
  else:
1289
1563
  with log_stream:
1290
1564
  if os.path.getsize(log_file) > 0:
1291
- StatsAndLogging.logWithFormatting(f'Log from job "{job_store_id}"', log_stream, method=logger.warning,
1292
- message='The batch system left a non-empty file %s:' % log_file)
1293
- if self.config.writeLogs or self.config.writeLogsGzip:
1294
- file_root, _ = os.path.splitext(os.path.basename(log_file))
1295
- job_names = [names.stats_name for names in replacement_job.get_chain()]
1565
+ StatsAndLogging.logWithFormatting(
1566
+ f'Log from job "{job_store_id}"',
1567
+ log_stream,
1568
+ method=logger.warning,
1569
+ message="The batch system left a non-empty file %s:"
1570
+ % log_file,
1571
+ )
1572
+ if (
1573
+ self.config.writeLogs
1574
+ or self.config.writeLogsGzip
1575
+ ):
1576
+ file_root, _ = os.path.splitext(
1577
+ os.path.basename(log_file)
1578
+ )
1579
+ job_names = [
1580
+ names.stats_name
1581
+ for names in replacement_job.get_chain()
1582
+ ]
1296
1583
  # Tack the batch system log file name onto each job's name
1297
- job_names = [j + '_' + file_root for j in job_names]
1584
+ job_names = [
1585
+ j + "_" + file_root for j in job_names
1586
+ ]
1298
1587
  log_stream.seek(0)
1299
- StatsAndLogging.writeLogFiles(job_names, log_stream, self.config, failed=True)
1588
+ StatsAndLogging.writeLogFiles(
1589
+ job_names,
1590
+ log_stream,
1591
+ self.config,
1592
+ failed=True,
1593
+ )
1300
1594
  else:
1301
- logger.warning('The batch system left an empty file %s' % log_file)
1595
+ logger.warning(
1596
+ "The batch system left an empty file %s"
1597
+ % log_file
1598
+ )
1302
1599
 
1303
1600
  # Tell the job to reset itself after a failure.
1304
1601
  # It needs to know the failure reason if available; some are handled specially.
1305
- replacement_job.setupJobAfterFailure(exit_status=result_status, exit_reason=exit_reason)
1602
+ replacement_job.setupJobAfterFailure(
1603
+ exit_status=result_status, exit_reason=exit_reason
1604
+ )
1306
1605
  self.toilState.commit_job(job_store_id)
1307
1606
 
1308
1607
  elif job_store_id in self.toilState.hasFailedSuccessors:
@@ -1310,18 +1609,20 @@ class Leader:
1310
1609
  self.toilState.hasFailedSuccessors.remove(job_store_id)
1311
1610
 
1312
1611
  # Now that we know the job is done we can add it to the list of updated jobs
1313
- self._messages.publish(JobUpdatedMessage(replacement_job.jobStoreID, result_status))
1612
+ self._messages.publish(
1613
+ JobUpdatedMessage(replacement_job.jobStoreID, result_status)
1614
+ )
1314
1615
  logger.debug("Added job: %s to updated jobs", replacement_job)
1315
1616
 
1316
1617
  # Return True if it will rerun (still has retries) and False if it
1317
1618
  # has completely failed.
1318
1619
  return replacement_job.remainingTryCount > 0
1319
- else: #The job is done
1620
+ else: # The job is done
1320
1621
  self.processRemovedJob(finished_job, result_status)
1321
1622
  # Being done, it won't run again.
1322
1623
  return False
1323
1624
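The return value of process_finished_job_description reduces to two cases visible above: a job that no longer exists in the job store is done, and a job that still exists reruns only while its retry budget lasts. A minimal sketch of that decision follows; will_rerun and its parameters are hypothetical names, with remainingTryCount being the real attribute consulted in the code above:

def will_rerun(job_still_exists: bool, remaining_try_count: int) -> bool:
    """Mirror the tail of process_finished_job_description: a removed job is
    finished (False); a surviving job reruns only while tries remain."""
    if not job_still_exists:
        return False
    return remaining_try_count > 0

assert will_rerun(job_still_exists=True, remaining_try_count=2) is True
assert will_rerun(job_still_exists=False, remaining_try_count=2) is False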
 
1324
- def getSuccessors(self, job_id: str, alreadySeenSuccessors: Set[str]) -> Set[str]:
1625
+ def getSuccessors(self, job_id: str, alreadySeenSuccessors: set[str]) -> set[str]:
1325
1626
  """
1326
1627
  Get successors of the given job by walking the job graph recursively.
1327
1628
 
@@ -1329,6 +1630,7 @@ class Leader:
1329
1630
  :returns: The set of found successors. This set is added to alreadySeenSuccessors.
1330
1631
  """
1331
1632
  successors = set()
1633
+
1332
1634
  def successorRecursion(job_id: str) -> None:
1333
1635
  # TODO: do we need to reload from the job store here, or is the cache OK?
1334
1636
  jobDesc = self.toilState.get_job(job_id)
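The recursive walk that getSuccessors performs, skipping anything already in alreadySeenSuccessors and adding everything newly traversed to it, can be pictured with a self-contained sketch over a plain successor map; the graph dict and collect_successors helper below are illustrative and not Toil's ToilState API:

def collect_successors(graph: dict[str, list[str]], job_id: str, already_seen: set[str]) -> set[str]:
    """Recursively gather successors of job_id that are not in already_seen,
    growing already_seen as new successors are traversed."""
    found: set[str] = set()

    def recurse(current: str) -> None:
        for successor in graph.get(current, []):
            if successor not in already_seen:
                already_seen.add(successor)
                found.add(successor)
                recurse(successor)

    recurse(job_id)
    return found

# Example: starting from "a" with "c" already seen, only "b" and "d" are new.
graph = {"a": ["b", "c"], "b": ["d"], "c": ["d"]}
assert collect_successors(graph, "a", already_seen={"c"}) == {"b", "d"}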
@@ -1360,12 +1662,15 @@ class Leader:
1360
1662
 
1361
1663
  # Tell everyone it failed
1362
1664
 
1363
- self._messages.publish(JobFailedMessage(get_job_kind(job_desc.get_names()), job_id))
1665
+ self._messages.publish(
1666
+ JobFailedMessage(get_job_kind(job_desc.get_names()), job_id)
1667
+ )
1364
1668
 
1365
1669
  if job_id in self.toilState.service_to_client:
1366
1670
  # Is a service job
1367
- logger.debug("Service job is being processed as a totally failed job: %s", job_desc)
1368
-
1671
+ logger.debug(
1672
+ "Service job is being processed as a totally failed job: %s", job_desc
1673
+ )
1369
1674
 
1370
1675
  if not isinstance(job_desc, ServiceJobDescription):
1371
1676
  raise RuntimeError("The service job description type is incorrect.")
@@ -1389,8 +1694,13 @@ class Leader:
1389
1694
  # properly, and to remember that this service failed with an error
1390
1695
  # and possibly never started.
1391
1696
  if client_id in self.toilState.servicesIssued:
1392
- self.serviceManager.kill_services(self.toilState.servicesIssued[client_id], error=True)
1393
- logger.warning("Job: %s is instructing all other services of its parent job to quit", job_desc)
1697
+ self.serviceManager.kill_services(
1698
+ self.toilState.servicesIssued[client_id], error=True
1699
+ )
1700
+ logger.warning(
1701
+ "Job: %s is instructing all other services of its parent job to quit",
1702
+ job_desc,
1703
+ )
1394
1704
 
1395
1705
  # This ensures that the job will not attempt to run any of its
1396
1706
  # successors on the stack
@@ -1414,9 +1724,14 @@ class Leader:
1414
1724
  # Any successor already in toilState.failedSuccessors will not be traversed
1415
1725
  # All successors traversed will be added to toilState.failedSuccessors and returned
1416
1726
  # as a set (unseenSuccessors).
1417
- unseenSuccessors = self.getSuccessors(job_id, self.toilState.failedSuccessors)
1418
- logger.debug("Found new failed successors: %s of job: %s", " ".join(
1419
- unseenSuccessors), job_desc)
1727
+ unseenSuccessors = self.getSuccessors(
1728
+ job_id, self.toilState.failedSuccessors
1729
+ )
1730
+ logger.debug(
1731
+ "Found new failed successors: %s of job: %s",
1732
+ " ".join(unseenSuccessors),
1733
+ job_desc,
1734
+ )
1420
1735
 
1421
1736
  # For each newly found successor
1422
1737
  for successorJobStoreID in unseenSuccessors:
@@ -1427,7 +1742,9 @@ class Leader:
1427
1742
  # For each such predecessor job
1428
1743
  # (we remove the successor from toilState.successor_to_predecessors to avoid doing
1429
1744
  # this multiple times for each failed predecessor)
1430
- for predecessor_id in self.toilState.successor_to_predecessors.pop(successorJobStoreID):
1745
+ for predecessor_id in self.toilState.successor_to_predecessors.pop(
1746
+ successorJobStoreID
1747
+ ):
1431
1748
 
1432
1749
  predecessor = self.toilState.get_job(predecessor_id)
1433
1750
 
@@ -1436,8 +1753,11 @@ class Leader:
1436
1753
 
1437
1754
  # Indicate that it has failed jobs.
1438
1755
  self.toilState.hasFailedSuccessors.add(predecessor_id)
1439
- logger.debug("Marking job: %s as having failed successors (found by "
1440
- "reading successors failed job)", predecessor)
1756
+ logger.debug(
1757
+ "Marking job: %s as having failed successors (found by "
1758
+ "reading successors failed job)",
1759
+ predecessor,
1760
+ )
1441
1761
 
1442
1762
  # If the predecessor has no remaining successors, add to list of updated jobs
1443
1763
  if self.toilState.count_pending_successors(predecessor_id) == 0:
@@ -1451,8 +1771,12 @@ class Leader:
1451
1771
 
1452
1772
  # Mark the predecessor as failed
1453
1773
  self.toilState.hasFailedSuccessors.add(predecessor_id)
1454
- logger.debug("Totally failed job: %s is marking direct predecessor: %s "
1455
- "as having failed jobs", job_desc, self.toilState.get_job(predecessor_id))
1774
+ logger.debug(
1775
+ "Totally failed job: %s is marking direct predecessor: %s "
1776
+ "as having failed jobs",
1777
+ job_desc,
1778
+ self.toilState.get_job(predecessor_id),
1779
+ )
1456
1780
 
1457
1781
  self._updatePredecessorStatus(job_id)
1458
1782
 
@@ -1462,38 +1786,59 @@ class Leader:
1462
1786
  # Is a service host job, so its predecessor is its client
1463
1787
  client_id = self.toilState.service_to_client.pop(jobStoreID)
1464
1788
  self.toilState.servicesIssued[client_id].remove(jobStoreID)
1465
- if len(self.toilState.servicesIssued[client_id]) == 0: # Predecessor job has
1789
+ if (
1790
+ len(self.toilState.servicesIssued[client_id]) == 0
1791
+ ): # Predecessor job has
1466
1792
  # all its services terminated
1467
- self.toilState.servicesIssued.pop(client_id) # The job has no running services
1793
+ self.toilState.servicesIssued.pop(
1794
+ client_id
1795
+ ) # The job has no running services
1468
1796
 
1469
- logger.debug('Job %s is no longer waiting on services; all services have stopped', self.toilState.get_job(client_id))
1797
+ logger.debug(
1798
+ "Job %s is no longer waiting on services; all services have stopped",
1799
+ self.toilState.get_job(client_id),
1800
+ )
1470
1801
 
1471
1802
  # Now we know the job is done we can add it to the list of
1472
1803
  # updated job files
1473
1804
  self._messages.publish(JobUpdatedMessage(client_id, 0))
1474
1805
  else:
1475
- logger.debug('Job %s is still waiting on %d services',
1476
- self.toilState.get_job(client_id),
1477
- len(self.toilState.servicesIssued[client_id]))
1806
+ logger.debug(
1807
+ "Job %s is still waiting on %d services",
1808
+ self.toilState.get_job(client_id),
1809
+ len(self.toilState.servicesIssued[client_id]),
1810
+ )
1478
1811
  elif jobStoreID not in self.toilState.successor_to_predecessors:
1479
- #We have reach the root job
1812
+ # We have reached the root job
1480
1813
  if self._messages.count(JobUpdatedMessage) != 0:
1481
1814
  raise RuntimeError("Root job is done but other jobs are still updated")
1482
1815
  if len(self.toilState.successor_to_predecessors) != 0:
1483
- raise RuntimeError("Job {} is finished and had no predecessor, but we have other outstanding jobs "
1484
- "with predecessors: {}".format(jobStoreID, self.toilState.successor_to_predecessors.keys()))
1816
+ raise RuntimeError(
1817
+ "Job {} is finished and had no predecessor, but we have other outstanding jobs "
1818
+ "with predecessors: {}".format(
1819
+ jobStoreID, self.toilState.successor_to_predecessors.keys()
1820
+ )
1821
+ )
1485
1822
  if len(self.toilState.successorCounts) != 0:
1486
- raise RuntimeError("Root job is done but jobs waiting on successors: {self.toilState.successorCounts}")
1487
- logger.debug("Reached root job %s so no predecessors to clean up" % jobStoreID)
1823
+ raise RuntimeError(
1824
+ "Root job is done but jobs waiting on successors: {self.toilState.successorCounts}"
1825
+ )
1826
+ logger.debug(
1827
+ "Reached root job %s so no predecessors to clean up" % jobStoreID
1828
+ )
1488
1829
 
1489
1830
  else:
1490
1831
  # Is a non-root, non-service job
1491
1832
  logger.debug("Cleaning the predecessors of %s" % jobStoreID)
1492
1833
 
1493
1834
  # For each predecessor
1494
- for predecessor_id in self.toilState.successor_to_predecessors.pop(jobStoreID):
1835
+ for predecessor_id in self.toilState.successor_to_predecessors.pop(
1836
+ jobStoreID
1837
+ ):
1495
1838
  if not isinstance(predecessor_id, str):
1496
- raise RuntimeError("Predecessor ID should be str but is {type(predecessor_id)}")
1839
+ raise RuntimeError(
1840
+ "Predecessor ID should be str but is {type(predecessor_id)}"
1841
+ )
1497
1842
  predecessor = self.toilState.get_job(predecessor_id)
1498
1843
 
1499
1844
  # Tell the predecessor that this job is done (keep only other successor jobs)