toil 6.1.0a1-py3-none-any.whl → 8.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193)
  1. toil/__init__.py +122 -315
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +173 -89
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
  5. toil/batchSystems/awsBatch.py +244 -135
  6. toil/batchSystems/cleanup_support.py +26 -16
  7. toil/batchSystems/contained_executor.py +31 -28
  8. toil/batchSystems/gridengine.py +86 -50
  9. toil/batchSystems/htcondor.py +166 -89
  10. toil/batchSystems/kubernetes.py +632 -382
  11. toil/batchSystems/local_support.py +20 -15
  12. toil/batchSystems/lsf.py +134 -81
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +290 -151
  16. toil/batchSystems/mesos/executor.py +79 -50
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +46 -28
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +296 -125
  21. toil/batchSystems/slurm.py +603 -138
  22. toil/batchSystems/torque.py +47 -33
  23. toil/bus.py +186 -76
  24. toil/common.py +664 -368
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1136 -483
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +63 -42
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +140 -60
  32. toil/fileStores/cachingFileStore.py +717 -269
  33. toil/fileStores/nonCachingFileStore.py +116 -87
  34. toil/job.py +1225 -368
  35. toil/jobStores/abstractJobStore.py +416 -266
  36. toil/jobStores/aws/jobStore.py +863 -477
  37. toil/jobStores/aws/utils.py +201 -120
  38. toil/jobStores/conftest.py +3 -2
  39. toil/jobStores/fileJobStore.py +292 -154
  40. toil/jobStores/googleJobStore.py +140 -74
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +668 -272
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +74 -31
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +214 -39
  49. toil/lib/aws/utils.py +287 -231
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +104 -47
  53. toil/lib/docker.py +131 -103
  54. toil/lib/ec2.py +361 -199
  55. toil/lib/ec2nodes.py +174 -106
  56. toil/lib/encryption/_dummy.py +5 -3
  57. toil/lib/encryption/_nacl.py +10 -6
  58. toil/lib/encryption/conftest.py +1 -0
  59. toil/lib/exceptions.py +26 -7
  60. toil/lib/expando.py +5 -3
  61. toil/lib/ftp_utils.py +217 -0
  62. toil/lib/generatedEC2Lists.py +127 -19
  63. toil/lib/humanize.py +6 -2
  64. toil/lib/integration.py +341 -0
  65. toil/lib/io.py +141 -15
  66. toil/lib/iterables.py +4 -2
  67. toil/lib/memoize.py +12 -8
  68. toil/lib/misc.py +66 -21
  69. toil/lib/objects.py +2 -2
  70. toil/lib/resources.py +68 -15
  71. toil/lib/retry.py +126 -81
  72. toil/lib/threading.py +299 -82
  73. toil/lib/throttle.py +16 -15
  74. toil/options/common.py +843 -409
  75. toil/options/cwl.py +175 -90
  76. toil/options/runner.py +50 -0
  77. toil/options/wdl.py +73 -17
  78. toil/provisioners/__init__.py +117 -46
  79. toil/provisioners/abstractProvisioner.py +332 -157
  80. toil/provisioners/aws/__init__.py +70 -33
  81. toil/provisioners/aws/awsProvisioner.py +1145 -715
  82. toil/provisioners/clusterScaler.py +541 -279
  83. toil/provisioners/gceProvisioner.py +282 -179
  84. toil/provisioners/node.py +155 -79
  85. toil/realtimeLogger.py +34 -22
  86. toil/resource.py +137 -75
  87. toil/server/app.py +128 -62
  88. toil/server/celery_app.py +3 -1
  89. toil/server/cli/wes_cwl_runner.py +82 -53
  90. toil/server/utils.py +54 -28
  91. toil/server/wes/abstract_backend.py +64 -26
  92. toil/server/wes/amazon_wes_utils.py +21 -15
  93. toil/server/wes/tasks.py +121 -63
  94. toil/server/wes/toil_backend.py +142 -107
  95. toil/server/wsgi_app.py +4 -3
  96. toil/serviceManager.py +58 -22
  97. toil/statsAndLogging.py +224 -70
  98. toil/test/__init__.py +282 -183
  99. toil/test/batchSystems/batchSystemTest.py +460 -210
  100. toil/test/batchSystems/batch_system_plugin_test.py +90 -0
  101. toil/test/batchSystems/test_gridengine.py +173 -0
  102. toil/test/batchSystems/test_lsf_helper.py +67 -58
  103. toil/test/batchSystems/test_slurm.py +110 -49
  104. toil/test/cactus/__init__.py +0 -0
  105. toil/test/cactus/test_cactus_integration.py +56 -0
  106. toil/test/cwl/cwlTest.py +496 -287
  107. toil/test/cwl/measure_default_memory.cwl +12 -0
  108. toil/test/cwl/not_run_required_input.cwl +29 -0
  109. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  110. toil/test/cwl/seqtk_seq.cwl +1 -1
  111. toil/test/docs/scriptsTest.py +69 -46
  112. toil/test/jobStores/jobStoreTest.py +427 -264
  113. toil/test/lib/aws/test_iam.py +118 -50
  114. toil/test/lib/aws/test_s3.py +16 -9
  115. toil/test/lib/aws/test_utils.py +5 -6
  116. toil/test/lib/dockerTest.py +118 -141
  117. toil/test/lib/test_conversions.py +113 -115
  118. toil/test/lib/test_ec2.py +58 -50
  119. toil/test/lib/test_integration.py +104 -0
  120. toil/test/lib/test_misc.py +12 -5
  121. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  122. toil/test/mesos/helloWorld.py +7 -6
  123. toil/test/mesos/stress.py +25 -20
  124. toil/test/options/__init__.py +13 -0
  125. toil/test/options/options.py +42 -0
  126. toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
  127. toil/test/provisioners/clusterScalerTest.py +440 -250
  128. toil/test/provisioners/clusterTest.py +166 -44
  129. toil/test/provisioners/gceProvisionerTest.py +174 -100
  130. toil/test/provisioners/provisionerTest.py +25 -13
  131. toil/test/provisioners/restartScript.py +5 -4
  132. toil/test/server/serverTest.py +188 -141
  133. toil/test/sort/restart_sort.py +137 -68
  134. toil/test/sort/sort.py +134 -66
  135. toil/test/sort/sortTest.py +91 -49
  136. toil/test/src/autoDeploymentTest.py +141 -101
  137. toil/test/src/busTest.py +20 -18
  138. toil/test/src/checkpointTest.py +8 -2
  139. toil/test/src/deferredFunctionTest.py +49 -35
  140. toil/test/src/dockerCheckTest.py +32 -24
  141. toil/test/src/environmentTest.py +135 -0
  142. toil/test/src/fileStoreTest.py +539 -272
  143. toil/test/src/helloWorldTest.py +7 -4
  144. toil/test/src/importExportFileTest.py +61 -31
  145. toil/test/src/jobDescriptionTest.py +46 -21
  146. toil/test/src/jobEncapsulationTest.py +2 -0
  147. toil/test/src/jobFileStoreTest.py +74 -50
  148. toil/test/src/jobServiceTest.py +187 -73
  149. toil/test/src/jobTest.py +121 -71
  150. toil/test/src/miscTests.py +19 -18
  151. toil/test/src/promisedRequirementTest.py +82 -36
  152. toil/test/src/promisesTest.py +7 -6
  153. toil/test/src/realtimeLoggerTest.py +10 -6
  154. toil/test/src/regularLogTest.py +71 -37
  155. toil/test/src/resourceTest.py +80 -49
  156. toil/test/src/restartDAGTest.py +36 -22
  157. toil/test/src/resumabilityTest.py +9 -2
  158. toil/test/src/retainTempDirTest.py +45 -14
  159. toil/test/src/systemTest.py +12 -8
  160. toil/test/src/threadingTest.py +44 -25
  161. toil/test/src/toilContextManagerTest.py +10 -7
  162. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  163. toil/test/src/workerTest.py +73 -23
  164. toil/test/utils/toilDebugTest.py +103 -33
  165. toil/test/utils/toilKillTest.py +4 -5
  166. toil/test/utils/utilsTest.py +245 -106
  167. toil/test/wdl/wdltoil_test.py +818 -149
  168. toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
  169. toil/toilState.py +120 -35
  170. toil/utils/toilConfig.py +13 -4
  171. toil/utils/toilDebugFile.py +44 -27
  172. toil/utils/toilDebugJob.py +214 -27
  173. toil/utils/toilDestroyCluster.py +11 -6
  174. toil/utils/toilKill.py +8 -3
  175. toil/utils/toilLaunchCluster.py +256 -140
  176. toil/utils/toilMain.py +37 -16
  177. toil/utils/toilRsyncCluster.py +32 -14
  178. toil/utils/toilSshCluster.py +49 -22
  179. toil/utils/toilStats.py +356 -273
  180. toil/utils/toilStatus.py +292 -139
  181. toil/utils/toilUpdateEC2Instances.py +3 -1
  182. toil/version.py +12 -12
  183. toil/wdl/utils.py +5 -5
  184. toil/wdl/wdltoil.py +3913 -1033
  185. toil/worker.py +367 -184
  186. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
  187. toil-8.0.0.dist-info/METADATA +173 -0
  188. toil-8.0.0.dist-info/RECORD +253 -0
  189. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
  190. toil-6.1.0a1.dist-info/METADATA +0 -125
  191. toil-6.1.0a1.dist-info/RECORD +0 -237
  192. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
  193. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/leader.py CHANGED
@@ -21,30 +21,36 @@ import os
  import pickle
  import sys
  import time
- from typing import Any, Dict, List, Optional, Set, Union
+ from typing import Any, Optional, Union

  import enlighten

  from toil import resolveEntryPoint
  from toil.batchSystems import DeadlockException
- from toil.batchSystems.abstractBatchSystem import (AbstractBatchSystem,
- BatchJobExitReason)
- from toil.bus import (JobCompletedMessage,
- JobFailedMessage,
- JobIssuedMessage,
- JobMissingMessage,
- JobUpdatedMessage,
- QueueSizeMessage,
- gen_message_bus_path)
+ from toil.batchSystems.abstractBatchSystem import (
+ EXIT_STATUS_UNAVAILABLE_VALUE,
+ AbstractBatchSystem,
+ BatchJobExitReason,
+ )
+ from toil.bus import (
+ JobCompletedMessage,
+ JobFailedMessage,
+ JobIssuedMessage,
+ JobMissingMessage,
+ JobUpdatedMessage,
+ QueueSizeMessage,
+ get_job_kind,
+ )
  from toil.common import Config, ToilMetrics
  from toil.cwl.utils import CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
  from toil.exceptions import FailedJobsException
- from toil.job import (CheckpointJobDescription,
- JobDescription,
- ServiceJobDescription,
- TemporaryID)
- from toil.jobStores.abstractJobStore import (AbstractJobStore,
- NoSuchJobException)
+ from toil.job import (
+ CheckpointJobDescription,
+ JobDescription,
+ ServiceJobDescription,
+ TemporaryID,
+ )
+ from toil.jobStores.abstractJobStore import AbstractJobStore, NoSuchJobException
  from toil.lib.throttle import LocalThrottle
  from toil.provisioners.abstractProvisioner import AbstractProvisioner
  from toil.provisioners.clusterScaler import ScalerThread
@@ -78,13 +84,15 @@ class Leader:
  consulting the job store, and issuing them in the batch system.
  """

- def __init__(self,
- config: Config,
- batchSystem: AbstractBatchSystem,
- provisioner: Optional[AbstractProvisioner],
- jobStore: AbstractJobStore,
- rootJob: JobDescription,
- jobCache: Optional[Dict[Union[str, TemporaryID], JobDescription]] = None) -> None:
+ def __init__(
+ self,
+ config: Config,
+ batchSystem: AbstractBatchSystem,
+ provisioner: Optional[AbstractProvisioner],
+ jobStore: AbstractJobStore,
+ rootJob: JobDescription,
+ jobCache: Optional[dict[Union[str, TemporaryID], JobDescription]] = None,
+ ) -> None:
  """
  Create a Toil Leader object.

@@ -114,14 +122,11 @@ class Leader:
  # state change information about jobs.
  self.toilState = ToilState(self.jobStore)

- if self.config.write_messages is None:
- # The user hasn't specified a place for the message bus so we
- # should make one.
- self.config.write_messages = gen_message_bus_path()
-
  # Message bus messages need to go to the given file.
  # Keep a reference to the return value so the listener stays alive.
- self._message_subscription = self.toilState.bus.connect_output_file(self.config.write_messages)
+ self._message_subscription = self.toilState.bus.connect_output_file(
+ self.config.write_messages
+ )

  # Connect to the message bus, so we will get all the messages of these
  # types in an inbox.
@@ -136,17 +141,22 @@ class Leader:
  # this, somehow, so they can also see messages from this?
  self.toilState.load_workflow(rootJob, jobCache=jobCache)

- logger.debug("Found %s jobs to start and %i jobs with successors to run",
- self._messages.count(JobUpdatedMessage), len(self.toilState.successorCounts))
+ logger.debug(
+ "Found %s jobs to start and %i jobs with successors to run",
+ self._messages.count(JobUpdatedMessage),
+ len(self.toilState.successorCounts),
+ )

  # Batch system
  self.batchSystem = batchSystem
  if len(self.batchSystem.getIssuedBatchJobIDs()) != 0:
- raise RuntimeError("The initialized batchsystem did not start with 0 active jobs.")
+ raise RuntimeError(
+ "The initialized batchsystem did not start with 0 active jobs."
+ )
  logger.debug("Checked batch system has no running jobs and no updated jobs")

  # Map of batch system IDs to job store IDs
- self.issued_jobs_by_batch_system_id: Dict[int, str] = {}
+ self.issued_jobs_by_batch_system_id: dict[int, str] = {}

  # Number of preemptible jobs currently being run by batch system
  self.preemptibleJobsIssued = 0
@@ -154,10 +164,12 @@ class Leader:
  # Tracking the number service jobs issued,
  # this is used limit the number of services issued to the batch system
  self.serviceJobsIssued = 0
- self.serviceJobsToBeIssued: List[str] = [] # A queue of IDs of service jobs that await scheduling
+ self.serviceJobsToBeIssued: list[str] = (
+ []
+ ) # A queue of IDs of service jobs that await scheduling
  # Equivalents for service jobs to be run on preemptible nodes
  self.preemptibleServiceJobsIssued = 0
- self.preemptibleServiceJobsToBeIssued: List[str] = []
+ self.preemptibleServiceJobsToBeIssued: list[str] = []

  # Timing of the rescuing method
  self.timeSinceJobsLastRescued = None
@@ -165,7 +177,7 @@ class Leader:
  # For each issued job's batch system ID, how many times did we not see
  # it when we should have? If this hits a threshold, the job is declared
  # missing and killed and possibly retried.
- self.reissueMissingJobs_missingHash: Dict[int, int] = {}
+ self.reissueMissingJobs_missingHash: dict[int, int] = {}

  # Class used to create/destroy nodes in the cluster, may be None if
  # using a statically defined cluster
@@ -183,7 +195,7 @@ class Leader:
  self.statsAndLogging = StatsAndLogging(self.jobStore, self.config)

  # Set used to monitor deadlocked jobs
- self.potentialDeadlockedJobs: Set[str] = set()
+ self.potentialDeadlockedJobs: set[str] = set()
  self.potentialDeadlockTime = 0

  # A dashboard that runs on the leader node in AWS clusters to track the state
@@ -191,8 +203,13 @@ class Leader:
  self.toilMetrics: Optional[ToilMetrics] = None

  # internal jobs we should not expose at top level debugging
- self.debugJobNames = ("CWLJob", "CWLWorkflow", "CWLScatter", "CWLGather",
- "ResolveIndirect")
+ self.debugJobNames = (
+ "CWLJob",
+ "CWLWorkflow",
+ "CWLScatter",
+ "CWLGather",
+ "ResolveIndirect",
+ )

  self.deadlockThrottler = LocalThrottle(self.config.deadlockCheckInterval)

@@ -210,8 +227,10 @@ class Leader:
  self.GOOD_COLOR = (0, 60, 108)
  self.BAD_COLOR = (253, 199, 0)
  # And set a format that shows failures
- self.PROGRESS_BAR_FORMAT = ('{desc}{desc_pad}{percentage:3.0f}%|{bar}| {count:{len_total}d}/{total:d} '
- '({count_1:d} failures) [{elapsed}<{eta}, {rate:.2f}{unit_pad}{unit}/s]')
+ self.PROGRESS_BAR_FORMAT = (
+ "{desc}{desc_pad}{percentage:3.0f}%|{bar}| {count:{len_total}d}/{total:d} "
+ "({count_1:d} failures) [{elapsed}<{eta}, {rate:.2f}{unit_pad}{unit}/s]"
+ )
  # TODO: No way to set background color on the terminal for the bar.

  # What exit code should the process use if the workflow failed?
@@ -229,16 +248,25 @@ class Leader:
  """
  self.jobStore.write_kill_flag(kill=False)

- with enlighten.get_manager(stream=sys.stderr, enabled=not self.config.disableProgress) as manager:
+ with enlighten.get_manager(
+ stream=sys.stderr, enabled=not self.config.disableProgress
+ ) as manager:
  # Set up the fancy console UI if desirable
- self.progress_overall = manager.counter(total=0, desc='Workflow Progress', unit='jobs',
- color=self.GOOD_COLOR, bar_format=self.PROGRESS_BAR_FORMAT)
+ self.progress_overall = manager.counter(
+ total=0,
+ desc="Workflow Progress",
+ unit="jobs",
+ color=self.GOOD_COLOR,
+ bar_format=self.PROGRESS_BAR_FORMAT,
+ )
  self.progress_failed = self.progress_overall.add_subcounter(self.BAD_COLOR)

  # Start the stats/logging aggregation thread
  self.statsAndLogging.start()
  if self.config.metrics:
- self.toilMetrics = ToilMetrics(self.toilState.bus, provisioner=self.provisioner)
+ self.toilMetrics = ToilMetrics(
+ self.toilState.bus, provisioner=self.provisioner
+ )

  try:

@@ -255,10 +283,13 @@ class Leader:
  self.innerLoop()
  finally:
  if self.clusterScaler is not None:
- logger.debug('Waiting for workers to shutdown.')
+ logger.debug("Waiting for workers to shutdown.")
  startTime = time.time()
  self.clusterScaler.shutdown()
- logger.debug('Worker shutdown complete in %s seconds.', time.time() - startTime)
+ logger.debug(
+ "Worker shutdown complete in %s seconds.",
+ time.time() - startTime,
+ )

  finally:
  # Ensure service manager thread is properly shutdown
@@ -271,37 +302,59 @@ class Leader:
  self.toilMetrics.shutdown()

  # Filter the failed jobs
- self.toilState.totalFailedJobs = [j for j in self.toilState.totalFailedJobs if self.toilState.job_exists(j)]
+ self.toilState.totalFailedJobs = [
+ j
+ for j in self.toilState.totalFailedJobs
+ if self.toilState.job_exists(j)
+ ]

  try:
  self.create_status_sentinel_file(self.toilState.totalFailedJobs)
  except OSError as e:
- logger.debug(f'Error from importFile with hardlink=True: {e}')
+ logger.debug(f"Error from importFile with hardlink=True: {e}")

- logger.info("Finished toil run %s" %
- ("successfully." if not self.toilState.totalFailedJobs \
- else ("with %s failed jobs." % len(self.toilState.totalFailedJobs))))
+ logger.info(
+ "Finished toil run %s"
+ % (
+ "successfully."
+ if not self.toilState.totalFailedJobs
+ else ("with %s failed jobs." % len(self.toilState.totalFailedJobs))
+ )
+ )

  if len(self.toilState.totalFailedJobs):
  failed_jobs = []
  for job_id in self.toilState.totalFailedJobs:
  # Refresh all the failed jobs to get e.g. the log file IDs that the workers wrote
  self.toilState.reset_job(job_id)
- failed_jobs.append(self.toilState.get_job(job_id))
-
- logger.info("Failed jobs at end of the run: %s", ' '.join(str(j) for j in failed_jobs))
- raise FailedJobsException(self.jobStore, failed_jobs, exit_code=self.recommended_fail_exit_code)
+ try:
+ failed_jobs.append(self.toilState.get_job(job_id))
+ except NoSuchJobException:
+ # Job actually finished and was removed
+ pass
+
+ logger.info(
+ "Failed jobs at end of the run: %s",
+ " ".join(str(j) for j in failed_jobs),
+ )
+ raise FailedJobsException(
+ self.jobStore,
+ failed_jobs,
+ exit_code=self.recommended_fail_exit_code,
+ )

  return self.jobStore.get_root_job_return_value()

  def create_status_sentinel_file(self, fail: bool) -> None:
  """Create a file in the jobstore indicating failure or success."""
- logName = 'failed.log' if fail else 'succeeded.log'
+ logName = "failed.log" if fail else "succeeded.log"
  localLog = os.path.join(os.getcwd(), logName)
- open(localLog, 'w').close()
- self.jobStore.import_file('file://' + localLog, logName, hardlink=True)
+ open(localLog, "w").close()
+ self.jobStore.import_file("file://" + localLog, logName, hardlink=True)

- if os.path.exists(localLog): # Bandaid for Jenkins tests failing stochastically and unexplainably.
+ if os.path.exists(
+ localLog
+ ): # Bandaid for Jenkins tests failing stochastically and unexplainably.
  os.remove(localLog)

  def _handledFailedSuccessor(self, successor_id: str, predecessor_id: str) -> bool:
@@ -313,8 +366,11 @@ class Leader:
  :returns: True if there are still active successors.
  False if all successors have failed and the job is queued to run to handle the failed successors.
  """
- logger.debug("Successor job: %s of job: %s has failed """
- "predecessors", self.toilState.get_job(successor_id), self.toilState.get_job(predecessor_id))
+ logger.debug(
+ "Successor job: %s of job: %s has failed " "" "predecessors",
+ self.toilState.get_job(successor_id),
+ self.toilState.get_job(predecessor_id),
+ )

  # Add the job to the set having failed successors
  self.toilState.hasFailedSuccessors.add(predecessor_id)
@@ -328,9 +384,12 @@ class Leader:
  # If the job now has no active successors, add to active jobs
  # so it can be processed as a job with failed successors.
  if self.toilState.count_pending_successors(predecessor_id) == 0:
- logger.debug("Job: %s has no successors to run "
- "and some are failed, adding to list of jobs "
- "with failed successors", self.toilState.get_job(predecessor_id))
+ logger.debug(
+ "Job: %s has no successors to run "
+ "and some are failed, adding to list of jobs "
+ "with failed successors",
+ self.toilState.get_job(predecessor_id),
+ )
  self._messages.publish(JobUpdatedMessage(predecessor_id, 0))
  # Report no successors are running
  return False
@@ -338,7 +397,9 @@ class Leader:
  # Some successors are still active
  return True

- def _checkSuccessorReadyToRunMultiplePredecessors(self, successor_id: str, predecessor_id: str) -> bool:
+ def _checkSuccessorReadyToRunMultiplePredecessors(
+ self, successor_id: str, predecessor_id: str
+ ) -> bool:
  """
  Check if a successor job is ready to run when there are multiple predecessors.

@@ -359,8 +420,11 @@ class Leader:
  # Grab the predecessor for reporting
  predecessor = self.toilState.get_job(predecessor_id)

- logger.debug("Successor job: %s of job: %s has multiple "
- "predecessors", successor, predecessor)
+ logger.debug(
+ "Successor job: %s of job: %s has multiple " "predecessors",
+ successor,
+ predecessor,
+ )

  # Add the predecessor as a finished predecessor to the successor
  successor.predecessorsFinished.add(predecessor_id)
@@ -379,13 +443,17 @@ class Leader:
  if len(successor.predecessorsFinished) == successor.predecessorNumber:
  # All the successor's predecessors are done now.
  # Remove the successor job from the set of waiting multi-predecessor jobs.
- self.toilState.jobsToBeScheduledWithMultiplePredecessors.remove(successor_id)
+ self.toilState.jobsToBeScheduledWithMultiplePredecessors.remove(
+ successor_id
+ )
  return True
  else:
  # The job is not ready to run
  return False

- def _makeJobSuccessorReadyToRun(self, successor_id: str, predecessor_id: str) -> bool:
+ def _makeJobSuccessorReadyToRun(
+ self, successor_id: str, predecessor_id: str
+ ) -> bool:
  """
  Make a successor job ready to run if possible.

@@ -393,7 +461,7 @@ class Leader:
  :param predecessor_id: The job which the successor comes after.
  :returns: False if the successor job should not yet be run or True otherwise.
  """
- #Build map from successor to predecessors.
+ # Build map from successor to predecessors.
  if successor_id not in self.toilState.successor_to_predecessors:
  self.toilState.successor_to_predecessors[successor_id] = set()
  if not isinstance(successor_id, str):
@@ -404,9 +472,15 @@ class Leader:

  # Grab the successor
  successor = self.toilState.get_job(successor_id)
- logger.debug("Added job %s as coming after job %s", successor, self.toilState.get_job(predecessor_id))
+ logger.debug(
+ "Added job %s as coming after job %s",
+ successor,
+ self.toilState.get_job(predecessor_id),
+ )
  if successor.predecessorNumber > 1:
- return self._checkSuccessorReadyToRunMultiplePredecessors(successor_id, predecessor_id)
+ return self._checkSuccessorReadyToRunMultiplePredecessors(
+ successor_id, predecessor_id
+ )
  else:
  return True

@@ -425,13 +499,20 @@ class Leader:
  next_successors = predecessor.nextSuccessors()

  if next_successors is None or len(next_successors) == 0:
- raise RuntimeError(f"Job {self} trying to run successors, but it doesn't have any")
- logger.debug("Job: %s has %i successors to schedule",
- predecessor_id, len(next_successors))
- #Record the number of successors that must be completed before
- #the job can be considered again
+ raise RuntimeError(
+ f"Job {self} trying to run successors, but it doesn't have any"
+ )
+ logger.debug(
+ "Job: %s has %i successors to schedule",
+ predecessor_id,
+ len(next_successors),
+ )
+ # Record the number of successors that must be completed before
+ # the job can be considered again
  if self.toilState.count_pending_successors(predecessor_id) != 0:
- raise RuntimeError('Attempted to schedule successors of the same job twice!')
+ raise RuntimeError(
+ "Attempted to schedule successors of the same job twice!"
+ )
  self.toilState.successors_pending(predecessor_id, len(next_successors))

  # For each successor schedule if all predecessors have been completed
@@ -442,7 +523,11 @@ class Leader:
  except NoSuchJobException:
  # Job already done and gone, but probably shouldn't be. Or maybe isn't visible yet.
  # TODO: Shouldn't this be an error?
- logger.warning("Job %s is a successor of %s but is already done and gone.", successor_id, predecessor_id)
+ logger.warning(
+ "Job %s is a successor of %s but is already done and gone.",
+ successor_id,
+ predecessor_id,
+ )
  # Don't try and run it
  continue
  if self._makeJobSuccessorReadyToRun(successor_id, predecessor_id):
@@ -464,46 +549,62 @@ class Leader:
  # The job has services running; signal for them to be killed.
  # Once they are killed, then the job will be updated again and then
  # scheduled to be removed.
- logger.warning("Telling job %s to terminate its services due to successor failure",
- predecessor)
- self.serviceManager.kill_services(self.toilState.servicesIssued[predecessor_id],
- error=True)
+ logger.warning(
+ "Telling job %s to terminate its services due to successor failure",
+ predecessor,
+ )
+ self.serviceManager.kill_services(
+ self.toilState.servicesIssued[predecessor_id], error=True
+ )
  elif self.toilState.count_pending_successors(predecessor_id) > 0:
  # The job has non-service jobs running; wait for them to finish.
  # the job will be re-added to the updated jobs when these jobs
  # are done
- logger.debug("Job %s with ID: %s with failed successors still has successor jobs running",
- predecessor, predecessor_id)
- elif (isinstance(predecessor, CheckpointJobDescription) and
- predecessor.checkpoint is not None and
- predecessor.remainingTryCount > 1):
+ logger.debug(
+ "Job %s with ID: %s with failed successors still has successor jobs running",
+ predecessor,
+ predecessor_id,
+ )
+ elif (
+ isinstance(predecessor, CheckpointJobDescription)
+ and predecessor.checkpoint is not None
+ and predecessor.remainingTryCount > 1
+ ):
  # If the job is a checkpoint and has remaining retries...
  # The logic behind using > 1 rather than > 0 here: Since this job has
  # been tried once (without decreasing its try count as the job
  # itself was successful), and its subtree failed, it shouldn't be retried
  # unless it has more than 1 try.
  if predecessor_id in self.toilState.jobs_issued:
- logger.debug('Checkpoint job %s was updated while issued', predecessor_id)
+ logger.debug(
+ "Checkpoint job %s was updated while issued", predecessor_id
+ )
  else:
  # It hasn't already been reissued.
  # This check lets us be robust against repeated job update
  # messages (such as from services starting *and* failing), by
  # making sure that we don't stay in a state that where we
  # reissue the job every time we get one.
- logger.warning('Job: %s is being restarted as a checkpoint after the total '
- 'failure of jobs in its subtree.', predecessor_id)
+ logger.warning(
+ "Job: %s is being restarted as a checkpoint after the total "
+ "failure of jobs in its subtree.",
+ predecessor_id,
+ )
  self.issueJob(predecessor)
  else:
  # Mark it totally failed
- logger.debug("Job %s is being processed as completely failed", predecessor_id)
+ logger.debug(
+ "Job %s is being processed as completely failed", predecessor_id
+ )
  self.processTotallyFailedJob(predecessor_id)

  def _processReadyJob(self, job_id: str, result_status: int):
  # We operate on the JobDescription mostly.
  readyJob = self.toilState.get_job(job_id)

- logger.debug('Updating status of job %s with result status: %s',
- readyJob, result_status)
+ logger.debug(
+ "Updating status of job %s with result status: %s", readyJob, result_status
+ )

  # TODO: Filter out nonexistent successors/services now, so we can tell
  # if they are all done and the job needs deleting?
@@ -516,14 +617,17 @@ class Leader:
  # want to act on it; we want to wait until it gets the update it
  # gets when the service manager is done trying to start its
  # services.
- logger.debug("Got a job to update which is still owned by the service "
- "manager: %s", readyJob.jobStoreID)
+ logger.debug(
+ "Got a job to update which is still owned by the service "
+ "manager: %s",
+ readyJob.jobStoreID,
+ )
  elif readyJob.jobStoreID in self.toilState.hasFailedSuccessors:
  self._processFailedSuccessors(job_id)
- elif readyJob.command is not None or result_status != 0:
- # The job has a command it must be run before any successors.
+ elif readyJob.has_body() or result_status != 0:
+ # The job has a body it must be run before any successors.
  # Similarly, if the job previously failed we rerun it, even if it doesn't have a
- # command to run, to eliminate any parts of the stack now completed.
+ # body to run, to eliminate any parts of the stack now completed.
  isServiceJob = readyJob.jobStoreID in self.toilState.service_to_client

  # We want to run the job, and expend one of its "tries" (possibly
@@ -531,8 +635,9 @@ class Leader:

  # If the job has run out of tries or is a service job whose error flag has
  # been indicated, fail the job.
- if (readyJob.remainingTryCount == 0 or
- (isServiceJob and not self.jobStore.file_exists(readyJob.errorJobStoreID))):
+ if readyJob.remainingTryCount == 0 or (
+ isServiceJob and not self.jobStore.file_exists(readyJob.errorJobStoreID)
+ ):
  self.processTotallyFailedJob(job_id)
  logger.warning("Job %s is completely failed", readyJob)
  else:
@@ -543,28 +648,39 @@ class Leader:
  # Build a map from the service jobs to the job and a map
  # of the services created for the job
  if readyJob.jobStoreID in self.toilState.servicesIssued:
- raise RuntimeError(f"The ready job: {readyJob.jobStoreID} was already issued.")
+ raise RuntimeError(
+ f"The ready job: {readyJob.jobStoreID} was already issued."
+ )
  self.toilState.servicesIssued[readyJob.jobStoreID] = set()
  for serviceJobList in readyJob.serviceHostIDsInBatches():
  for serviceID in serviceJobList:
  if serviceID in self.toilState.service_to_client:
- raise RuntimeError(f"The ready service ID: {serviceID} was already added.")
+ raise RuntimeError(
+ f"The ready service ID: {serviceID} was already added."
+ )
+ # TODO: Why do we refresh here?
  self.toilState.reset_job(serviceID)
  serviceHost = self.toilState.get_job(serviceID)
  self.toilState.service_to_client[serviceID] = readyJob.jobStoreID
  self.toilState.servicesIssued[readyJob.jobStoreID].add(serviceID)

- logger.debug("Giving job: %s to service manager to schedule its jobs", readyJob)
+ logger.debug(
+ "Giving job: %s to service manager to schedule its jobs", readyJob
+ )
  # Use the service manager to start the services
  self.serviceManager.put_client(job_id)
  elif readyJob.nextSuccessors() is not None:
  # There are successors to run
  self._runJobSuccessors(job_id)
  elif readyJob.jobStoreID in self.toilState.servicesIssued:
- logger.debug("Telling job: %s to terminate its services due to the "
- "successful completion of its successor jobs",
- readyJob)
- self.serviceManager.kill_services(self.toilState.servicesIssued[readyJob.jobStoreID], error=False)
+ logger.debug(
+ "Telling job: %s to terminate its services due to the "
+ "successful completion of its successor jobs",
+ readyJob,
+ )
+ self.serviceManager.kill_services(
+ self.toilState.servicesIssued[readyJob.jobStoreID], error=False
+ )
  else:
  # There are no remaining tasks to schedule within the job.
  #
@@ -593,7 +709,10 @@ class Leader:
  try:
  self.toilState.delete_job(readyJob.jobStoreID)
  except Exception as e:
- logger.exception("Re-processing success for job we could not remove: %s", readyJob)
+ logger.exception(
+ "Re-processing success for job we could not remove: %s",
+ readyJob,
+ )
  # Kick it back to being handled as succeeded again. We
  # don't want to have a failure here cause a Toil-level
  # retry which causes more actual jobs to try to run.
@@ -605,12 +724,18 @@ class Leader:
  self.processRemovedJob(readyJob, 0)
  else:
  self.processTotallyFailedJob(job_id)
- logger.error("Job: %s is empty but completely failed - something is very wrong", readyJob.jobStoreID)
+ logger.error(
+ "Job: %s is empty but completely failed - something is very wrong",
+ readyJob.jobStoreID,
+ )

  def _processReadyJobs(self):
  """Process jobs that are ready to be scheduled/have successors to schedule."""
- logger.debug('Built the jobs list, currently have %i jobs to update and %i jobs issued',
- self._messages.count(JobUpdatedMessage), self.getNumberOfJobsIssued())
+ logger.debug(
+ "Built the jobs list, currently have %i jobs to update and %i jobs issued",
+ self._messages.count(JobUpdatedMessage),
+ self.getNumberOfJobsIssued(),
+ )

  # Now go through and, for each job that has updated this tick, process it.

@@ -625,9 +750,13 @@ class Leader:
  if message.job_id in handled_with_status:
  if handled_with_status[message.job_id] == message.result_status:
  # This is a harmless duplicate
- logger.debug("Job %s already updated this tick with status %s and "
- "we've received duplicate message %s", message.job_id,
- handled_with_status[message.job_id], message)
+ logger.debug(
+ "Job %s already updated this tick with status %s and "
+ "we've received duplicate message %s",
+ message.job_id,
+ handled_with_status[message.job_id],
+ message,
+ )
  else:
  # This is a conflicting update. We may have already treated
  # a job as succeeding but now we've heard it's failed, or
@@ -635,9 +764,13 @@ class Leader:
  # This probably shouldn't happen, but does because the
  # scheduler is not correct somehow and hasn't been for a
  # long time. Complain about it.
- logger.warning("Job %s already updated this tick with status %s "
- "but we've now received %s", message.job_id,
- handled_with_status[message.job_id], message)
+ logger.warning(
+ "Job %s already updated this tick with status %s "
+ "but we've now received %s",
+ message.job_id,
+ handled_with_status[message.job_id],
+ message,
+ )
  # Either way, we only want to handle one update per tick, like
  # the old dict-based implementation.
  continue
@@ -655,16 +788,21 @@ class Leader:
  if service_id is None:
  break

- logger.debug('Launching service job: %s', self.toilState.get_job(service_id))
+ logger.debug(
+ "Launching service job: %s", self.toilState.get_job(service_id)
+ )
  self.issueServiceJob(service_id)

  def _processJobsWithRunningServices(self):
  """Get jobs whose services have started."""
  while True:
  client_id = self.serviceManager.get_ready_client(0)
- if client_id is None: # Stop trying to get jobs when function returns None
+ if client_id is None: # Stop trying to get jobs when function returns None
  break
- logger.debug('Job: %s has established its services; all services are running', client_id)
+ logger.debug(
+ "Job: %s has established its services; all services are running",
+ client_id,
+ )

  # Grab the client job description
  client = self.toilState.get_job(client_id)
@@ -677,9 +815,9 @@ class Leader:
  """Get jobs whose services have failed to start."""
  while True:
  client_id = self.serviceManager.get_unservable_client(0)
- if client_id is None: # Stop trying to get jobs when function returns None
+ if client_id is None: # Stop trying to get jobs when function returns None
  break
- logger.debug('Job: %s has failed to establish its services.', client_id)
+ logger.debug("Job: %s has failed to establish its services.", client_id)

  # Grab the client job description
  client = self.toilState.get_job(client_id)
@@ -694,29 +832,56 @@ class Leader:
  def _gatherUpdatedJobs(self, updatedJobTuple):
  """Gather any new, updated JobDescriptions from the batch system."""
  bsID, exitStatus, exitReason, wallTime = (
- updatedJobTuple.jobID, updatedJobTuple.exitStatus, updatedJobTuple.exitReason,
- updatedJobTuple.wallTime)
+ updatedJobTuple.jobID,
+ updatedJobTuple.exitStatus,
+ updatedJobTuple.exitReason,
+ updatedJobTuple.wallTime,
+ )
  # easy, track different state
  try:
- updatedJob = self.toilState.get_job(self.issued_jobs_by_batch_system_id[bsID])
+ updatedJob = self.toilState.get_job(
+ self.issued_jobs_by_batch_system_id[bsID]
+ )
  except KeyError:
- logger.warning("A result seems to already have been processed for job %s", bsID)
+ logger.warning(
+ "A result seems to already have been processed for job %s", bsID
+ )
  else:
  if exitStatus == 0:
- logger.debug('Job ended: %s', updatedJob)
+ logger.debug("Job ended: %s", updatedJob)
  else:
- logger.warning(f'Job failed with exit value {exitStatus}: {updatedJob}\n'
- f'Exit reason: {exitReason}')
+ status_string = (
+ str(exitStatus)
+ if exitStatus != EXIT_STATUS_UNAVAILABLE_VALUE
+ else "<UNAVAILABLE>"
+ )
+ logger.warning(
+ f"Job failed with exit value {status_string}: {updatedJob}\n"
+ f"Exit reason: {BatchJobExitReason.to_string(exitReason)}"
+ )
+ # This logic is undefined for which of the failing jobs will send its exit code
+ # when there are multiple failing jobs with different exit statuses
+ self.recommended_fail_exit_code = exitStatus
  if exitStatus == CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE:
  # This is a CWL job informing us that the workflow is
  # asking things of us that Toil can't do. When we raise an
  # exception because of this, make sure to forward along
  # this exit code.
  logger.warning("This indicates an unsupported CWL requirement!")
- self.recommended_fail_exit_code = CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
+ self.recommended_fail_exit_code = (
+ CWL_UNSUPPORTED_REQUIREMENT_EXIT_CODE
+ )
  # Tell everyone it stopped running.
- self._messages.publish(JobCompletedMessage(updatedJob.get_job_kind(), updatedJob.jobStoreID, exitStatus))
- self.process_finished_job(bsID, exitStatus, wall_time=wallTime, exit_reason=exitReason)
+ self._messages.publish(
+ JobCompletedMessage(
+ get_job_kind(updatedJob.get_names()),
+ updatedJob.jobStoreID,
+ exitStatus,
+ )
+ )
+ self.process_finished_job(
+ bsID, exitStatus, wall_time=wallTime, exit_reason=exitReason
+ )

  def _processLostJobs(self):
  """Process jobs that have gone awry."""
@@ -724,7 +889,9 @@ class Leader:
  # gather for rescueJobsFrequency seconds) check if there are any jobs
  # that have run too long (see self.reissueOverLongJobs) or which have
  # gone missing from the batch system (see self.reissueMissingJobs)
- if ((time.time() - self.timeSinceJobsLastRescued) >= self.config.rescueJobsFrequency):
+ if (
+ time.time() - self.timeSinceJobsLastRescued
+ ) >= self.config.rescueJobsFrequency:
  # We only rescue jobs every N seconds, and when we have apparently
  # exhausted the current job supply
  self.reissueOverLongJobs()
@@ -744,9 +911,11 @@ class Leader:
  """
  self.timeSinceJobsLastRescued = time.time()

- while self._messages.count(JobUpdatedMessage) > 0 or \
- self.getNumberOfJobsIssued() or \
- self.serviceManager.get_job_count():
+ while (
+ self._messages.count(JobUpdatedMessage) > 0
+ or self.getNumberOfJobsIssued()
+ or self.serviceManager.get_job_count()
+ ):

  if self._messages.count(JobUpdatedMessage) > 0:
  self._processReadyJobs()
@@ -798,13 +967,21 @@ class Leader:
  if not self._messages.empty():
  raise RuntimeError(f"Pending messages at shutdown: {self._messages}")
  if self.toilState.successorCounts != {}:
- raise RuntimeError(f"Jobs waiting on successors at shutdown: {self.toilState.successorCounts}")
+ raise RuntimeError(
+ f"Jobs waiting on successors at shutdown: {self.toilState.successorCounts}"
+ )
  if self.toilState.successor_to_predecessors != {}:
- raise RuntimeError(f"Successors pending for their predecessors at shutdown: {self.toilState.successor_to_predecessors}")
+ raise RuntimeError(
+ f"Successors pending for their predecessors at shutdown: {self.toilState.successor_to_predecessors}"
+ )
  if self.toilState.service_to_client != {}:
- raise RuntimeError(f"Services pending for their clients at shutdown: {self.toilState.service_to_client}")
+ raise RuntimeError(
+ f"Services pending for their clients at shutdown: {self.toilState.service_to_client}"
+ )
  if self.toilState.servicesIssued != {}:
- raise RuntimeError(f"Services running at shutdown: {self.toilState.servicesIssued}")
+ raise RuntimeError(
+ f"Services running at shutdown: {self.toilState.servicesIssued}"
+ )

  def checkForDeadlocks(self):
  """Check if the system is deadlocked running service jobs."""
@@ -814,18 +991,22 @@ class Leader:
  # If there are no updated jobs and at least some jobs running
  if totalServicesIssued >= totalRunningJobs and totalRunningJobs > 0:
  # Collect all running service job store IDs into a set to compare with the deadlock set
- running_service_ids: Set[str] = set()
+ running_service_ids: set[str] = set()
  for js_id in self.issued_jobs_by_batch_system_id.values():
  job = self.toilState.get_job(js_id)
- if isinstance(job, ServiceJobDescription) and self.serviceManager.is_running(js_id):
+ if isinstance(
+ job, ServiceJobDescription
+ ) and self.serviceManager.is_running(js_id):
  running_service_ids.add(js_id)

  if len(running_service_ids) > totalRunningJobs:
  # This is too many services.
  # TODO: couldn't more jobs have started since we polled the
  # running job count?
- raise RuntimeError(f"Supposedly running {len(running_service_ids)} services, which is"
- f"more than the {totalRunningJobs} currently running jobs overall.")
+ raise RuntimeError(
+ f"Supposedly running {len(running_service_ids)} services, which is"
+ f"more than the {totalRunningJobs} currently running jobs overall."
+ )

  # If all the running jobs are active services then we have a potential deadlock
  if len(running_service_ids) == totalRunningJobs:
@@ -839,27 +1020,49 @@ class Leader:
  # Use a generic message if none is available
  message = "Cluster may be too small."

-
  # See if this is a new potential deadlock
  if self.potentialDeadlockedJobs != running_service_ids:
- logger.warning(("Potential deadlock detected! All %s running jobs are service jobs, "
- "with no normal jobs to use them! %s"), totalRunningJobs, message)
+ logger.warning(
+ (
+ "Potential deadlock detected! All %s running jobs are service jobs, "
+ "with no normal jobs to use them! %s"
+ ),
+ totalRunningJobs,
+ message,
+ )
  self.potentialDeadlockedJobs = running_service_ids
  self.potentialDeadlockTime = time.time()
  else:
  # We wait self.config.deadlockWait seconds before declaring the system deadlocked
  stuckFor = time.time() - self.potentialDeadlockTime
  if stuckFor >= self.config.deadlockWait:
- logger.error("We have been deadlocked since %s on these service jobs: %s",
- self.potentialDeadlockTime, self.potentialDeadlockedJobs)
- raise DeadlockException(("The workflow is service deadlocked - all %d running jobs "
- "have been the same active services for at least %s seconds") % (totalRunningJobs, self.config.deadlockWait))
+ logger.error(
+ "We have been deadlocked since %s on these service jobs: %s",
+ self.potentialDeadlockTime,
+ self.potentialDeadlockedJobs,
+ )
+ raise DeadlockException(
+ (
+ "The workflow is service deadlocked - all %d running jobs "
+ "have been the same active services for at least %s seconds"
+ )
+ % (totalRunningJobs, self.config.deadlockWait)
+ )
  else:
  # Complain that we are still stuck.
- waitingNormalJobs = self.getNumberOfJobsIssued() - totalServicesIssued
- logger.warning(("Potentially deadlocked for %.0f seconds. Waiting at most %.0f more seconds "
- "for any of %d issued non-service jobs to schedule and start. %s"),
- stuckFor, self.config.deadlockWait - stuckFor, waitingNormalJobs, message)
+ waitingNormalJobs = (
+ self.getNumberOfJobsIssued() - totalServicesIssued
+ )
+ logger.warning(
+ (
+ "Potentially deadlocked for %.0f seconds. Waiting at most %.0f more seconds "
+ "for any of %d issued non-service jobs to schedule and start. %s"
+ ),
+ stuckFor,
+ self.config.deadlockWait - stuckFor,
+ waitingNormalJobs,
+ message,
+ )
  else:
  # We have observed non-service jobs running, so reset the potential deadlock
  self.feed_deadlock_watchdog()
@@ -880,34 +1083,38 @@ class Leader:
  """Add a job to the queue of jobs currently trying to run."""
  # Never issue the same job multiple times simultaneously
  if jobNode.jobStoreID in self.toilState.jobs_issued:
- raise RuntimeError(f"Attempted to issue {jobNode} multiple times simultaneously!")
+ raise RuntimeError(
+ f"Attempted to issue {jobNode} multiple times simultaneously!"
+ )

- workerCommand = [resolveEntryPoint('_toil_worker'),
- jobNode.jobName,
- self.jobStoreLocator,
- jobNode.jobStoreID]
+ workerCommand = [
+ resolveEntryPoint("_toil_worker"),
+ jobNode.jobName,
+ self.jobStoreLocator,
+ jobNode.jobStoreID,
+ ]

  for context in self.batchSystem.getWorkerContexts():
  # For each context manager hook the batch system wants to run in
  # the worker, serialize and send it.
- workerCommand.append('--context')
- workerCommand.append(base64.b64encode(pickle.dumps(context)).decode('utf-8'))
-
- # We locally override the command. This shouldn't get persisted back to
- # the job store, or we will detach the job body from the job
- # description. TODO: Don't do it this way! It's weird!
- jobNode.command = ' '.join(workerCommand)
+ workerCommand.append("--context")
+ workerCommand.append(
+ base64.b64encode(pickle.dumps(context)).decode("utf-8")
+ )

- omp_threads = os.environ.get('OMP_NUM_THREADS') \
- or str(max(1, int(jobNode.cores))) # make sure OMP_NUM_THREADS is a positive integer
+ omp_threads = os.environ.get("OMP_NUM_THREADS") or str(
+ max(1, int(jobNode.cores))
+ ) # make sure OMP_NUM_THREADS is a positive integer

  job_environment = {
  # Set the number of cores used by OpenMP applications
- 'OMP_NUM_THREADS': omp_threads,
+ "OMP_NUM_THREADS": omp_threads,
  }

  # jobBatchSystemID is an int for each job
- jobBatchSystemID = self.batchSystem.issueBatchJob(jobNode, job_environment=job_environment)
+ jobBatchSystemID = self.batchSystem.issueBatchJob(
+ " ".join(workerCommand), jobNode, job_environment=job_environment
+ )
  # Record the job by the ID the batch system will use to talk about it with us
  self.issued_jobs_by_batch_system_id[jobBatchSystemID] = jobNode.jobStoreID
  # Record that this job is issued right now and shouldn't e.g. be issued again.
@@ -917,11 +1124,18 @@ class Leader:
  # so increment this value after the job is added to the issuedJob dict
  self.preemptibleJobsIssued += 1
  cur_logger = logger.debug if jobNode.local else logger.info
- cur_logger("Issued job %s with job batch system ID: "
- "%s and %s",
- jobNode, str(jobBatchSystemID), jobNode.requirements_string())
+ cur_logger(
+ "Issued job %s with job batch system ID: " "%s and %s",
+ jobNode,
+ str(jobBatchSystemID),
+ jobNode.requirements_string(),
+ )
  # Tell everyone it is issued and the queue size changed
- self._messages.publish(JobIssuedMessage(jobNode.get_job_kind(), jobNode.jobStoreID, jobBatchSystemID))
+ self._messages.publish(
+ JobIssuedMessage(
+ get_job_kind(jobNode.get_names()), jobNode.jobStoreID, jobBatchSystemID
+ )
+ )
  self._messages.publish(QueueSizeMessage(self.getNumberOfJobsIssued()))
  # Tell the user there's another job to do
  self.progress_overall.total += 1
@@ -941,7 +1155,9 @@ class Leader:
  # Grab the service job description
  service = self.toilState.get_job(service_id)
  if not isinstance(service, ServiceJobDescription):
- raise RuntimeError("The grabbed service job description is not the right type.")
+ raise RuntimeError(
+ "The grabbed service job description is not the right type."
+ )

  if service.preemptible:
  self.preemptibleServiceJobsToBeIssued.append(service_id)
@@ -951,14 +1167,23 @@ class Leader:

  def issueQueingServiceJobs(self):
  """Issues any queuing service jobs up to the limit of the maximum allowed."""
- while len(self.serviceJobsToBeIssued) > 0 and self.serviceJobsIssued < self.config.maxServiceJobs:
+ while (
+ len(self.serviceJobsToBeIssued) > 0
+ and self.serviceJobsIssued < self.config.maxServiceJobs
+ ):
  self.issueJob(self.toilState.get_job(self.serviceJobsToBeIssued.pop()))
  self.serviceJobsIssued += 1
- while len(self.preemptibleServiceJobsToBeIssued) > 0 and self.preemptibleServiceJobsIssued < self.config.maxPreemptibleServiceJobs:
- self.issueJob(self.toilState.get_job(self.preemptibleServiceJobsToBeIssued.pop()))
+ while (
+ len(self.preemptibleServiceJobsToBeIssued) > 0
+ and self.preemptibleServiceJobsIssued
+ < self.config.maxPreemptibleServiceJobs
+ ):
+ self.issueJob(
+ self.toilState.get_job(self.preemptibleServiceJobsToBeIssued.pop())
+ )
  self.preemptibleServiceJobsIssued += 1

- def getNumberOfJobsIssued(self, preemptible: Optional[bool]=None) -> int:
+ def getNumberOfJobsIssued(self, preemptible: Optional[bool] = None) -> int:
  """
  Get number of jobs that have been added by issueJob(s) and not removed by removeJob.

@@ -1008,12 +1233,16 @@ class Leader:
1008
1233
  """
1009
1234
  if jobBatchSystemID not in self.issued_jobs_by_batch_system_id:
1010
1235
  raise RuntimeError("Job was already removed or was never issued.")
1011
- issuedDesc = self.toilState.get_job(self.issued_jobs_by_batch_system_id[jobBatchSystemID])
1236
+ issuedDesc = self.toilState.get_job(
1237
+ self.issued_jobs_by_batch_system_id[jobBatchSystemID]
1238
+ )
1012
1239
  if issuedDesc.preemptible:
1013
1240
  # len(issued_jobs_by_batch_system_id) should always be greater than or equal to preemptibleJobsIssued,
1014
1241
  # so decrement this value before removing the job from the issuedJob map
1015
1242
  if self.preemptibleJobsIssued <= 0:
1016
- raise RuntimeError("The number of preemptive issued jobs cannot be negative.")
1243
+ raise RuntimeError(
1244
+ "The number of preemptive issued jobs cannot be negative."
1245
+ )
1017
1246
  self.preemptibleJobsIssued -= 1
1018
1247
  # It's not issued anymore.
1019
1248
  del self.issued_jobs_by_batch_system_id[jobBatchSystemID]
@@ -1033,19 +1262,24 @@ class Leader:
1033
1262
 
1034
1263
  return issuedDesc
1035
1264
 
1036
- def getJobs(self, preemptible: Optional[bool] = None) -> List[JobDescription]:
1265
+ def getJobs(self, preemptible: Optional[bool] = None) -> list[JobDescription]:
1037
1266
  """
1038
1267
  Get all issued jobs.
1039
1268
 
1040
1269
  :param preemptible: If specified, select only preemptible or only non-preemptible jobs.
1041
1270
  """
1042
1271
 
1043
- jobs = [self.toilState.get_job(job_store_id) for job_store_id in self.issued_jobs_by_batch_system_id.values()]
1272
+ jobs = [
1273
+ self.toilState.get_job(job_store_id)
1274
+ for job_store_id in self.issued_jobs_by_batch_system_id.values()
1275
+ ]
1044
1276
  if preemptible is not None:
1045
1277
  jobs = [job for job in jobs if job.preemptible == preemptible]
1046
1278
  return jobs
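Alongside the reformatting, the annotations here (and in getSuccessors below) move from typing.List/typing.Set to the PEP 585 built-in generics. A two-line illustration of the spelling change only:

```python
from typing import List  # only needed for the old spelling

def issued_old() -> List[str]: ...
def issued_new() -> list[str]: ...   # built-in generic, no typing import required on Python 3.9+
```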
1047
1279
 
1048
- def killJobs(self, jobsToKill):
1280
+ def killJobs(
1281
+ self, jobsToKill, exit_reason: BatchJobExitReason = BatchJobExitReason.KILLED
1282
+ ):
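killJobs now takes an exit_reason, defaulting to BatchJobExitReason.KILLED, so callers that kill jobs for different reasons can record why (the later hunks pass MAXJOBDURATION and MISSING). A hedged, self-contained sketch of the defaulted-reason pattern; ExitReason and kill_jobs below are stand-ins, not Toil's actual names:

```python
from enum import Enum

class ExitReason(Enum):            # stand-in for toil's BatchJobExitReason
    KILLED = 1
    MAXJOBDURATION = 2
    MISSING = 3

def kill_jobs(job_ids, exit_reason: ExitReason = ExitReason.KILLED):
    """Kill each job, then hand it on for processing together with the reason it was killed."""
    return [(job_id, exit_reason) for job_id in job_ids]

kill_jobs([1, 2])                                      # default: KILLED
kill_jobs([3], exit_reason=ExitReason.MAXJOBDURATION)  # over-long jobs
kill_jobs([4], exit_reason=ExitReason.MISSING)         # jobs the batch system lost track of
```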
1049
1283
  """
1050
1284
  Kills the given set of jobs and then sends them for processing.
1051
1285
 
@@ -1059,7 +1293,9 @@ class Leader:
1059
1293
  self.batchSystem.killBatchJobs(jobsToKill)
1060
1294
  for jobBatchSystemID in jobsToKill:
1061
1295
  # Reissue immediately, noting that we killed the job
1062
- willRerun = self.process_finished_job(jobBatchSystemID, 1, exit_reason=BatchJobExitReason.KILLED)
1296
+ willRerun = self.process_finished_job(
1297
+ jobBatchSystemID, 1, exit_reason=exit_reason
1298
+ )
1063
1299
 
1064
1300
  if willRerun:
1065
1301
  # Compose a list of all the jobs that will run again
@@ -1067,8 +1303,7 @@ class Leader:
1067
1303
 
1068
1304
  return jobsRerunning
1069
1305
 
1070
-
1071
- #Following functions handle error cases for when jobs have gone awry with the batch system.
1306
+ # Following functions handle error cases for when jobs have gone awry with the batch system.
1072
1307
 
1073
1308
  def reissueOverLongJobs(self) -> None:
1074
1309
  """
@@ -1079,20 +1314,30 @@ class Leader:
1079
1314
  """
1080
1315
  maxJobDuration = self.config.maxJobDuration
1081
1316
  jobsToKill = []
1082
- if maxJobDuration < 10000000: # We won't bother doing anything if rescue time > 16 weeks.
1317
+ if (
1318
+ maxJobDuration < 10000000
1319
+ ): # We won't bother doing anything if rescue time > 16 weeks.
1083
1320
  runningJobs = self.batchSystem.getRunningBatchJobIDs()
1084
1321
  for jobBatchSystemID in list(runningJobs.keys()):
1085
1322
  if runningJobs[jobBatchSystemID] > maxJobDuration:
1086
- logger.warning("The job: %s has been running for: %s seconds, more than the "
1087
- "max job duration: %s, we'll kill it",
1088
- self.issued_jobs_by_batch_system_id[jobBatchSystemID],
1089
- str(runningJobs[jobBatchSystemID]),
1090
- str(maxJobDuration))
1323
+ logger.warning(
1324
+ "The job: %s has been running for: %s seconds, more than the "
1325
+ "max job duration: %s, we'll kill it",
1326
+ self.issued_jobs_by_batch_system_id[jobBatchSystemID],
1327
+ str(runningJobs[jobBatchSystemID]),
1328
+ str(maxJobDuration),
1329
+ )
1091
1330
  jobsToKill.append(jobBatchSystemID)
1092
- reissued = self.killJobs(jobsToKill)
1331
+ reissued = self.killJobs(
1332
+ jobsToKill, exit_reason=BatchJobExitReason.MAXJOBDURATION
1333
+ )
1093
1334
  if len(jobsToKill) > 0:
1094
1335
  # Summarize our actions
1095
- logger.info("Killed %d over long jobs and reissued %d of them", len(jobsToKill), len(reissued))
1336
+ logger.info(
1337
+ "Killed %d over long jobs and reissued %d of them",
1338
+ len(jobsToKill),
1339
+ len(reissued),
1340
+ )
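reissueOverLongJobs compares each running job's elapsed time against config.maxJobDuration and now kills offenders with BatchJobExitReason.MAXJOBDURATION; the whole check is skipped when the limit is at or above 10,000,000 seconds (roughly 16 weeks), which is treated as "no limit". A minimal sketch of the selection step with made-up runtimes:

```python
def find_overlong(running_seconds: dict, max_job_duration: float) -> list:
    """Pick batch-system IDs whose runtime exceeds the limit; no-op if the limit is effectively off."""
    if max_job_duration >= 10_000_000:   # ~16 weeks: treated as unlimited
        return []
    return [job_id for job_id, elapsed in running_seconds.items() if elapsed > max_job_duration]

find_overlong({101: 120.0, 102: 90_000.0}, max_job_duration=86_400)  # -> [102]
```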
1096
1341
 
1097
1342
  def reissueMissingJobs(self, killAfterNTimesMissing=3):
1098
1343
  """
@@ -1104,11 +1349,13 @@ class Leader:
1104
1349
  """
1105
1350
  issuedJobs = set(self.batchSystem.getIssuedBatchJobIDs())
1106
1351
  jobBatchSystemIDsSet = set(list(self.issued_jobs_by_batch_system_id.keys()))
1107
- #Clean up the reissueMissingJobs_missingHash hash, getting rid of jobs that have turned up
1352
+ # Clean up the reissueMissingJobs_missingHash hash, getting rid of jobs that have turned up
1108
1353
  missingJobIDsSet = set(list(self.reissueMissingJobs_missingHash.keys()))
1109
1354
  for jobBatchSystemID in missingJobIDsSet.difference(jobBatchSystemIDsSet):
1110
1355
  self.reissueMissingJobs_missingHash.pop(jobBatchSystemID)
1111
- logger.warning("Batch system id: %s is no longer missing", str(jobBatchSystemID))
1356
+ logger.warning(
1357
+ "Batch system id: %s is no longer missing", str(jobBatchSystemID)
1358
+ )
1112
1359
  # checks we have no unexpected jobs running
1113
1360
  if not issuedJobs.issubset(jobBatchSystemIDsSet):
1114
1361
  raise RuntimeError("An unexpected job is still running.")
@@ -1120,24 +1367,33 @@ class Leader:
1120
1367
  else:
1121
1368
  self.reissueMissingJobs_missingHash[jobBatchSystemID] = 1
1122
1369
  timesMissing = self.reissueMissingJobs_missingHash[jobBatchSystemID]
1123
- logger.warning("Job store ID %s with batch system id %s is missing for the %i time",
1124
- jobStoreID, str(jobBatchSystemID), timesMissing)
1370
+ logger.warning(
1371
+ "Job store ID %s with batch system id %s is missing for the %i time",
1372
+ jobStoreID,
1373
+ str(jobBatchSystemID),
1374
+ timesMissing,
1375
+ )
1125
1376
  # Tell everyone it is missing
1126
1377
  self._messages.publish(JobMissingMessage(jobStoreID))
1127
1378
  if timesMissing == killAfterNTimesMissing:
1128
1379
  self.reissueMissingJobs_missingHash.pop(jobBatchSystemID)
1129
1380
  jobsToKill.append(jobBatchSystemID)
1130
- self.killJobs(jobsToKill)
1131
- return len( self.reissueMissingJobs_missingHash ) == 0 #We use this to inform
1132
- #if there are missing jobs
1381
+ self.killJobs(jobsToKill, exit_reason=BatchJobExitReason.MISSING)
1382
+ return len(self.reissueMissingJobs_missingHash) == 0 # We use this to inform
1383
+ # if there are missing jobs
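reissueMissingJobs keeps a per-job "times missing" counter and only kills a job once it has gone unreported for killAfterNTimesMissing consecutive polls, now tagging it with BatchJobExitReason.MISSING. A compact sketch of that escalation pattern; the names below are illustrative, not the Leader's actual attributes:

```python
def find_jobs_to_kill(issued, reported, missing_counts, kill_after=3):
    """Track how many consecutive polls each issued job has gone unreported; return ones to kill."""
    to_kill = []
    for job_id in set(missing_counts) - set(issued):
        del missing_counts[job_id]          # job is no longer issued, stop tracking it
    for job_id in set(issued) - set(reported):
        missing_counts[job_id] = missing_counts.get(job_id, 0) + 1
        if missing_counts[job_id] == kill_after:
            del missing_counts[job_id]
            to_kill.append(job_id)
    return to_kill

counts = {}
find_jobs_to_kill(issued=[1, 2, 3], reported=[1], missing_counts=counts)  # jobs 2 and 3 start counting
```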
1133
1384
 
1134
1385
  def processRemovedJob(self, issuedJob, result_status):
1135
1386
  if result_status != 0:
1136
- logger.warning("Despite the batch system claiming failure the "
1137
- "job %s seems to have finished and been removed", issuedJob)
1387
+ logger.warning(
1388
+ "Despite the batch system claiming failure the "
1389
+ "job %s seems to have finished and been removed",
1390
+ issuedJob,
1391
+ )
1138
1392
  self._updatePredecessorStatus(issuedJob.jobStoreID)
1139
1393
 
1140
- def process_finished_job(self, batch_system_id, result_status, wall_time=None, exit_reason=None) -> bool:
1394
+ def process_finished_job(
1395
+ self, batch_system_id, result_status, wall_time=None, exit_reason=None
1396
+ ) -> bool:
1141
1397
  """
1142
1398
  Process finished jobs.
1143
1399
 
@@ -1157,15 +1413,21 @@ class Leader:
1157
1413
  self.progress_overall.update(incr=-1)
1158
1414
  self.progress_failed.update(incr=1)
1159
1415
 
1160
- # Delegate to the vers
1161
- return self.process_finished_job_description(issued_job, result_status, wall_time, exit_reason, batch_system_id)
1162
-
1163
- def process_finished_job_description(self, finished_job: JobDescription, result_status: int,
1164
- wall_time: Optional[float] = None,
1165
- exit_reason: Optional[BatchJobExitReason] = None,
1166
- batch_system_id: Optional[int] = None) -> bool:
1416
+ # Delegate to the version that uses a JobDescription
1417
+ return self.process_finished_job_description(
1418
+ issued_job, result_status, wall_time, exit_reason, batch_system_id
1419
+ )
1420
+
1421
+ def process_finished_job_description(
1422
+ self,
1423
+ finished_job: JobDescription,
1424
+ result_status: int,
1425
+ wall_time: Optional[float] = None,
1426
+ exit_reason: Optional[BatchJobExitReason] = None,
1427
+ batch_system_id: Optional[int] = None,
1428
+ ) -> bool:
1167
1429
  """
1168
- Process a finished JobDescription based upon its succees or failure.
1430
+ Process a finished JobDescription based upon its success or failure.
1169
1431
 
1170
1432
  If wall-clock time is available, informs the cluster scaler about the
1171
1433
  job finishing.
@@ -1185,22 +1447,67 @@ class Leader:
1185
1447
  # TODO: Use message bus?
1186
1448
  self.clusterScaler.addCompletedJob(finished_job, wall_time)
1187
1449
  if self.toilState.job_exists(job_store_id):
1188
- logger.debug("Job %s continues to exist (i.e. has more to do)", finished_job)
1450
+ logger.debug(
1451
+ "Job %s continues to exist (i.e. has more to do)", finished_job
1452
+ )
1189
1453
  try:
1190
1454
  # Reload the job as modified by the worker
1191
- self.toilState.reset_job(job_store_id)
1192
- replacement_job = self.toilState.get_job(job_store_id)
1455
+ if finished_job.has_body():
1456
+ # The worker was expected to do some work. We expect the
1457
+ # worker to have updated the job description.
1458
+
1459
+ # If the job succeeded, we wait around to see the update
1460
+ # and fail the job if we don't see it.
1461
+ if result_status == 0:
1462
+ timeout = self.config.job_store_timeout
1463
+ complaint = (
1464
+ f"has no new version available after {timeout} "
1465
+ "seconds. Either worker updates to "
1466
+ "the job store are delayed longer than your "
1467
+ "--jobStoreTimeout, or the worker trying to run the "
1468
+ "job was killed (or never started)."
1469
+ )
1470
+ else:
1471
+ timeout = 0
1472
+ complaint = (
1473
+ "has no new version available immediately. The "
1474
+ "batch system may have killed (or never started) "
1475
+ "the Toil worker."
1476
+ )
1477
+ change_detected = self.toilState.reset_job_expecting_change(
1478
+ job_store_id, timeout
1479
+ )
1480
+ replacement_job = self.toilState.get_job(job_store_id)
1481
+
1482
+ if not change_detected:
1483
+ logger.warning("Job %s %s", replacement_job, complaint)
1484
+ if result_status == 0:
1485
+ # Make the job fail because we ran it and it finished
1486
+ # and we never heard back.
1487
+ logger.error(
1488
+ "Marking ostensibly successful job %s that did "
1489
+ "not report in to the job store before "
1490
+ "--jobStoreTimeout as having been partitioned "
1491
+ "from us.",
1492
+ replacement_job,
1493
+ )
1494
+ result_status = EXIT_STATUS_UNAVAILABLE_VALUE
1495
+ exit_reason = BatchJobExitReason.PARTITION
1496
+ else:
1497
+ # If there was no body sent, the worker won't commit any
1498
+ # changes to the job description. So don't wait around for
1499
+ # any and don't complain if we don't see them.
1500
+ self.toilState.reset_job(job_store_id)
1501
+ replacement_job = self.toilState.get_job(job_store_id)
1502
+
1193
1503
  except NoSuchJobException:
1194
1504
  # We have a ghost job - the job has been deleted but a stale
1195
1505
  # read from e.g. a non-POSIX-compliant filesystem gave us a
1196
1506
  # false positive when we checked for its existence. Process the
1197
1507
  # job from here as any other job removed from the job store.
1198
- # This is a hack until we can figure out how to actually always
1199
- # have a strongly-consistent communications channel. See
1200
- # https://github.com/BD2KGenomics/toil/issues/1091
1201
- logger.warning('Got a stale read for job %s; caught its '
1202
- 'completion in time, but other jobs may try to run twice! Fix '
1203
- 'the consistency of your job store storage!', finished_job)
1508
+ logger.debug(
1509
+ "Job %s is actually complete upon closer inspection", finished_job
1510
+ )
1204
1511
  self.processRemovedJob(finished_job, result_status)
1205
1512
  return False
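The largest behavioural change in this hunk: when a job that had a body reports success, the leader now waits up to config.job_store_timeout for the worker's updated JobDescription to appear (reset_job_expecting_change), and if no update shows up it reclassifies the job as failed with EXIT_STATUS_UNAVAILABLE_VALUE and BatchJobExitReason.PARTITION; bodiless jobs skip the wait entirely. A simplified, self-contained sketch of that decision flow, with stand-ins in place of the real ToilState and job store:

```python
import time

EXIT_STATUS_UNAVAILABLE_VALUE = 255   # stand-in value; the real constant lives elsewhere in Toil

def settle_result(has_body: bool, result_status: int, saw_update, timeout: float):
    """Decide a finished job's final status, waiting for a job-store update when one is expected."""
    if not has_body:
        return result_status, None          # no body, so no update to wait for
    wait = timeout if result_status == 0 else 0.0
    deadline = time.monotonic() + wait
    while not saw_update():
        if time.monotonic() >= deadline:
            if result_status == 0:
                # "Successful" job never reported back: treat it as partitioned from us.
                return EXIT_STATUS_UNAVAILABLE_VALUE, "PARTITION"
            return result_status, None
        time.sleep(0.1)
    return result_status, None

# e.g. a worker that never checks in within the timeout:
settle_result(has_body=True, result_status=0, saw_update=lambda: False, timeout=0.2)
```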
1206
1513
  if replacement_job.logJobStoreFileID is not None:
@@ -1208,17 +1515,31 @@ class Leader:
1208
1515
  # more memory efficient than read().striplines() while leaving off the
1209
1516
  # trailing \n left when using readlines()
1210
1517
  # http://stackoverflow.com/a/15233739
1211
- StatsAndLogging.logWithFormatting(job_store_id, log_stream, method=logger.warning,
1212
- message='The job seems to have left a log file, indicating failure: %s' % replacement_job)
1518
+ StatsAndLogging.logWithFormatting(
1519
+ f'Log from job "{job_store_id}"',
1520
+ log_stream,
1521
+ method=logger.warning,
1522
+ message="The job seems to have left a log file, indicating failure: %s"
1523
+ % replacement_job,
1524
+ )
1213
1525
  if self.config.writeLogs or self.config.writeLogsGzip:
1214
1526
  with replacement_job.getLogFileHandle(self.jobStore) as log_stream:
1215
- StatsAndLogging.writeLogFiles(replacement_job.chainedJobs, log_stream, self.config, failed=True)
1527
+ # Send log data from the job store to each per-job log file involved.
1528
+ StatsAndLogging.writeLogFiles(
1529
+ [names.stats_name for names in replacement_job.get_chain()],
1530
+ log_stream,
1531
+ self.config,
1532
+ failed=True,
1533
+ )
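Failed-job logs are now captioned with the job store ID and fanned out to one log file per job in the chain, using get_chain()/stats_name in place of the old chainedJobs list. A rough sketch of the naming step, with a hypothetical chain record standing in for whatever get_chain() actually returns:

```python
from typing import NamedTuple, Optional

class ChainEntry(NamedTuple):        # stand-in for the per-job records returned by get_chain()
    stats_name: str

def log_file_names(chain: list, suffix: Optional[str] = None) -> list:
    """One log name per chained job, optionally tagged with a batch-system file root."""
    names = [entry.stats_name for entry in chain]
    return [f"{name}_{suffix}" for name in names] if suffix else names

log_file_names([ChainEntry("map"), ChainEntry("reduce")], suffix="toil_job_7_std_output")
```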
1216
1534
  if result_status != 0:
1217
1535
  # If the batch system returned a non-zero exit code then the worker
1218
1536
  # is assumed not to have captured the failure of the job, so we
1219
1537
  # reduce the try count here.
1220
1538
  if replacement_job.logJobStoreFileID is None:
1221
- logger.warning("No log file is present, despite job failing: %s", replacement_job)
1539
+ logger.warning(
1540
+ "No log file is present, despite job failing: %s",
1541
+ replacement_job,
1542
+ )
1222
1543
 
1223
1544
  if batch_system_id is not None:
1224
1545
  # Look for any standard output/error files created by the batch system.
@@ -1227,31 +1548,60 @@ class Leader:
1227
1548
  # --workDir / TOIL_WORKDIR is on a shared file system.
1228
1549
  # They live directly in the Toil work directory because that is
1229
1550
  # guaranteed to exist on the leader and workers.
1230
- file_list = glob.glob(self.batchSystem.format_std_out_err_glob(batch_system_id))
1551
+ file_list = glob.glob(
1552
+ self.batchSystem.format_std_out_err_glob(batch_system_id)
1553
+ )
1231
1554
  for log_file in file_list:
1232
1555
  try:
1233
- log_stream = open(log_file, 'rb')
1556
+ log_stream = open(log_file, "rb")
1234
1557
  except:
1235
- logger.warning('The batch system left a file %s, but it could not be opened' % log_file)
1558
+ logger.warning(
1559
+ "The batch system left a file %s, but it could not be opened"
1560
+ % log_file
1561
+ )
1236
1562
  else:
1237
1563
  with log_stream:
1238
1564
  if os.path.getsize(log_file) > 0:
1239
- StatsAndLogging.logWithFormatting(job_store_id, log_stream, method=logger.warning,
1240
- message='The batch system left a non-empty file %s:' % log_file)
1241
- if self.config.writeLogs or self.config.writeLogsGzip:
1242
- file_root, _ = os.path.splitext(os.path.basename(log_file))
1243
- job_names = replacement_job.chainedJobs
1244
- if job_names is None: # For jobs that fail this way, replacement_job.chainedJobs is not guaranteed to be set
1245
- job_names = [str(replacement_job)]
1246
- job_names = [j + '_' + file_root for j in job_names]
1565
+ StatsAndLogging.logWithFormatting(
1566
+ f'Log from job "{job_store_id}"',
1567
+ log_stream,
1568
+ method=logger.warning,
1569
+ message="The batch system left a non-empty file %s:"
1570
+ % log_file,
1571
+ )
1572
+ if (
1573
+ self.config.writeLogs
1574
+ or self.config.writeLogsGzip
1575
+ ):
1576
+ file_root, _ = os.path.splitext(
1577
+ os.path.basename(log_file)
1578
+ )
1579
+ job_names = [
1580
+ names.stats_name
1581
+ for names in replacement_job.get_chain()
1582
+ ]
1583
+ # Tack the batch system log file name onto each job's name
1584
+ job_names = [
1585
+ j + "_" + file_root for j in job_names
1586
+ ]
1247
1587
  log_stream.seek(0)
1248
- StatsAndLogging.writeLogFiles(job_names, log_stream, self.config, failed=True)
1588
+ StatsAndLogging.writeLogFiles(
1589
+ job_names,
1590
+ log_stream,
1591
+ self.config,
1592
+ failed=True,
1593
+ )
1249
1594
  else:
1250
- logger.warning('The batch system left an empty file %s' % log_file)
1595
+ logger.warning(
1596
+ "The batch system left an empty file %s"
1597
+ % log_file
1598
+ )
1251
1599
 
1252
1600
  # Tell the job to reset itself after a failure.
1253
1601
  # It needs to know the failure reason if available; some are handled specially.
1254
- replacement_job.setupJobAfterFailure(exit_status=result_status, exit_reason=exit_reason)
1602
+ replacement_job.setupJobAfterFailure(
1603
+ exit_status=result_status, exit_reason=exit_reason
1604
+ )
1255
1605
  self.toilState.commit_job(job_store_id)
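On failure the leader also sweeps up any per-job standard output/error files the batch system left in the shared work directory: empty files are merely noted, while non-empty ones are logged and, when per-job log writing is enabled, saved under names that combine each chained job's name with the file's root. A small sketch of that sweep, assuming a hypothetical glob pattern rather than the real format_std_out_err_glob():

```python
import glob
import os

def sweep_batch_logs(pattern: str):
    """Yield (file_root, contents) for every non-empty batch-system log matching the pattern."""
    for path in glob.glob(pattern):
        if os.path.getsize(path) == 0:
            print(f"The batch system left an empty file {path}")
            continue
        root, _ = os.path.splitext(os.path.basename(path))
        with open(path, "rb") as stream:
            yield root, stream.read()

# Hypothetical pattern; the real one comes from batchSystem.format_std_out_err_glob(batch_system_id).
for root, data in sweep_batch_logs("/tmp/toil_workdir/toil_job_42_*"):
    print(root, len(data))
```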
1256
1606
 
1257
1607
  elif job_store_id in self.toilState.hasFailedSuccessors:
@@ -1259,18 +1609,20 @@ class Leader:
1259
1609
  self.toilState.hasFailedSuccessors.remove(job_store_id)
1260
1610
 
1261
1611
  # Now that we know the job is done we can add it to the list of updated jobs
1262
- self._messages.publish(JobUpdatedMessage(replacement_job.jobStoreID, result_status))
1612
+ self._messages.publish(
1613
+ JobUpdatedMessage(replacement_job.jobStoreID, result_status)
1614
+ )
1263
1615
  logger.debug("Added job: %s to updated jobs", replacement_job)
1264
1616
 
1265
1617
  # Return True if it will rerun (still has retries) and false if it
1266
1618
  # is completely failed.
1267
1619
  return replacement_job.remainingTryCount > 0
1268
- else: #The job is done
1620
+ else: # The job is done
1269
1621
  self.processRemovedJob(finished_job, result_status)
1270
1622
  # Being done, it won't run again.
1271
1623
  return False
1272
1624
 
1273
- def getSuccessors(self, job_id: str, alreadySeenSuccessors: Set[str]) -> Set[str]:
1625
+ def getSuccessors(self, job_id: str, alreadySeenSuccessors: set[str]) -> set[str]:
1274
1626
  """
1275
1627
  Get successors of the given job by walking the job graph recursively.
1276
1628
 
@@ -1278,6 +1630,7 @@ class Leader:
1278
1630
  :returns: The set of found successors. This set is added to alreadySeenSuccessors.
1279
1631
  """
1280
1632
  successors = set()
1633
+
1281
1634
  def successorRecursion(job_id: str) -> None:
1282
1635
  # TODO: do we need to reload from the job store here, or is the cache OK?
1283
1636
  jobDesc = self.toilState.get_job(job_id)
@@ -1309,12 +1662,15 @@ class Leader:
1309
1662
 
1310
1663
  # Tell everyone it failed
1311
1664
 
1312
- self._messages.publish(JobFailedMessage(job_desc.get_job_kind(), job_id))
1665
+ self._messages.publish(
1666
+ JobFailedMessage(get_job_kind(job_desc.get_names()), job_id)
1667
+ )
1313
1668
 
1314
1669
  if job_id in self.toilState.service_to_client:
1315
1670
  # Is a service job
1316
- logger.debug("Service job is being processed as a totally failed job: %s", job_desc)
1317
-
1671
+ logger.debug(
1672
+ "Service job is being processed as a totally failed job: %s", job_desc
1673
+ )
1318
1674
 
1319
1675
  if not isinstance(job_desc, ServiceJobDescription):
1320
1676
  raise RuntimeError("The service job description type is incorrect.")
@@ -1338,8 +1694,13 @@ class Leader:
1338
1694
  # properly, and to remember that this service failed with an error
1339
1695
  # and possibly never started.
1340
1696
  if client_id in self.toilState.servicesIssued:
1341
- self.serviceManager.kill_services(self.toilState.servicesIssued[client_id], error=True)
1342
- logger.warning("Job: %s is instructing all other services of its parent job to quit", job_desc)
1697
+ self.serviceManager.kill_services(
1698
+ self.toilState.servicesIssued[client_id], error=True
1699
+ )
1700
+ logger.warning(
1701
+ "Job: %s is instructing all other services of its parent job to quit",
1702
+ job_desc,
1703
+ )
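When a service job fails permanently, the leader records the failure and tells every sibling service of the same client job to shut down with error=True, so the client cannot keep waiting on a half-started service group. A toy sketch of that bookkeeping, using plain dicts and a callback instead of the real ServiceManager:

```python
def fail_service(services_by_client: dict, failed_services: set, client_id: str, service_id: str, kill):
    """Record one failed service and ask its siblings (services of the same client) to quit."""
    failed_services.add(service_id)
    siblings = services_by_client.get(client_id, set())
    if siblings:
        kill(siblings, error=True)

state = {"client-1": {"svc-a", "svc-b"}}
fail_service(state, set(), "client-1", "svc-a",
             kill=lambda ids, error: print("killing", sorted(ids), "error =", error))
```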
1343
1704
 
1344
1705
  # This ensures that the job will not attempt to run any of it's
1345
1706
  # successors on the stack
@@ -1363,9 +1724,14 @@ class Leader:
1363
1724
  # Any successor already in toilState.failedSuccessors will not be traversed
1364
1725
  # All successors traversed will be added to toilState.failedSuccessors and returned
1365
1726
  # as a set (unseenSuccessors).
1366
- unseenSuccessors = self.getSuccessors(job_id, self.toilState.failedSuccessors)
1367
- logger.debug("Found new failed successors: %s of job: %s", " ".join(
1368
- unseenSuccessors), job_desc)
1727
+ unseenSuccessors = self.getSuccessors(
1728
+ job_id, self.toilState.failedSuccessors
1729
+ )
1730
+ logger.debug(
1731
+ "Found new failed successors: %s of job: %s",
1732
+ " ".join(unseenSuccessors),
1733
+ job_desc,
1734
+ )
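After a total failure, the leader walks the failed job's successor graph, records every successor not already known to have failed, and then flags each predecessor still waiting on those successors. A self-contained sketch of that traversal over a plain dict-based graph (the real code walks JobDescriptions through ToilState):

```python
def collect_new_failed_successors(graph: dict, job_id: str, already_failed: set) -> set:
    """Return successors of job_id not yet in already_failed, adding them to it as a side effect."""
    found = set()

    def walk(current: str) -> None:
        for successor in graph.get(current, []):
            if successor not in already_failed and successor not in found:
                found.add(successor)
                walk(successor)

    walk(job_id)
    already_failed |= found
    return found

graph = {"a": ["b", "c"], "b": ["d"], "c": ["d"]}
collect_new_failed_successors(graph, "a", already_failed={"c"})  # -> {"b", "d"}
```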
1369
1735
 
1370
1736
  # For each newly found successor
1371
1737
  for successorJobStoreID in unseenSuccessors:
@@ -1376,7 +1742,9 @@ class Leader:
1376
1742
  # For each such predecessor job
1377
1743
  # (we remove the successor from toilState.successor_to_predecessors to avoid doing
1378
1744
  # this multiple times for each failed predecessor)
1379
- for predecessor_id in self.toilState.successor_to_predecessors.pop(successorJobStoreID):
1745
+ for predecessor_id in self.toilState.successor_to_predecessors.pop(
1746
+ successorJobStoreID
1747
+ ):
1380
1748
 
1381
1749
  predecessor = self.toilState.get_job(predecessor_id)
1382
1750
 
@@ -1385,8 +1753,11 @@ class Leader:
1385
1753
 
1386
1754
  # Indicate that it has failed jobs.
1387
1755
  self.toilState.hasFailedSuccessors.add(predecessor_id)
1388
- logger.debug("Marking job: %s as having failed successors (found by "
1389
- "reading successors failed job)", predecessor)
1756
+ logger.debug(
1757
+ "Marking job: %s as having failed successors (found by "
1758
+ "reading successors failed job)",
1759
+ predecessor,
1760
+ )
1390
1761
 
1391
1762
  # If the predecessor has no remaining successors, add to list of updated jobs
1392
1763
  if self.toilState.count_pending_successors(predecessor_id) == 0:
@@ -1400,8 +1771,12 @@ class Leader:
1400
1771
 
1401
1772
  # Mark the predecessor as failed
1402
1773
  self.toilState.hasFailedSuccessors.add(predecessor_id)
1403
- logger.debug("Totally failed job: %s is marking direct predecessor: %s "
1404
- "as having failed jobs", job_desc, self.toilState.get_job(predecessor_id))
1774
+ logger.debug(
1775
+ "Totally failed job: %s is marking direct predecessor: %s "
1776
+ "as having failed jobs",
1777
+ job_desc,
1778
+ self.toilState.get_job(predecessor_id),
1779
+ )
1405
1780
 
1406
1781
  self._updatePredecessorStatus(job_id)
1407
1782
 
@@ -1411,38 +1786,59 @@ class Leader:
1411
1786
  # Is a service host job, so its predecessor is its client
1412
1787
  client_id = self.toilState.service_to_client.pop(jobStoreID)
1413
1788
  self.toilState.servicesIssued[client_id].remove(jobStoreID)
1414
- if len(self.toilState.servicesIssued[client_id]) == 0: # Predecessor job has
1789
+ if (
1790
+ len(self.toilState.servicesIssued[client_id]) == 0
1791
+ ): # Predecessor job has
1415
1792
  # all its services terminated
1416
- self.toilState.servicesIssued.pop(client_id) # The job has no running services
1793
+ self.toilState.servicesIssued.pop(
1794
+ client_id
1795
+ ) # The job has no running services
1417
1796
 
1418
- logger.debug('Job %s is no longer waiting on services; all services have stopped', self.toilState.get_job(client_id))
1797
+ logger.debug(
1798
+ "Job %s is no longer waiting on services; all services have stopped",
1799
+ self.toilState.get_job(client_id),
1800
+ )
1419
1801
 
1420
1802
  # Now we know the job is done we can add it to the list of
1421
1803
  # updated job files
1422
1804
  self._messages.publish(JobUpdatedMessage(client_id, 0))
1423
1805
  else:
1424
- logger.debug('Job %s is still waiting on %d services',
1425
- self.toilState.get_job(client_id),
1426
- len(self.toilState.servicesIssued[client_id]))
1806
+ logger.debug(
1807
+ "Job %s is still waiting on %d services",
1808
+ self.toilState.get_job(client_id),
1809
+ len(self.toilState.servicesIssued[client_id]),
1810
+ )
1427
1811
  elif jobStoreID not in self.toilState.successor_to_predecessors:
1428
- #We have reach the root job
1812
+ # We have reach the root job
1429
1813
  if self._messages.count(JobUpdatedMessage) != 0:
1430
1814
  raise RuntimeError("Root job is done but other jobs are still updated")
1431
1815
  if len(self.toilState.successor_to_predecessors) != 0:
1432
- raise RuntimeError("Job {} is finished and had no predecessor, but we have other outstanding jobs "
1433
- "with predecessors: {}".format(jobStoreID, self.toilState.successor_to_predecessors.keys()))
1816
+ raise RuntimeError(
1817
+ "Job {} is finished and had no predecessor, but we have other outstanding jobs "
1818
+ "with predecessors: {}".format(
1819
+ jobStoreID, self.toilState.successor_to_predecessors.keys()
1820
+ )
1821
+ )
1434
1822
  if len(self.toilState.successorCounts) != 0:
1435
- raise RuntimeError("Root job is done but jobs waiting on successors: {self.toilState.successorCounts}")
1436
- logger.debug("Reached root job %s so no predecessors to clean up" % jobStoreID)
1823
+ raise RuntimeError(
1824
+ "Root job is done but jobs waiting on successors: {self.toilState.successorCounts}"
1825
+ )
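Worth noting for reviewers: both the old and the new form of this RuntimeError message put {self.toilState.successorCounts} inside a plain string literal, so the braces are printed verbatim rather than interpolated; the same pattern appears in the predecessor-type check near the end of this section. An illustrative snippet, outside the diff, showing the difference an f prefix makes:

```python
# Illustrative only, not part of the diff: without the f prefix the braces stay literal text.
counts = {"job-1": 2}
plain = "Root job is done but jobs waiting on successors: {counts}"
formatted = f"Root job is done but jobs waiting on successors: {counts}"
print(plain)      # braces printed verbatim
print(formatted)  # shows {'job-1': 2}
```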
1826
+ logger.debug(
1827
+ "Reached root job %s so no predecessors to clean up" % jobStoreID
1828
+ )
1437
1829
 
1438
1830
  else:
1439
1831
  # Is a non-root, non-service job
1440
1832
  logger.debug("Cleaning the predecessors of %s" % jobStoreID)
1441
1833
 
1442
1834
  # For each predecessor
1443
- for predecessor_id in self.toilState.successor_to_predecessors.pop(jobStoreID):
1835
+ for predecessor_id in self.toilState.successor_to_predecessors.pop(
1836
+ jobStoreID
1837
+ ):
1444
1838
  if not isinstance(predecessor_id, str):
1445
- raise RuntimeError("Predecessor ID should be str but is {type(predecessor_id)}")
1839
+ raise RuntimeError(
1840
+ "Predecessor ID should be str but is {type(predecessor_id)}"
1841
+ )
1446
1842
  predecessor = self.toilState.get_job(predecessor_id)
1447
1843
 
1448
1844
  # Tell the predecessor that this job is done (keep only other successor jobs)