toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193) hide show
  1. toil/__init__.py +122 -315
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +173 -89
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
  5. toil/batchSystems/awsBatch.py +244 -135
  6. toil/batchSystems/cleanup_support.py +26 -16
  7. toil/batchSystems/contained_executor.py +31 -28
  8. toil/batchSystems/gridengine.py +86 -50
  9. toil/batchSystems/htcondor.py +166 -89
  10. toil/batchSystems/kubernetes.py +632 -382
  11. toil/batchSystems/local_support.py +20 -15
  12. toil/batchSystems/lsf.py +134 -81
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +290 -151
  16. toil/batchSystems/mesos/executor.py +79 -50
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +46 -28
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +296 -125
  21. toil/batchSystems/slurm.py +603 -138
  22. toil/batchSystems/torque.py +47 -33
  23. toil/bus.py +186 -76
  24. toil/common.py +664 -368
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1136 -483
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +63 -42
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +140 -60
  32. toil/fileStores/cachingFileStore.py +717 -269
  33. toil/fileStores/nonCachingFileStore.py +116 -87
  34. toil/job.py +1225 -368
  35. toil/jobStores/abstractJobStore.py +416 -266
  36. toil/jobStores/aws/jobStore.py +863 -477
  37. toil/jobStores/aws/utils.py +201 -120
  38. toil/jobStores/conftest.py +3 -2
  39. toil/jobStores/fileJobStore.py +292 -154
  40. toil/jobStores/googleJobStore.py +140 -74
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +668 -272
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +74 -31
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +214 -39
  49. toil/lib/aws/utils.py +287 -231
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +104 -47
  53. toil/lib/docker.py +131 -103
  54. toil/lib/ec2.py +361 -199
  55. toil/lib/ec2nodes.py +174 -106
  56. toil/lib/encryption/_dummy.py +5 -3
  57. toil/lib/encryption/_nacl.py +10 -6
  58. toil/lib/encryption/conftest.py +1 -0
  59. toil/lib/exceptions.py +26 -7
  60. toil/lib/expando.py +5 -3
  61. toil/lib/ftp_utils.py +217 -0
  62. toil/lib/generatedEC2Lists.py +127 -19
  63. toil/lib/humanize.py +6 -2
  64. toil/lib/integration.py +341 -0
  65. toil/lib/io.py +141 -15
  66. toil/lib/iterables.py +4 -2
  67. toil/lib/memoize.py +12 -8
  68. toil/lib/misc.py +66 -21
  69. toil/lib/objects.py +2 -2
  70. toil/lib/resources.py +68 -15
  71. toil/lib/retry.py +126 -81
  72. toil/lib/threading.py +299 -82
  73. toil/lib/throttle.py +16 -15
  74. toil/options/common.py +843 -409
  75. toil/options/cwl.py +175 -90
  76. toil/options/runner.py +50 -0
  77. toil/options/wdl.py +73 -17
  78. toil/provisioners/__init__.py +117 -46
  79. toil/provisioners/abstractProvisioner.py +332 -157
  80. toil/provisioners/aws/__init__.py +70 -33
  81. toil/provisioners/aws/awsProvisioner.py +1145 -715
  82. toil/provisioners/clusterScaler.py +541 -279
  83. toil/provisioners/gceProvisioner.py +282 -179
  84. toil/provisioners/node.py +155 -79
  85. toil/realtimeLogger.py +34 -22
  86. toil/resource.py +137 -75
  87. toil/server/app.py +128 -62
  88. toil/server/celery_app.py +3 -1
  89. toil/server/cli/wes_cwl_runner.py +82 -53
  90. toil/server/utils.py +54 -28
  91. toil/server/wes/abstract_backend.py +64 -26
  92. toil/server/wes/amazon_wes_utils.py +21 -15
  93. toil/server/wes/tasks.py +121 -63
  94. toil/server/wes/toil_backend.py +142 -107
  95. toil/server/wsgi_app.py +4 -3
  96. toil/serviceManager.py +58 -22
  97. toil/statsAndLogging.py +224 -70
  98. toil/test/__init__.py +282 -183
  99. toil/test/batchSystems/batchSystemTest.py +460 -210
  100. toil/test/batchSystems/batch_system_plugin_test.py +90 -0
  101. toil/test/batchSystems/test_gridengine.py +173 -0
  102. toil/test/batchSystems/test_lsf_helper.py +67 -58
  103. toil/test/batchSystems/test_slurm.py +110 -49
  104. toil/test/cactus/__init__.py +0 -0
  105. toil/test/cactus/test_cactus_integration.py +56 -0
  106. toil/test/cwl/cwlTest.py +496 -287
  107. toil/test/cwl/measure_default_memory.cwl +12 -0
  108. toil/test/cwl/not_run_required_input.cwl +29 -0
  109. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  110. toil/test/cwl/seqtk_seq.cwl +1 -1
  111. toil/test/docs/scriptsTest.py +69 -46
  112. toil/test/jobStores/jobStoreTest.py +427 -264
  113. toil/test/lib/aws/test_iam.py +118 -50
  114. toil/test/lib/aws/test_s3.py +16 -9
  115. toil/test/lib/aws/test_utils.py +5 -6
  116. toil/test/lib/dockerTest.py +118 -141
  117. toil/test/lib/test_conversions.py +113 -115
  118. toil/test/lib/test_ec2.py +58 -50
  119. toil/test/lib/test_integration.py +104 -0
  120. toil/test/lib/test_misc.py +12 -5
  121. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  122. toil/test/mesos/helloWorld.py +7 -6
  123. toil/test/mesos/stress.py +25 -20
  124. toil/test/options/__init__.py +13 -0
  125. toil/test/options/options.py +42 -0
  126. toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
  127. toil/test/provisioners/clusterScalerTest.py +440 -250
  128. toil/test/provisioners/clusterTest.py +166 -44
  129. toil/test/provisioners/gceProvisionerTest.py +174 -100
  130. toil/test/provisioners/provisionerTest.py +25 -13
  131. toil/test/provisioners/restartScript.py +5 -4
  132. toil/test/server/serverTest.py +188 -141
  133. toil/test/sort/restart_sort.py +137 -68
  134. toil/test/sort/sort.py +134 -66
  135. toil/test/sort/sortTest.py +91 -49
  136. toil/test/src/autoDeploymentTest.py +141 -101
  137. toil/test/src/busTest.py +20 -18
  138. toil/test/src/checkpointTest.py +8 -2
  139. toil/test/src/deferredFunctionTest.py +49 -35
  140. toil/test/src/dockerCheckTest.py +32 -24
  141. toil/test/src/environmentTest.py +135 -0
  142. toil/test/src/fileStoreTest.py +539 -272
  143. toil/test/src/helloWorldTest.py +7 -4
  144. toil/test/src/importExportFileTest.py +61 -31
  145. toil/test/src/jobDescriptionTest.py +46 -21
  146. toil/test/src/jobEncapsulationTest.py +2 -0
  147. toil/test/src/jobFileStoreTest.py +74 -50
  148. toil/test/src/jobServiceTest.py +187 -73
  149. toil/test/src/jobTest.py +121 -71
  150. toil/test/src/miscTests.py +19 -18
  151. toil/test/src/promisedRequirementTest.py +82 -36
  152. toil/test/src/promisesTest.py +7 -6
  153. toil/test/src/realtimeLoggerTest.py +10 -6
  154. toil/test/src/regularLogTest.py +71 -37
  155. toil/test/src/resourceTest.py +80 -49
  156. toil/test/src/restartDAGTest.py +36 -22
  157. toil/test/src/resumabilityTest.py +9 -2
  158. toil/test/src/retainTempDirTest.py +45 -14
  159. toil/test/src/systemTest.py +12 -8
  160. toil/test/src/threadingTest.py +44 -25
  161. toil/test/src/toilContextManagerTest.py +10 -7
  162. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  163. toil/test/src/workerTest.py +73 -23
  164. toil/test/utils/toilDebugTest.py +103 -33
  165. toil/test/utils/toilKillTest.py +4 -5
  166. toil/test/utils/utilsTest.py +245 -106
  167. toil/test/wdl/wdltoil_test.py +818 -149
  168. toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
  169. toil/toilState.py +120 -35
  170. toil/utils/toilConfig.py +13 -4
  171. toil/utils/toilDebugFile.py +44 -27
  172. toil/utils/toilDebugJob.py +214 -27
  173. toil/utils/toilDestroyCluster.py +11 -6
  174. toil/utils/toilKill.py +8 -3
  175. toil/utils/toilLaunchCluster.py +256 -140
  176. toil/utils/toilMain.py +37 -16
  177. toil/utils/toilRsyncCluster.py +32 -14
  178. toil/utils/toilSshCluster.py +49 -22
  179. toil/utils/toilStats.py +356 -273
  180. toil/utils/toilStatus.py +292 -139
  181. toil/utils/toilUpdateEC2Instances.py +3 -1
  182. toil/version.py +12 -12
  183. toil/wdl/utils.py +5 -5
  184. toil/wdl/wdltoil.py +3913 -1033
  185. toil/worker.py +367 -184
  186. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
  187. toil-8.0.0.dist-info/METADATA +173 -0
  188. toil-8.0.0.dist-info/RECORD +253 -0
  189. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
  190. toil-6.1.0a1.dist-info/METADATA +0 -125
  191. toil-6.1.0a1.dist-info/RECORD +0 -237
  192. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
  193. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
@@ -17,14 +17,19 @@ from abc import ABCMeta, abstractmethod
17
17
  from datetime import datetime
18
18
  from queue import Empty, Queue
19
19
  from threading import Lock, Thread
20
- from typing import Dict, List, Optional, Tuple, Union
20
+ from typing import Optional, Union
21
21
 
22
- from toil.batchSystems.abstractBatchSystem import (BatchJobExitReason,
23
- UpdatedBatchJobInfo)
22
+ from toil.batchSystems.abstractBatchSystem import (
23
+ BatchJobExitReason,
24
+ UpdatedBatchJobInfo,
25
+ )
24
26
  from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport
25
- from toil.bus import ExternalBatchIdMessage
26
- from toil.job import AcceleratorRequirement
27
+ from toil.bus import ExternalBatchIdMessage, get_job_kind
28
+ from toil.common import Config
29
+ from toil.job import AcceleratorRequirement, JobDescription
30
+ from toil.statsAndLogging import TRACE
27
31
  from toil.lib.misc import CalledProcessErrorStderr
32
+ from toil.lib.retry import DEFAULT_DELAYS, old_retry
28
33
 
29
34
  logger = logging.getLogger(__name__)
30
35
 
@@ -37,7 +42,15 @@ logger = logging.getLogger(__name__)
37
42
  # Unit name of the job
38
43
  # Environment dict for the job
39
44
  # Accelerator requirements for the job
40
- JobTuple = Tuple[int, float, int, str, str, Dict[str, str], List[AcceleratorRequirement]]
45
+ JobTuple = tuple[
46
+ int, float, int, str, str, dict[str, str], list[AcceleratorRequirement]
47
+ ]
48
+
49
+
50
+ class ExceededRetryAttempts(Exception):
51
+ def __init__(self):
52
+ super().__init__("Exceeded retry attempts talking to scheduler.")
53
+
41
54
 
42
55
  class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
43
56
  """
@@ -45,35 +58,51 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
45
58
  standard HPC cluster. By default auto-deployment is not implemented.
46
59
  """
47
60
 
48
- class Worker(Thread, metaclass=ABCMeta):
49
-
50
- def __init__(self, newJobsQueue: Queue, updatedJobsQueue: Queue, killQueue: Queue, killedJobsQueue: Queue, boss: 'AbstractGridEngineBatchSystem') -> None:
61
+ class GridEngineThreadException(Exception):
62
+ pass
63
+
64
+ class GridEngineThread(Thread, metaclass=ABCMeta):
65
+ def __init__(
66
+ self,
67
+ newJobsQueue: Queue,
68
+ updatedJobsQueue: Queue,
69
+ killQueue: Queue,
70
+ killedJobsQueue: Queue,
71
+ boss: "AbstractGridEngineBatchSystem",
72
+ ) -> None:
51
73
  """
52
- Abstract worker interface class. All instances are created with five
74
+ Abstract thread interface class. All instances are created with five
53
75
  initial arguments (below). Note the Queue instances passed are empty.
54
76
 
55
77
  :param newJobsQueue: a Queue of new (unsubmitted) jobs
56
78
  :param updatedJobsQueue: a Queue of jobs that have been updated
57
79
  :param killQueue: a Queue of active jobs that need to be killed
58
- :param killedJobsQueue: Queue of killed jobs for this worker
80
+ :param killedJobsQueue: Queue of killed jobs for this thread
59
81
  :param boss: the AbstractGridEngineBatchSystem instance that
60
- controls this AbstractGridEngineWorker
82
+ controls this GridEngineThread
61
83
 
62
84
  """
63
85
  Thread.__init__(self)
64
86
  self.boss = boss
65
- self.boss.config.statePollingWait = \
87
+ self.boss.config.statePollingWait = (
66
88
  self.boss.config.statePollingWait or self.boss.getWaitDuration()
89
+ )
90
+ self.boss.config.state_polling_timeout = (
91
+ self.boss.config.state_polling_timeout
92
+ or self.boss.config.statePollingWait * 10
93
+ )
67
94
  self.newJobsQueue = newJobsQueue
68
95
  self.updatedJobsQueue = updatedJobsQueue
69
96
  self.killQueue = killQueue
70
97
  self.killedJobsQueue = killedJobsQueue
71
- self.waitingJobs: List[JobTuple] = list()
98
+ self.waitingJobs: list[JobTuple] = list()
72
99
  self.runningJobs = set()
100
+ # TODO: Why do we need a lock for this? We have the GIL.
73
101
  self.runningJobsLock = Lock()
74
- self.batchJobIDs: Dict[int, str] = dict()
102
+ self.batchJobIDs: dict[int, str] = dict()
75
103
  self._checkOnJobsCache = None
76
104
  self._checkOnJobsTimestamp = None
105
+ self.exception = None
77
106
 
78
107
  def getBatchSystemID(self, jobID: int) -> str:
79
108
  """
@@ -107,25 +136,35 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
107
136
  """
108
137
  Create a new job with the given attributes.
109
138
 
110
- Implementation-specific; called by AbstractGridEngineWorker.run()
139
+ Implementation-specific; called by GridEngineThread.run()
111
140
  """
112
141
  activity = False
113
142
  # Load new job id if present:
114
143
  if newJob is not None:
115
144
  self.waitingJobs.append(newJob)
116
145
  # Launch jobs as necessary:
117
- while len(self.waitingJobs) > 0 and \
118
- len(self.runningJobs) < int(self.boss.config.max_jobs):
146
+ while len(self.waitingJobs) > 0 and len(self.runningJobs) < int(
147
+ self.boss.config.max_jobs
148
+ ):
119
149
  activity = True
120
- jobID, cpu, memory, command, jobName, environment, gpus = self.waitingJobs.pop(0)
121
-
150
+ jobID, cpu, memory, command, jobName, environment, gpus = (
151
+ self.waitingJobs.pop(0)
152
+ )
153
+ if self.boss.config.memory_is_product and cpu > 1:
154
+ memory = memory // cpu
122
155
  # prepare job submission command
123
- subLine = self.prepareSubmission(cpu, memory, jobID, command, jobName, environment, gpus)
156
+ subLine = self.prepareSubmission(
157
+ cpu, memory, jobID, command, jobName, environment, gpus
158
+ )
124
159
  logger.debug("Running %r", subLine)
125
160
  batchJobID = self.boss.with_retries(self.submitJob, subLine)
126
161
  if self.boss._outbox is not None:
127
- #JobID corresponds to the toil version of the jobID, dif from jobstore idea of the id, batchjobid is what we get from slurm
128
- self.boss._outbox.publish(ExternalBatchIdMessage(jobID, batchJobID, self.boss.__class__.__name__))
162
+ # JobID corresponds to the toil version of the jobID, dif from jobstore idea of the id, batchjobid is what we get from slurm
163
+ self.boss._outbox.publish(
164
+ ExternalBatchIdMessage(
165
+ jobID, batchJobID, self.boss.__class__.__name__
166
+ )
167
+ )
129
168
 
130
169
  logger.debug("Submitted job %s", str(batchJobID))
131
170
 
@@ -143,7 +182,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
143
182
 
144
183
  def killJobs(self):
145
184
  """
146
- Kill any running jobs within worker
185
+ Kill any running jobs within thread
147
186
  """
148
187
  killList = list()
149
188
  while True:
@@ -160,7 +199,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
160
199
  # Do the dirty job
161
200
  for jobID in list(killList):
162
201
  if jobID in self.runningJobs:
163
- logger.debug('Killing job: %s', jobID)
202
+ logger.debug("Killing job: %s", jobID)
164
203
 
165
204
  # this call should be implementation-specific, all other
166
205
  # code is redundant w/ other implementations
@@ -175,13 +214,17 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
175
214
  while killList:
176
215
  for jobID in list(killList):
177
216
  batchJobID = self.getBatchSystemID(jobID)
178
- if self.boss.with_retries(self.getJobExitCode, batchJobID) is not None:
179
- logger.debug('Adding jobID %s to killedJobsQueue', jobID)
217
+ exit_code = self.boss.with_retries(self.getJobExitCode, batchJobID)
218
+ if exit_code is not None:
219
+ logger.debug("Adding jobID %s to killedJobsQueue", jobID)
180
220
  self.killedJobsQueue.put(jobID)
181
221
  killList.remove(jobID)
182
222
  self.forgetJob(jobID)
183
223
  if len(killList) > 0:
184
- logger.warning("Some jobs weren't killed, trying again in %is.", self.boss.sleepSeconds())
224
+ logger.warning(
225
+ "Some jobs weren't killed, trying again in %is.",
226
+ self.boss.sleepSeconds(),
227
+ )
185
228
 
186
229
  return True
187
230
 
@@ -193,7 +236,9 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
193
236
  """
194
237
 
195
238
  if self._checkOnJobsTimestamp:
196
- time_since_last_check = (datetime.now() - self._checkOnJobsTimestamp).total_seconds()
239
+ time_since_last_check = (
240
+ datetime.now() - self._checkOnJobsTimestamp
241
+ ).total_seconds()
197
242
  if time_since_last_check < self.boss.config.statePollingWait:
198
243
  return self._checkOnJobsCache
199
244
 
@@ -201,47 +246,36 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
201
246
  running_job_list = list(self.runningJobs)
202
247
  batch_job_id_list = [self.getBatchSystemID(j) for j in running_job_list]
203
248
  if batch_job_id_list:
204
- try:
205
- # Get the statuses as a batch
206
- statuses = self.boss.with_retries(
207
- self.coalesce_job_exit_codes, batch_job_id_list
208
- )
209
- except NotImplementedError:
210
- # We have to get the statuses individually
211
- for running_job_id, batch_job_id in zip(running_job_list, batch_job_id_list):
212
- status = self.boss.with_retries(self.getJobExitCode, batch_job_id)
213
- activity = self._handle_job_status(
214
- running_job_id, status, activity
215
- )
216
- else:
217
- # We got the statuses as a batch
218
- for running_job_id, status in zip(running_job_list, statuses):
219
- activity = self._handle_job_status(
220
- running_job_id, status, activity
221
- )
249
+ # Get the statuses as a batch
250
+ statuses = self.boss.with_retries(
251
+ self.coalesce_job_exit_codes, batch_job_id_list
252
+ )
253
+ # We got the statuses as a batch
254
+ for running_job_id, status in zip(running_job_list, statuses):
255
+ activity = self._handle_job_status(running_job_id, status, activity)
222
256
 
223
257
  self._checkOnJobsCache = activity
224
258
  self._checkOnJobsTimestamp = datetime.now()
225
259
  return activity
226
260
 
227
261
  def _handle_job_status(
228
- self, job_id: int, status: Union[int, None], activity: bool
262
+ self,
263
+ job_id: int,
264
+ status: Union[int, tuple[int, Optional[BatchJobExitReason]], None],
265
+ activity: bool,
229
266
  ) -> bool:
230
267
  """
231
268
  Helper method for checkOnJobs to handle job statuses
232
269
  """
233
270
  if status is not None:
271
+ if isinstance(status, int):
272
+ code = status
273
+ reason = None
274
+ else:
275
+ code, reason = status
234
276
  self.updatedJobsQueue.put(
235
277
  UpdatedBatchJobInfo(
236
- jobID=job_id, exitStatus=status, exitReason=None, wallTime=None
237
- )
238
- )
239
- self.forgetJob(job_id)
240
- return True
241
- if status is not None and isinstance(status, BatchJobExitReason):
242
- self.updatedJobsQueue.put(
243
- UpdatedBatchJobInfo(
244
- jobID=job_id, exitStatus=1, exitReason=status, wallTime=None
278
+ jobID=job_id, exitStatus=code, exitReason=reason, wallTime=None
245
279
  )
246
280
  )
247
281
  self.forgetJob(job_id)
@@ -256,7 +290,9 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
256
290
  activity = True
257
291
  newJob = self.newJobsQueue.get()
258
292
  if newJob is None:
259
- logger.debug('Received queue sentinel.')
293
+ logger.debug("Received queue sentinel.")
294
+ # Send out kill signals before stopping
295
+ self.killJobs()
260
296
  return False
261
297
  if self.killJobs():
262
298
  activity = True
@@ -265,7 +301,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
265
301
  if self.checkOnJobs():
266
302
  activity = True
267
303
  if not activity:
268
- logger.debug('No activity, sleeping for %is', self.boss.sleepSeconds())
304
+ logger.log(TRACE, "No activity, sleeping for %is", self.boss.sleepSeconds())
269
305
  return True
270
306
 
271
307
  def run(self):
@@ -276,32 +312,47 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
276
312
  while self._runStep():
277
313
  pass
278
314
  except Exception as ex:
279
- logger.error("GridEngine like batch system failure", exc_info=ex)
280
- raise
281
-
282
- def coalesce_job_exit_codes(self, batch_job_id_list: list) -> list:
315
+ self.exception = ex
316
+ logger.error("GridEngine like batch system failure: %s", ex)
317
+ # don't raise exception as is_alive will still be set to false,
318
+ # signalling exception in the thread as we expect the thread to
319
+ # always be running for the duration of the workflow
320
+
321
+ def coalesce_job_exit_codes(
322
+ self, batch_job_id_list: list
323
+ ) -> list[Union[int, tuple[int, Optional[BatchJobExitReason]], None]]:
283
324
  """
284
- Returns exit codes for a list of jobs.
325
+ Returns exit codes and possibly exit reasons for a list of jobs, or None if they are running.
285
326
 
286
- Called by AbstractGridEngineWorker.checkOnJobs().
327
+ Called by GridEngineThread.checkOnJobs().
287
328
 
288
- This is an optional part of the interface. It should raise
289
- NotImplementedError if not actually implemented for a particular
290
- scheduler.
329
+ The default implementation falls back on self.getJobExitCode and polls each job individually
291
330
 
292
331
  :param string batch_job_id_list: List of batch system job ID
293
332
  """
294
- raise NotImplementedError()
333
+ statuses = []
334
+ try:
335
+ for batch_job_id in batch_job_id_list:
336
+ statuses.append(
337
+ self.boss.with_retries(self.getJobExitCode, batch_job_id)
338
+ )
339
+ except CalledProcessErrorStderr as err:
340
+ # This avoids the nested retry issue where we could issue n^2 retries when the backing scheduler somehow disappears
341
+ # We catch the internal retry exception and raise something else so the outer retry doesn't retry the entire function again
342
+ raise ExceededRetryAttempts() from err
343
+ return statuses
295
344
 
296
345
  @abstractmethod
297
- def prepareSubmission(self,
298
- cpu: int,
299
- memory: int,
300
- jobID: int,
301
- command: str,
302
- jobName: str,
303
- job_environment: Optional[Dict[str, str]] = None,
304
- gpus: Optional[int] = None) -> List[str]:
346
+ def prepareSubmission(
347
+ self,
348
+ cpu: int,
349
+ memory: int,
350
+ jobID: int,
351
+ command: str,
352
+ jobName: str,
353
+ job_environment: Optional[dict[str, str]] = None,
354
+ gpus: Optional[int] = None,
355
+ ) -> list[str]:
305
356
  """
306
357
  Preparation in putting together a command-line string
307
358
  for submitting to batch system (via submitJob().)
@@ -344,29 +395,35 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
344
395
  def killJob(self, jobID):
345
396
  """
346
397
  Kill specific job with the Toil job ID. Implementation-specific; called
347
- by AbstractGridEngineWorker.killJobs()
398
+ by GridEngineThread.killJobs()
348
399
 
349
400
  :param string jobID: Toil job ID
350
401
  """
351
402
  raise NotImplementedError()
352
403
 
353
404
  @abstractmethod
354
- def getJobExitCode(self, batchJobID):
405
+ def getJobExitCode(
406
+ self, batchJobID
407
+ ) -> Union[int, tuple[int, Optional[BatchJobExitReason]], None]:
355
408
  """
356
- Returns job exit code or an instance of abstractBatchSystem.BatchJobExitReason.
357
- if something else happened other than the job exiting.
358
- Implementation-specific; called by AbstractGridEngineWorker.checkOnJobs()
409
+ Returns job exit code and possibly an instance of abstractBatchSystem.BatchJobExitReason.
359
410
 
360
- :param string batchjobID: batch system job ID
411
+ Returns None if the job is still running.
412
+
413
+ If the job is not running but the exit code is not available, it
414
+ will be EXIT_STATUS_UNAVAILABLE_VALUE. Implementation-specific;
415
+ called by GridEngineThread.checkOnJobs().
361
416
 
362
- :rtype: int|toil.batchSystems.abstractBatchSystem.BatchJobExitReason: exit code int
363
- or BatchJobExitReason if something else happened other than job exiting.
417
+ The exit code will only be 0 if the job affirmatively succeeded.
418
+
419
+ :param string batchjobID: batch system job ID
364
420
  """
365
421
  raise NotImplementedError()
366
422
 
367
- def __init__(self, config, maxCores, maxMemory, maxDisk):
368
- super().__init__(
369
- config, maxCores, maxMemory, maxDisk)
423
+ def __init__(
424
+ self, config: Config, maxCores: float, maxMemory: int, maxDisk: int
425
+ ) -> None:
426
+ super().__init__(config, maxCores, maxMemory, maxDisk)
370
427
  self.config = config
371
428
 
372
429
  self.currentJobs = set()
@@ -375,43 +432,70 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
375
432
  self.updatedJobsQueue = Queue()
376
433
  self.killQueue = Queue()
377
434
  self.killedJobsQueue = Queue()
378
- # get the associated worker class here
379
- self.worker = self.Worker(self.newJobsQueue, self.updatedJobsQueue,
380
- self.killQueue, self.killedJobsQueue, self)
381
- self.worker.start()
435
+ # get the associated thread class here
436
+ self.background_thread = self.GridEngineThread(
437
+ self.newJobsQueue,
438
+ self.updatedJobsQueue,
439
+ self.killQueue,
440
+ self.killedJobsQueue,
441
+ self,
442
+ )
443
+ self.background_thread.start()
382
444
  self._getRunningBatchJobIDsTimestamp = None
383
445
  self._getRunningBatchJobIDsCache = {}
384
446
 
385
- @classmethod
386
- def supportsWorkerCleanup(cls):
387
- return False
388
-
389
447
  @classmethod
390
448
  def supportsAutoDeployment(cls):
391
449
  return False
392
450
 
393
- def issueBatchJob(self, jobDesc, job_environment: Optional[Dict[str, str]] = None):
394
- # Avoid submitting internal jobs to the batch queue, handle locally
395
- localID = self.handleLocalJob(jobDesc)
396
- if localID is not None:
397
- return localID
451
+ def count_needed_gpus(self, job_desc: JobDescription):
452
+ """
453
+ Count the number of cluster-allocateable GPUs we want to allocate for the given job.
454
+ """
455
+ gpus = 0
456
+ if isinstance(job_desc.accelerators, list):
457
+ for accelerator in job_desc.accelerators:
458
+ if accelerator["kind"] == "gpu":
459
+ gpus += accelerator["count"]
398
460
  else:
399
- self.check_resource_request(jobDesc)
400
- jobID = self.getNextJobID()
401
- self.currentJobs.add(jobID)
402
- gpus = 0
403
- if isinstance(jobDesc.accelerators, list):
404
- for accelerator in jobDesc.accelerators:
405
- if accelerator['kind'] == 'gpu':
406
- gpus = accelerator['count']
407
- else:
408
- gpus = jobDesc.accelerators
461
+ gpus = job_desc.accelerators
409
462
 
410
- self.newJobsQueue.put((jobID, jobDesc.cores, jobDesc.memory, jobDesc.command, jobDesc.get_job_kind(),
411
- job_environment, gpus))
412
- logger.debug("Issued the job command: %s with job id: %s and job name %s", jobDesc.command, str(jobID),
413
- jobDesc.get_job_kind())
414
- return jobID
463
+ return gpus
464
+
465
+ def issueBatchJob(
466
+ self,
467
+ command: str,
468
+ job_desc: JobDescription,
469
+ job_environment: Optional[dict[str, str]] = None,
470
+ ):
471
+ # Avoid submitting internal jobs to the batch queue, handle locally
472
+ local_id = self.handleLocalJob(command, job_desc)
473
+ if local_id is not None:
474
+ return local_id
475
+ else:
476
+ self.check_resource_request(job_desc)
477
+ gpus = self.count_needed_gpus(job_desc)
478
+ job_id = self.getNextJobID()
479
+ self.currentJobs.add(job_id)
480
+
481
+ self.newJobsQueue.put(
482
+ (
483
+ job_id,
484
+ job_desc.cores,
485
+ job_desc.memory,
486
+ command,
487
+ get_job_kind(job_desc.get_names()),
488
+ job_environment,
489
+ gpus,
490
+ )
491
+ )
492
+ logger.debug(
493
+ "Issued the job command: %s with job id: %s and job name %s",
494
+ command,
495
+ str(job_id),
496
+ get_job_kind(job_desc.get_names()),
497
+ )
498
+ return job_id
415
499
 
416
500
  def killBatchJobs(self, jobIDs):
417
501
  """
@@ -420,11 +504,18 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
420
504
  """
421
505
  self.killLocalJobs(jobIDs)
422
506
  jobIDs = set(jobIDs)
423
- logger.debug('Jobs to be killed: %r', jobIDs)
507
+ logger.debug("Jobs to be killed: %r", jobIDs)
424
508
  for jobID in jobIDs:
425
509
  self.killQueue.put(jobID)
426
510
  while jobIDs:
427
- killedJobId = self.killedJobsQueue.get()
511
+ try:
512
+ killedJobId = self.killedJobsQueue.get(timeout=10)
513
+ except Empty:
514
+ if not self.background_thread.is_alive():
515
+ raise self.GridEngineThreadException(
516
+ "Grid engine thread failed unexpectedly"
517
+ ) from self.background_thread.exception
518
+ continue
428
519
  if killedJobId is None:
429
520
  break
430
521
  jobIDs.remove(killedJobId)
@@ -434,8 +525,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
434
525
  if killedJobId in self.currentJobs:
435
526
  self.currentJobs.remove(killedJobId)
436
527
  if jobIDs:
437
- logger.debug('Some kills (%s) still pending, sleeping %is', len(jobIDs),
438
- self.sleepSeconds())
528
+ logger.debug(
529
+ "Some kills (%s) still pending, sleeping %is",
530
+ len(jobIDs),
531
+ self.sleepSeconds(),
532
+ )
439
533
 
440
534
  def getIssuedBatchJobIDs(self):
441
535
  """
@@ -450,13 +544,14 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
450
544
  Respects statePollingWait and will return cached results if not within
451
545
  time period to talk with the scheduler.
452
546
  """
453
- if (self._getRunningBatchJobIDsTimestamp and (
454
- datetime.now() -
455
- self._getRunningBatchJobIDsTimestamp).total_seconds() <
456
- self.config.statePollingWait):
547
+ if (
548
+ self._getRunningBatchJobIDsTimestamp
549
+ and (datetime.now() - self._getRunningBatchJobIDsTimestamp).total_seconds()
550
+ < self.config.statePollingWait
551
+ ):
457
552
  batchIds = self._getRunningBatchJobIDsCache
458
553
  else:
459
- batchIds = self.with_retries(self.worker.getRunningJobIDs)
554
+ batchIds = self.with_retries(self.background_thread.getRunningJobIDs)
460
555
  self._getRunningBatchJobIDsCache = batchIds
461
556
  self._getRunningBatchJobIDsTimestamp = datetime.now()
462
557
  batchIds.update(self.getRunningLocalJobIDs())
@@ -464,6 +559,13 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
464
559
 
465
560
  def getUpdatedBatchJob(self, maxWait):
466
561
  local_tuple = self.getUpdatedLocalJob(0)
562
+
563
+ if not self.background_thread.is_alive():
564
+ # kill remaining jobs on the thread
565
+ self.background_thread.killJobs()
566
+ raise self.GridEngineThreadException(
567
+ "Unexpected GridEngineThread failure"
568
+ ) from self.background_thread.exception
467
569
  if local_tuple:
468
570
  return local_tuple
469
571
  else:
@@ -471,24 +573,42 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
471
573
  item = self.updatedJobsQueue.get(timeout=maxWait)
472
574
  except Empty:
473
575
  return None
474
- logger.debug('UpdatedJobsQueue Item: %s', item)
576
+ logger.debug("UpdatedJobsQueue Item: %s", item)
475
577
  self.currentJobs.remove(item.jobID)
476
578
  return item
477
579
 
478
580
  def shutdown(self) -> None:
479
581
  """
480
- Signals worker to shutdown (via sentinel) then cleanly joins the thread
582
+ Signals thread to shutdown (via sentinel) then cleanly joins the thread
481
583
  """
584
+
585
+ for jobID in self.getIssuedBatchJobIDs():
586
+ # Send kill signals to any jobs that might be running
587
+ self.killQueue.put(jobID)
588
+
482
589
  self.shutdownLocal()
483
590
  newJobsQueue = self.newJobsQueue
484
591
  self.newJobsQueue = None
485
592
 
486
593
  newJobsQueue.put(None)
487
- self.worker.join()
594
+ self.background_thread.join()
595
+
596
+ # Now in one thread, kill all the jobs
597
+ if len(self.background_thread.runningJobs) > 0:
598
+ logger.warning(
599
+ "Cleaning up %s jobs still running at shutdown",
600
+ len(self.background_thread.runningJobs),
601
+ )
602
+ for job in self.background_thread.runningJobs:
603
+ self.killQueue.put(job)
604
+ self.background_thread.killJobs()
488
605
 
489
606
  def setEnv(self, name, value=None):
490
- if value and ',' in value:
491
- raise ValueError(type(self).__name__ + " does not support commata in environment variable values")
607
+ if value and "," in value:
608
+ raise ValueError(
609
+ type(self).__name__
610
+ + " does not support commata in environment variable values"
611
+ )
492
612
  return super().setEnv(name, value)
493
613
 
494
614
  @classmethod
@@ -496,28 +616,32 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
496
616
  return 1
497
617
 
498
618
  def sleepSeconds(self, sleeptime=1):
499
- """ Helper function to drop on all state-querying functions to avoid over-querying.
500
- """
619
+ """Helper function to drop on all state-querying functions to avoid over-querying."""
501
620
  time.sleep(sleeptime)
502
621
  return sleeptime
503
622
 
504
623
  def with_retries(self, operation, *args, **kwargs):
505
624
  """
506
- Call operation with args and kwargs. If one of the calls to an SGE
507
- command fails, sleep and try again for a set number of times.
625
+ Call operation with args and kwargs. If one of the calls to a
626
+ command fails, sleep and try again.
508
627
  """
509
- maxTries = 3
510
- tries = 0
511
- while True:
512
- tries += 1
513
- try:
514
- return operation(*args, **kwargs)
515
- except CalledProcessErrorStderr as err:
516
- if tries < maxTries:
517
- logger.error("Will retry errored operation %s, code %d: %s",
518
- operation.__name__, err.returncode, err.stderr)
519
- time.sleep(self.config.statePollingWait)
520
- else:
521
- logger.error("Failed operation %s, code %d: %s",
522
- operation.__name__, err.returncode, err.stderr)
628
+ for attempt in old_retry(
629
+ # Don't retry more often than the state polling wait.
630
+ delays=[
631
+ max(delay, self.config.statePollingWait) for delay in DEFAULT_DELAYS
632
+ ],
633
+ timeout=self.config.state_polling_timeout,
634
+ predicate=lambda e: isinstance(e, CalledProcessErrorStderr),
635
+ ):
636
+ with attempt:
637
+ try:
638
+ return operation(*args, **kwargs)
639
+ except CalledProcessErrorStderr as err:
640
+ logger.error(
641
+ "Errored operation %s, code %d: %s",
642
+ operation.__name__,
643
+ err.returncode,
644
+ err.stderr,
645
+ )
646
+ # Raise up to the retry logic, which will retry until timeout
523
647
  raise err