toil 7.0.0__py3-none-any.whl → 8.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (190)
  1. toil/__init__.py +121 -83
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +137 -77
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
  5. toil/batchSystems/awsBatch.py +237 -128
  6. toil/batchSystems/cleanup_support.py +22 -16
  7. toil/batchSystems/contained_executor.py +30 -26
  8. toil/batchSystems/gridengine.py +85 -49
  9. toil/batchSystems/htcondor.py +164 -87
  10. toil/batchSystems/kubernetes.py +622 -386
  11. toil/batchSystems/local_support.py +17 -12
  12. toil/batchSystems/lsf.py +132 -79
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +288 -149
  16. toil/batchSystems/mesos/executor.py +77 -49
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +38 -29
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +293 -123
  21. toil/batchSystems/slurm.py +489 -137
  22. toil/batchSystems/torque.py +46 -32
  23. toil/bus.py +141 -73
  24. toil/common.py +630 -359
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1114 -532
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +62 -41
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +88 -57
  32. toil/fileStores/cachingFileStore.py +711 -247
  33. toil/fileStores/nonCachingFileStore.py +113 -75
  34. toil/job.py +988 -315
  35. toil/jobStores/abstractJobStore.py +387 -243
  36. toil/jobStores/aws/jobStore.py +727 -403
  37. toil/jobStores/aws/utils.py +161 -109
  38. toil/jobStores/conftest.py +1 -0
  39. toil/jobStores/fileJobStore.py +289 -151
  40. toil/jobStores/googleJobStore.py +137 -70
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +614 -269
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +55 -28
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +193 -58
  49. toil/lib/aws/utils.py +238 -218
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +83 -49
  53. toil/lib/docker.py +131 -103
  54. toil/lib/ec2.py +322 -209
  55. toil/lib/ec2nodes.py +174 -106
  56. toil/lib/encryption/_dummy.py +5 -3
  57. toil/lib/encryption/_nacl.py +10 -6
  58. toil/lib/encryption/conftest.py +1 -0
  59. toil/lib/exceptions.py +26 -7
  60. toil/lib/expando.py +4 -2
  61. toil/lib/ftp_utils.py +217 -0
  62. toil/lib/generatedEC2Lists.py +127 -19
  63. toil/lib/humanize.py +6 -2
  64. toil/lib/integration.py +341 -0
  65. toil/lib/io.py +99 -11
  66. toil/lib/iterables.py +4 -2
  67. toil/lib/memoize.py +12 -8
  68. toil/lib/misc.py +65 -18
  69. toil/lib/objects.py +2 -2
  70. toil/lib/resources.py +19 -7
  71. toil/lib/retry.py +115 -77
  72. toil/lib/threading.py +282 -80
  73. toil/lib/throttle.py +15 -14
  74. toil/options/common.py +834 -401
  75. toil/options/cwl.py +175 -90
  76. toil/options/runner.py +50 -0
  77. toil/options/wdl.py +70 -19
  78. toil/provisioners/__init__.py +111 -46
  79. toil/provisioners/abstractProvisioner.py +322 -157
  80. toil/provisioners/aws/__init__.py +62 -30
  81. toil/provisioners/aws/awsProvisioner.py +980 -627
  82. toil/provisioners/clusterScaler.py +541 -279
  83. toil/provisioners/gceProvisioner.py +282 -179
  84. toil/provisioners/node.py +147 -79
  85. toil/realtimeLogger.py +34 -22
  86. toil/resource.py +137 -75
  87. toil/server/app.py +127 -61
  88. toil/server/celery_app.py +3 -1
  89. toil/server/cli/wes_cwl_runner.py +82 -53
  90. toil/server/utils.py +54 -28
  91. toil/server/wes/abstract_backend.py +64 -26
  92. toil/server/wes/amazon_wes_utils.py +21 -15
  93. toil/server/wes/tasks.py +121 -63
  94. toil/server/wes/toil_backend.py +142 -107
  95. toil/server/wsgi_app.py +4 -3
  96. toil/serviceManager.py +58 -22
  97. toil/statsAndLogging.py +148 -64
  98. toil/test/__init__.py +263 -179
  99. toil/test/batchSystems/batchSystemTest.py +438 -195
  100. toil/test/batchSystems/batch_system_plugin_test.py +18 -7
  101. toil/test/batchSystems/test_gridengine.py +173 -0
  102. toil/test/batchSystems/test_lsf_helper.py +67 -58
  103. toil/test/batchSystems/test_slurm.py +93 -47
  104. toil/test/cactus/test_cactus_integration.py +20 -22
  105. toil/test/cwl/cwlTest.py +271 -71
  106. toil/test/cwl/measure_default_memory.cwl +12 -0
  107. toil/test/cwl/not_run_required_input.cwl +29 -0
  108. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  109. toil/test/docs/scriptsTest.py +60 -34
  110. toil/test/jobStores/jobStoreTest.py +412 -235
  111. toil/test/lib/aws/test_iam.py +116 -48
  112. toil/test/lib/aws/test_s3.py +16 -9
  113. toil/test/lib/aws/test_utils.py +5 -6
  114. toil/test/lib/dockerTest.py +118 -141
  115. toil/test/lib/test_conversions.py +113 -115
  116. toil/test/lib/test_ec2.py +57 -49
  117. toil/test/lib/test_integration.py +104 -0
  118. toil/test/lib/test_misc.py +12 -5
  119. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  120. toil/test/mesos/helloWorld.py +7 -6
  121. toil/test/mesos/stress.py +25 -20
  122. toil/test/options/options.py +7 -2
  123. toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
  124. toil/test/provisioners/clusterScalerTest.py +440 -250
  125. toil/test/provisioners/clusterTest.py +81 -42
  126. toil/test/provisioners/gceProvisionerTest.py +174 -100
  127. toil/test/provisioners/provisionerTest.py +25 -13
  128. toil/test/provisioners/restartScript.py +5 -4
  129. toil/test/server/serverTest.py +188 -141
  130. toil/test/sort/restart_sort.py +137 -68
  131. toil/test/sort/sort.py +134 -66
  132. toil/test/sort/sortTest.py +91 -49
  133. toil/test/src/autoDeploymentTest.py +140 -100
  134. toil/test/src/busTest.py +20 -18
  135. toil/test/src/checkpointTest.py +8 -2
  136. toil/test/src/deferredFunctionTest.py +49 -35
  137. toil/test/src/dockerCheckTest.py +33 -26
  138. toil/test/src/environmentTest.py +20 -10
  139. toil/test/src/fileStoreTest.py +538 -271
  140. toil/test/src/helloWorldTest.py +7 -4
  141. toil/test/src/importExportFileTest.py +61 -31
  142. toil/test/src/jobDescriptionTest.py +32 -17
  143. toil/test/src/jobEncapsulationTest.py +2 -0
  144. toil/test/src/jobFileStoreTest.py +74 -50
  145. toil/test/src/jobServiceTest.py +187 -73
  146. toil/test/src/jobTest.py +120 -70
  147. toil/test/src/miscTests.py +19 -18
  148. toil/test/src/promisedRequirementTest.py +82 -36
  149. toil/test/src/promisesTest.py +7 -6
  150. toil/test/src/realtimeLoggerTest.py +6 -6
  151. toil/test/src/regularLogTest.py +71 -37
  152. toil/test/src/resourceTest.py +80 -49
  153. toil/test/src/restartDAGTest.py +36 -22
  154. toil/test/src/resumabilityTest.py +9 -2
  155. toil/test/src/retainTempDirTest.py +45 -14
  156. toil/test/src/systemTest.py +12 -8
  157. toil/test/src/threadingTest.py +44 -25
  158. toil/test/src/toilContextManagerTest.py +10 -7
  159. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  160. toil/test/src/workerTest.py +33 -16
  161. toil/test/utils/toilDebugTest.py +70 -58
  162. toil/test/utils/toilKillTest.py +4 -5
  163. toil/test/utils/utilsTest.py +239 -102
  164. toil/test/wdl/wdltoil_test.py +789 -148
  165. toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
  166. toil/toilState.py +52 -26
  167. toil/utils/toilConfig.py +13 -4
  168. toil/utils/toilDebugFile.py +44 -27
  169. toil/utils/toilDebugJob.py +85 -25
  170. toil/utils/toilDestroyCluster.py +11 -6
  171. toil/utils/toilKill.py +8 -3
  172. toil/utils/toilLaunchCluster.py +251 -145
  173. toil/utils/toilMain.py +37 -16
  174. toil/utils/toilRsyncCluster.py +27 -14
  175. toil/utils/toilSshCluster.py +45 -22
  176. toil/utils/toilStats.py +75 -36
  177. toil/utils/toilStatus.py +226 -119
  178. toil/utils/toilUpdateEC2Instances.py +3 -1
  179. toil/version.py +11 -11
  180. toil/wdl/utils.py +5 -5
  181. toil/wdl/wdltoil.py +3513 -1052
  182. toil/worker.py +269 -128
  183. toil-8.0.0.dist-info/METADATA +173 -0
  184. toil-8.0.0.dist-info/RECORD +253 -0
  185. {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
  186. toil-7.0.0.dist-info/METADATA +0 -158
  187. toil-7.0.0.dist-info/RECORD +0 -244
  188. {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/LICENSE +0 -0
  189. {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
  190. {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
@@ -17,15 +17,19 @@ from abc import ABCMeta, abstractmethod
17
17
  from datetime import datetime
18
18
  from queue import Empty, Queue
19
19
  from threading import Lock, Thread
20
- from typing import Dict, List, Optional, Tuple, Union
20
+ from typing import Optional, Union
21
21
 
22
- from toil.batchSystems.abstractBatchSystem import (BatchJobExitReason,
23
- UpdatedBatchJobInfo)
22
+ from toil.batchSystems.abstractBatchSystem import (
23
+ BatchJobExitReason,
24
+ UpdatedBatchJobInfo,
25
+ )
24
26
  from toil.batchSystems.cleanup_support import BatchSystemCleanupSupport
25
27
  from toil.bus import ExternalBatchIdMessage, get_job_kind
26
- from toil.job import AcceleratorRequirement
28
+ from toil.common import Config
29
+ from toil.job import AcceleratorRequirement, JobDescription
30
+ from toil.statsAndLogging import TRACE
27
31
  from toil.lib.misc import CalledProcessErrorStderr
28
- from toil.lib.retry import old_retry, DEFAULT_DELAYS
32
+ from toil.lib.retry import DEFAULT_DELAYS, old_retry
29
33
 
30
34
  logger = logging.getLogger(__name__)
31
35
 
@@ -38,18 +42,34 @@ logger = logging.getLogger(__name__)
38
42
  # Unit name of the job
39
43
  # Environment dict for the job
40
44
  # Accelerator requirements for the job
41
- JobTuple = Tuple[int, float, int, str, str, Dict[str, str], List[AcceleratorRequirement]]
45
+ JobTuple = tuple[
46
+ int, float, int, str, str, dict[str, str], list[AcceleratorRequirement]
47
+ ]
48
+
49
+
50
+ class ExceededRetryAttempts(Exception):
51
+ def __init__(self):
52
+ super().__init__("Exceeded retry attempts talking to scheduler.")
53
+
42
54
 
43
55
  class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
44
56
  """
45
57
  A partial implementation of BatchSystemSupport for batch systems run on a
46
58
  standard HPC cluster. By default auto-deployment is not implemented.
47
59
  """
60
+
48
61
  class GridEngineThreadException(Exception):
49
62
  pass
50
63
 
51
64
  class GridEngineThread(Thread, metaclass=ABCMeta):
52
- def __init__(self, newJobsQueue: Queue, updatedJobsQueue: Queue, killQueue: Queue, killedJobsQueue: Queue, boss: 'AbstractGridEngineBatchSystem') -> None:
65
+ def __init__(
66
+ self,
67
+ newJobsQueue: Queue,
68
+ updatedJobsQueue: Queue,
69
+ killQueue: Queue,
70
+ killedJobsQueue: Queue,
71
+ boss: "AbstractGridEngineBatchSystem",
72
+ ) -> None:
53
73
  """
54
74
  Abstract thread interface class. All instances are created with five
55
75
  initial arguments (below). Note the Queue instances passed are empty.
@@ -64,18 +84,22 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
64
84
  """
65
85
  Thread.__init__(self)
66
86
  self.boss = boss
67
- self.boss.config.statePollingWait = \
87
+ self.boss.config.statePollingWait = (
68
88
  self.boss.config.statePollingWait or self.boss.getWaitDuration()
69
- self.boss.config.state_polling_timeout = \
70
- self.boss.config.state_polling_timeout or self.boss.config.statePollingWait * 10
89
+ )
90
+ self.boss.config.state_polling_timeout = (
91
+ self.boss.config.state_polling_timeout
92
+ or self.boss.config.statePollingWait * 10
93
+ )
71
94
  self.newJobsQueue = newJobsQueue
72
95
  self.updatedJobsQueue = updatedJobsQueue
73
96
  self.killQueue = killQueue
74
97
  self.killedJobsQueue = killedJobsQueue
75
- self.waitingJobs: List[JobTuple] = list()
98
+ self.waitingJobs: list[JobTuple] = list()
76
99
  self.runningJobs = set()
100
+ # TODO: Why do we need a lock for this? We have the GIL.
77
101
  self.runningJobsLock = Lock()
78
- self.batchJobIDs: Dict[int, str] = dict()
102
+ self.batchJobIDs: dict[int, str] = dict()
79
103
  self._checkOnJobsCache = None
80
104
  self._checkOnJobsTimestamp = None
81
105
  self.exception = None
@@ -119,18 +143,28 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
119
143
  if newJob is not None:
120
144
  self.waitingJobs.append(newJob)
121
145
  # Launch jobs as necessary:
122
- while len(self.waitingJobs) > 0 and \
123
- len(self.runningJobs) < int(self.boss.config.max_jobs):
146
+ while len(self.waitingJobs) > 0 and len(self.runningJobs) < int(
147
+ self.boss.config.max_jobs
148
+ ):
124
149
  activity = True
125
- jobID, cpu, memory, command, jobName, environment, gpus = self.waitingJobs.pop(0)
126
-
150
+ jobID, cpu, memory, command, jobName, environment, gpus = (
151
+ self.waitingJobs.pop(0)
152
+ )
153
+ if self.boss.config.memory_is_product and cpu > 1:
154
+ memory = memory // cpu
127
155
  # prepare job submission command
128
- subLine = self.prepareSubmission(cpu, memory, jobID, command, jobName, environment, gpus)
156
+ subLine = self.prepareSubmission(
157
+ cpu, memory, jobID, command, jobName, environment, gpus
158
+ )
129
159
  logger.debug("Running %r", subLine)
130
160
  batchJobID = self.boss.with_retries(self.submitJob, subLine)
131
161
  if self.boss._outbox is not None:
132
- #JobID corresponds to the toil version of the jobID, dif from jobstore idea of the id, batchjobid is what we get from slurm
133
- self.boss._outbox.publish(ExternalBatchIdMessage(jobID, batchJobID, self.boss.__class__.__name__))
162
+ # JobID corresponds to the toil version of the jobID, dif from jobstore idea of the id, batchjobid is what we get from slurm
163
+ self.boss._outbox.publish(
164
+ ExternalBatchIdMessage(
165
+ jobID, batchJobID, self.boss.__class__.__name__
166
+ )
167
+ )
134
168
 
135
169
  logger.debug("Submitted job %s", str(batchJobID))
136
170
 
@@ -165,7 +199,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
165
199
  # Do the dirty job
166
200
  for jobID in list(killList):
167
201
  if jobID in self.runningJobs:
168
- logger.debug('Killing job: %s', jobID)
202
+ logger.debug("Killing job: %s", jobID)
169
203
 
170
204
  # this call should be implementation-specific, all other
171
205
  # code is redundant w/ other implementations
@@ -182,12 +216,15 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
182
216
  batchJobID = self.getBatchSystemID(jobID)
183
217
  exit_code = self.boss.with_retries(self.getJobExitCode, batchJobID)
184
218
  if exit_code is not None:
185
- logger.debug('Adding jobID %s to killedJobsQueue', jobID)
219
+ logger.debug("Adding jobID %s to killedJobsQueue", jobID)
186
220
  self.killedJobsQueue.put(jobID)
187
221
  killList.remove(jobID)
188
222
  self.forgetJob(jobID)
189
223
  if len(killList) > 0:
190
- logger.warning("Some jobs weren't killed, trying again in %is.", self.boss.sleepSeconds())
224
+ logger.warning(
225
+ "Some jobs weren't killed, trying again in %is.",
226
+ self.boss.sleepSeconds(),
227
+ )
191
228
 
192
229
  return True
193
230
 
@@ -199,7 +236,9 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
199
236
  """
200
237
 
201
238
  if self._checkOnJobsTimestamp:
202
- time_since_last_check = (datetime.now() - self._checkOnJobsTimestamp).total_seconds()
239
+ time_since_last_check = (
240
+ datetime.now() - self._checkOnJobsTimestamp
241
+ ).total_seconds()
203
242
  if time_since_last_check < self.boss.config.statePollingWait:
204
243
  return self._checkOnJobsCache
205
244
 
@@ -207,31 +246,23 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
207
246
  running_job_list = list(self.runningJobs)
208
247
  batch_job_id_list = [self.getBatchSystemID(j) for j in running_job_list]
209
248
  if batch_job_id_list:
210
- try:
211
- # Get the statuses as a batch
212
- statuses = self.boss.with_retries(
213
- self.coalesce_job_exit_codes, batch_job_id_list
214
- )
215
- except NotImplementedError:
216
- # We have to get the statuses individually
217
- for running_job_id, batch_job_id in zip(running_job_list, batch_job_id_list):
218
- status = self.boss.with_retries(self.getJobExitCode, batch_job_id)
219
- activity = self._handle_job_status(
220
- running_job_id, status, activity
221
- )
222
- else:
223
- # We got the statuses as a batch
224
- for running_job_id, status in zip(running_job_list, statuses):
225
- activity = self._handle_job_status(
226
- running_job_id, status, activity
227
- )
249
+ # Get the statuses as a batch
250
+ statuses = self.boss.with_retries(
251
+ self.coalesce_job_exit_codes, batch_job_id_list
252
+ )
253
+ # We got the statuses as a batch
254
+ for running_job_id, status in zip(running_job_list, statuses):
255
+ activity = self._handle_job_status(running_job_id, status, activity)
228
256
 
229
257
  self._checkOnJobsCache = activity
230
258
  self._checkOnJobsTimestamp = datetime.now()
231
259
  return activity
232
260
 
233
261
  def _handle_job_status(
234
- self, job_id: int, status: Union[int, Tuple[int, Optional[BatchJobExitReason]], None], activity: bool
262
+ self,
263
+ job_id: int,
264
+ status: Union[int, tuple[int, Optional[BatchJobExitReason]], None],
265
+ activity: bool,
235
266
  ) -> bool:
236
267
  """
237
268
  Helper method for checkOnJobs to handle job statuses
@@ -259,7 +290,9 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
259
290
  activity = True
260
291
  newJob = self.newJobsQueue.get()
261
292
  if newJob is None:
262
- logger.debug('Received queue sentinel.')
293
+ logger.debug("Received queue sentinel.")
294
+ # Send out kill signals before stopping
295
+ self.killJobs()
263
296
  return False
264
297
  if self.killJobs():
265
298
  activity = True
@@ -268,7 +301,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
268
301
  if self.checkOnJobs():
269
302
  activity = True
270
303
  if not activity:
271
- logger.debug('No activity, sleeping for %is', self.boss.sleepSeconds())
304
+ logger.log(TRACE, "No activity, sleeping for %is", self.boss.sleepSeconds())
272
305
  return True
273
306
 
274
307
  def run(self):
@@ -285,29 +318,41 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
285
318
  # signalling exception in the thread as we expect the thread to
286
319
  # always be running for the duration of the workflow
287
320
 
288
- def coalesce_job_exit_codes(self, batch_job_id_list: list) -> List[Union[int, Tuple[int, Optional[BatchJobExitReason]], None]]:
321
+ def coalesce_job_exit_codes(
322
+ self, batch_job_id_list: list
323
+ ) -> list[Union[int, tuple[int, Optional[BatchJobExitReason]], None]]:
289
324
  """
290
325
  Returns exit codes and possibly exit reasons for a list of jobs, or None if they are running.
291
326
 
292
327
  Called by GridEngineThread.checkOnJobs().
293
328
 
294
- This is an optional part of the interface. It should raise
295
- NotImplementedError if not actually implemented for a particular
296
- scheduler.
329
+ The default implementation falls back on self.getJobExitCode and polls each job individually
297
330
 
298
331
  :param string batch_job_id_list: List of batch system job ID
299
332
  """
300
- raise NotImplementedError()
333
+ statuses = []
334
+ try:
335
+ for batch_job_id in batch_job_id_list:
336
+ statuses.append(
337
+ self.boss.with_retries(self.getJobExitCode, batch_job_id)
338
+ )
339
+ except CalledProcessErrorStderr as err:
340
+ # This avoids the nested retry issue where we could issue n^2 retries when the backing scheduler somehow disappears
341
+ # We catch the internal retry exception and raise something else so the outer retry doesn't retry the entire function again
342
+ raise ExceededRetryAttempts() from err
343
+ return statuses
301
344
 
302
345
  @abstractmethod
303
- def prepareSubmission(self,
304
- cpu: int,
305
- memory: int,
306
- jobID: int,
307
- command: str,
308
- jobName: str,
309
- job_environment: Optional[Dict[str, str]] = None,
310
- gpus: Optional[int] = None) -> List[str]:
346
+ def prepareSubmission(
347
+ self,
348
+ cpu: int,
349
+ memory: int,
350
+ jobID: int,
351
+ command: str,
352
+ jobName: str,
353
+ job_environment: Optional[dict[str, str]] = None,
354
+ gpus: Optional[int] = None,
355
+ ) -> list[str]:
311
356
  """
312
357
  Preparation in putting together a command-line string
313
358
  for submitting to batch system (via submitJob().)
@@ -357,7 +402,9 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
357
402
  raise NotImplementedError()
358
403
 
359
404
  @abstractmethod
360
- def getJobExitCode(self, batchJobID) -> Union[int, Tuple[int, Optional[BatchJobExitReason]], None]:
405
+ def getJobExitCode(
406
+ self, batchJobID
407
+ ) -> Union[int, tuple[int, Optional[BatchJobExitReason]], None]:
361
408
  """
362
409
  Returns job exit code and possibly an instance of abstractBatchSystem.BatchJobExitReason.
363
410
 
@@ -373,9 +420,10 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
373
420
  """
374
421
  raise NotImplementedError()
375
422
 
376
- def __init__(self, config, maxCores, maxMemory, maxDisk):
377
- super().__init__(
378
- config, maxCores, maxMemory, maxDisk)
423
+ def __init__(
424
+ self, config: Config, maxCores: float, maxMemory: int, maxDisk: int
425
+ ) -> None:
426
+ super().__init__(config, maxCores, maxMemory, maxDisk)
379
427
  self.config = config
380
428
 
381
429
  self.currentJobs = set()
@@ -385,8 +433,13 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
385
433
  self.killQueue = Queue()
386
434
  self.killedJobsQueue = Queue()
387
435
  # get the associated thread class here
388
- self.background_thread = self.GridEngineThread(self.newJobsQueue, self.updatedJobsQueue,
389
- self.killQueue, self.killedJobsQueue, self)
436
+ self.background_thread = self.GridEngineThread(
437
+ self.newJobsQueue,
438
+ self.updatedJobsQueue,
439
+ self.killQueue,
440
+ self.killedJobsQueue,
441
+ self,
442
+ )
390
443
  self.background_thread.start()
391
444
  self._getRunningBatchJobIDsTimestamp = None
392
445
  self._getRunningBatchJobIDsCache = {}
@@ -395,28 +448,54 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
395
448
  def supportsAutoDeployment(cls):
396
449
  return False
397
450
 
398
- def issueBatchJob(self, command: str, jobDesc, job_environment: Optional[Dict[str, str]] = None):
451
+ def count_needed_gpus(self, job_desc: JobDescription):
452
+ """
453
+ Count the number of cluster-allocateable GPUs we want to allocate for the given job.
454
+ """
455
+ gpus = 0
456
+ if isinstance(job_desc.accelerators, list):
457
+ for accelerator in job_desc.accelerators:
458
+ if accelerator["kind"] == "gpu":
459
+ gpus += accelerator["count"]
460
+ else:
461
+ gpus = job_desc.accelerators
462
+
463
+ return gpus
464
+
465
+ def issueBatchJob(
466
+ self,
467
+ command: str,
468
+ job_desc: JobDescription,
469
+ job_environment: Optional[dict[str, str]] = None,
470
+ ):
399
471
  # Avoid submitting internal jobs to the batch queue, handle locally
400
- localID = self.handleLocalJob(command, jobDesc)
401
- if localID is not None:
402
- return localID
472
+ local_id = self.handleLocalJob(command, job_desc)
473
+ if local_id is not None:
474
+ return local_id
403
475
  else:
404
- self.check_resource_request(jobDesc)
405
- jobID = self.getNextJobID()
406
- self.currentJobs.add(jobID)
407
- gpus = 0
408
- if isinstance(jobDesc.accelerators, list):
409
- for accelerator in jobDesc.accelerators:
410
- if accelerator['kind'] == 'gpu':
411
- gpus = accelerator['count']
412
- else:
413
- gpus = jobDesc.accelerators
414
-
415
- self.newJobsQueue.put((jobID, jobDesc.cores, jobDesc.memory, command, get_job_kind(jobDesc.get_names()),
416
- job_environment, gpus))
417
- logger.debug("Issued the job command: %s with job id: %s and job name %s", command, str(jobID),
418
- get_job_kind(jobDesc.get_names()))
419
- return jobID
476
+ self.check_resource_request(job_desc)
477
+ gpus = self.count_needed_gpus(job_desc)
478
+ job_id = self.getNextJobID()
479
+ self.currentJobs.add(job_id)
480
+
481
+ self.newJobsQueue.put(
482
+ (
483
+ job_id,
484
+ job_desc.cores,
485
+ job_desc.memory,
486
+ command,
487
+ get_job_kind(job_desc.get_names()),
488
+ job_environment,
489
+ gpus,
490
+ )
491
+ )
492
+ logger.debug(
493
+ "Issued the job command: %s with job id: %s and job name %s",
494
+ command,
495
+ str(job_id),
496
+ get_job_kind(job_desc.get_names()),
497
+ )
498
+ return job_id
420
499
 
421
500
  def killBatchJobs(self, jobIDs):
422
501
  """
@@ -425,7 +504,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
425
504
  """
426
505
  self.killLocalJobs(jobIDs)
427
506
  jobIDs = set(jobIDs)
428
- logger.debug('Jobs to be killed: %r', jobIDs)
507
+ logger.debug("Jobs to be killed: %r", jobIDs)
429
508
  for jobID in jobIDs:
430
509
  self.killQueue.put(jobID)
431
510
  while jobIDs:
@@ -433,7 +512,9 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
433
512
  killedJobId = self.killedJobsQueue.get(timeout=10)
434
513
  except Empty:
435
514
  if not self.background_thread.is_alive():
436
- raise self.GridEngineThreadException("Grid engine thread failed unexpectedly") from self.background_thread.exception
515
+ raise self.GridEngineThreadException(
516
+ "Grid engine thread failed unexpectedly"
517
+ ) from self.background_thread.exception
437
518
  continue
438
519
  if killedJobId is None:
439
520
  break
@@ -444,8 +525,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
444
525
  if killedJobId in self.currentJobs:
445
526
  self.currentJobs.remove(killedJobId)
446
527
  if jobIDs:
447
- logger.debug('Some kills (%s) still pending, sleeping %is', len(jobIDs),
448
- self.sleepSeconds())
528
+ logger.debug(
529
+ "Some kills (%s) still pending, sleeping %is",
530
+ len(jobIDs),
531
+ self.sleepSeconds(),
532
+ )
449
533
 
450
534
  def getIssuedBatchJobIDs(self):
451
535
  """
@@ -460,10 +544,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
460
544
  Respects statePollingWait and will return cached results if not within
461
545
  time period to talk with the scheduler.
462
546
  """
463
- if (self._getRunningBatchJobIDsTimestamp and (
464
- datetime.now() -
465
- self._getRunningBatchJobIDsTimestamp).total_seconds() <
466
- self.config.statePollingWait):
547
+ if (
548
+ self._getRunningBatchJobIDsTimestamp
549
+ and (datetime.now() - self._getRunningBatchJobIDsTimestamp).total_seconds()
550
+ < self.config.statePollingWait
551
+ ):
467
552
  batchIds = self._getRunningBatchJobIDsCache
468
553
  else:
469
554
  batchIds = self.with_retries(self.background_thread.getRunningJobIDs)
@@ -478,7 +563,9 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
478
563
  if not self.background_thread.is_alive():
479
564
  # kill remaining jobs on the thread
480
565
  self.background_thread.killJobs()
481
- raise self.GridEngineThreadException("Unexpected GridEngineThread failure") from self.background_thread.exception
566
+ raise self.GridEngineThreadException(
567
+ "Unexpected GridEngineThread failure"
568
+ ) from self.background_thread.exception
482
569
  if local_tuple:
483
570
  return local_tuple
484
571
  else:
@@ -486,7 +573,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
486
573
  item = self.updatedJobsQueue.get(timeout=maxWait)
487
574
  except Empty:
488
575
  return None
489
- logger.debug('UpdatedJobsQueue Item: %s', item)
576
+ logger.debug("UpdatedJobsQueue Item: %s", item)
490
577
  self.currentJobs.remove(item.jobID)
491
578
  return item
492
579
 
@@ -494,6 +581,11 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
494
581
  """
495
582
  Signals thread to shutdown (via sentinel) then cleanly joins the thread
496
583
  """
584
+
585
+ for jobID in self.getIssuedBatchJobIDs():
586
+ # Send kill signals to any jobs that might be running
587
+ self.killQueue.put(jobID)
588
+
497
589
  self.shutdownLocal()
498
590
  newJobsQueue = self.newJobsQueue
499
591
  self.newJobsQueue = None
@@ -501,9 +593,22 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
501
593
  newJobsQueue.put(None)
502
594
  self.background_thread.join()
503
595
 
596
+ # Now in one thread, kill all the jobs
597
+ if len(self.background_thread.runningJobs) > 0:
598
+ logger.warning(
599
+ "Cleaning up %s jobs still running at shutdown",
600
+ len(self.background_thread.runningJobs),
601
+ )
602
+ for job in self.background_thread.runningJobs:
603
+ self.killQueue.put(job)
604
+ self.background_thread.killJobs()
605
+
504
606
  def setEnv(self, name, value=None):
505
- if value and ',' in value:
506
- raise ValueError(type(self).__name__ + " does not support commata in environment variable values")
607
+ if value and "," in value:
608
+ raise ValueError(
609
+ type(self).__name__
610
+ + " does not support commata in environment variable values"
611
+ )
507
612
  return super().setEnv(name, value)
508
613
 
509
614
  @classmethod
@@ -511,8 +616,7 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
511
616
  return 1
512
617
 
513
618
  def sleepSeconds(self, sleeptime=1):
514
- """ Helper function to drop on all state-querying functions to avoid over-querying.
515
- """
619
+ """Helper function to drop on all state-querying functions to avoid over-querying."""
516
620
  time.sleep(sleeptime)
517
621
  return sleeptime
518
622
 
@@ -523,15 +627,21 @@ class AbstractGridEngineBatchSystem(BatchSystemCleanupSupport):
523
627
  """
524
628
  for attempt in old_retry(
525
629
  # Don't retry more often than the state polling wait.
526
- delays=[max(delay, self.config.statePollingWait) for delay in DEFAULT_DELAYS],
630
+ delays=[
631
+ max(delay, self.config.statePollingWait) for delay in DEFAULT_DELAYS
632
+ ],
527
633
  timeout=self.config.state_polling_timeout,
528
- predicate=lambda e: isinstance(e, CalledProcessErrorStderr)
634
+ predicate=lambda e: isinstance(e, CalledProcessErrorStderr),
529
635
  ):
530
636
  with attempt:
531
637
  try:
532
638
  return operation(*args, **kwargs)
533
639
  except CalledProcessErrorStderr as err:
534
- logger.error("Errored operation %s, code %d: %s",
535
- operation.__name__, err.returncode, err.stderr)
640
+ logger.error(
641
+ "Errored operation %s, code %d: %s",
642
+ operation.__name__,
643
+ err.returncode,
644
+ err.stderr,
645
+ )
536
646
  # Raise up to the retry logic, which will retry until timeout
537
647
  raise err