toil-6.1.0a1-py3-none-any.whl → toil-8.0.0-py3-none-any.whl

This diff compares two publicly released versions of the package as published to their public registry. It is provided for informational purposes only and reflects the packages as they appear in that registry.
Files changed (193)
  1. toil/__init__.py +122 -315
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +173 -89
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
  5. toil/batchSystems/awsBatch.py +244 -135
  6. toil/batchSystems/cleanup_support.py +26 -16
  7. toil/batchSystems/contained_executor.py +31 -28
  8. toil/batchSystems/gridengine.py +86 -50
  9. toil/batchSystems/htcondor.py +166 -89
  10. toil/batchSystems/kubernetes.py +632 -382
  11. toil/batchSystems/local_support.py +20 -15
  12. toil/batchSystems/lsf.py +134 -81
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +290 -151
  16. toil/batchSystems/mesos/executor.py +79 -50
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +46 -28
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +296 -125
  21. toil/batchSystems/slurm.py +603 -138
  22. toil/batchSystems/torque.py +47 -33
  23. toil/bus.py +186 -76
  24. toil/common.py +664 -368
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1136 -483
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +63 -42
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +140 -60
  32. toil/fileStores/cachingFileStore.py +717 -269
  33. toil/fileStores/nonCachingFileStore.py +116 -87
  34. toil/job.py +1225 -368
  35. toil/jobStores/abstractJobStore.py +416 -266
  36. toil/jobStores/aws/jobStore.py +863 -477
  37. toil/jobStores/aws/utils.py +201 -120
  38. toil/jobStores/conftest.py +3 -2
  39. toil/jobStores/fileJobStore.py +292 -154
  40. toil/jobStores/googleJobStore.py +140 -74
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +668 -272
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +74 -31
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +214 -39
  49. toil/lib/aws/utils.py +287 -231
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +104 -47
  53. toil/lib/docker.py +131 -103
  54. toil/lib/ec2.py +361 -199
  55. toil/lib/ec2nodes.py +174 -106
  56. toil/lib/encryption/_dummy.py +5 -3
  57. toil/lib/encryption/_nacl.py +10 -6
  58. toil/lib/encryption/conftest.py +1 -0
  59. toil/lib/exceptions.py +26 -7
  60. toil/lib/expando.py +5 -3
  61. toil/lib/ftp_utils.py +217 -0
  62. toil/lib/generatedEC2Lists.py +127 -19
  63. toil/lib/humanize.py +6 -2
  64. toil/lib/integration.py +341 -0
  65. toil/lib/io.py +141 -15
  66. toil/lib/iterables.py +4 -2
  67. toil/lib/memoize.py +12 -8
  68. toil/lib/misc.py +66 -21
  69. toil/lib/objects.py +2 -2
  70. toil/lib/resources.py +68 -15
  71. toil/lib/retry.py +126 -81
  72. toil/lib/threading.py +299 -82
  73. toil/lib/throttle.py +16 -15
  74. toil/options/common.py +843 -409
  75. toil/options/cwl.py +175 -90
  76. toil/options/runner.py +50 -0
  77. toil/options/wdl.py +73 -17
  78. toil/provisioners/__init__.py +117 -46
  79. toil/provisioners/abstractProvisioner.py +332 -157
  80. toil/provisioners/aws/__init__.py +70 -33
  81. toil/provisioners/aws/awsProvisioner.py +1145 -715
  82. toil/provisioners/clusterScaler.py +541 -279
  83. toil/provisioners/gceProvisioner.py +282 -179
  84. toil/provisioners/node.py +155 -79
  85. toil/realtimeLogger.py +34 -22
  86. toil/resource.py +137 -75
  87. toil/server/app.py +128 -62
  88. toil/server/celery_app.py +3 -1
  89. toil/server/cli/wes_cwl_runner.py +82 -53
  90. toil/server/utils.py +54 -28
  91. toil/server/wes/abstract_backend.py +64 -26
  92. toil/server/wes/amazon_wes_utils.py +21 -15
  93. toil/server/wes/tasks.py +121 -63
  94. toil/server/wes/toil_backend.py +142 -107
  95. toil/server/wsgi_app.py +4 -3
  96. toil/serviceManager.py +58 -22
  97. toil/statsAndLogging.py +224 -70
  98. toil/test/__init__.py +282 -183
  99. toil/test/batchSystems/batchSystemTest.py +460 -210
  100. toil/test/batchSystems/batch_system_plugin_test.py +90 -0
  101. toil/test/batchSystems/test_gridengine.py +173 -0
  102. toil/test/batchSystems/test_lsf_helper.py +67 -58
  103. toil/test/batchSystems/test_slurm.py +110 -49
  104. toil/test/cactus/__init__.py +0 -0
  105. toil/test/cactus/test_cactus_integration.py +56 -0
  106. toil/test/cwl/cwlTest.py +496 -287
  107. toil/test/cwl/measure_default_memory.cwl +12 -0
  108. toil/test/cwl/not_run_required_input.cwl +29 -0
  109. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  110. toil/test/cwl/seqtk_seq.cwl +1 -1
  111. toil/test/docs/scriptsTest.py +69 -46
  112. toil/test/jobStores/jobStoreTest.py +427 -264
  113. toil/test/lib/aws/test_iam.py +118 -50
  114. toil/test/lib/aws/test_s3.py +16 -9
  115. toil/test/lib/aws/test_utils.py +5 -6
  116. toil/test/lib/dockerTest.py +118 -141
  117. toil/test/lib/test_conversions.py +113 -115
  118. toil/test/lib/test_ec2.py +58 -50
  119. toil/test/lib/test_integration.py +104 -0
  120. toil/test/lib/test_misc.py +12 -5
  121. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  122. toil/test/mesos/helloWorld.py +7 -6
  123. toil/test/mesos/stress.py +25 -20
  124. toil/test/options/__init__.py +13 -0
  125. toil/test/options/options.py +42 -0
  126. toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
  127. toil/test/provisioners/clusterScalerTest.py +440 -250
  128. toil/test/provisioners/clusterTest.py +166 -44
  129. toil/test/provisioners/gceProvisionerTest.py +174 -100
  130. toil/test/provisioners/provisionerTest.py +25 -13
  131. toil/test/provisioners/restartScript.py +5 -4
  132. toil/test/server/serverTest.py +188 -141
  133. toil/test/sort/restart_sort.py +137 -68
  134. toil/test/sort/sort.py +134 -66
  135. toil/test/sort/sortTest.py +91 -49
  136. toil/test/src/autoDeploymentTest.py +141 -101
  137. toil/test/src/busTest.py +20 -18
  138. toil/test/src/checkpointTest.py +8 -2
  139. toil/test/src/deferredFunctionTest.py +49 -35
  140. toil/test/src/dockerCheckTest.py +32 -24
  141. toil/test/src/environmentTest.py +135 -0
  142. toil/test/src/fileStoreTest.py +539 -272
  143. toil/test/src/helloWorldTest.py +7 -4
  144. toil/test/src/importExportFileTest.py +61 -31
  145. toil/test/src/jobDescriptionTest.py +46 -21
  146. toil/test/src/jobEncapsulationTest.py +2 -0
  147. toil/test/src/jobFileStoreTest.py +74 -50
  148. toil/test/src/jobServiceTest.py +187 -73
  149. toil/test/src/jobTest.py +121 -71
  150. toil/test/src/miscTests.py +19 -18
  151. toil/test/src/promisedRequirementTest.py +82 -36
  152. toil/test/src/promisesTest.py +7 -6
  153. toil/test/src/realtimeLoggerTest.py +10 -6
  154. toil/test/src/regularLogTest.py +71 -37
  155. toil/test/src/resourceTest.py +80 -49
  156. toil/test/src/restartDAGTest.py +36 -22
  157. toil/test/src/resumabilityTest.py +9 -2
  158. toil/test/src/retainTempDirTest.py +45 -14
  159. toil/test/src/systemTest.py +12 -8
  160. toil/test/src/threadingTest.py +44 -25
  161. toil/test/src/toilContextManagerTest.py +10 -7
  162. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  163. toil/test/src/workerTest.py +73 -23
  164. toil/test/utils/toilDebugTest.py +103 -33
  165. toil/test/utils/toilKillTest.py +4 -5
  166. toil/test/utils/utilsTest.py +245 -106
  167. toil/test/wdl/wdltoil_test.py +818 -149
  168. toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
  169. toil/toilState.py +120 -35
  170. toil/utils/toilConfig.py +13 -4
  171. toil/utils/toilDebugFile.py +44 -27
  172. toil/utils/toilDebugJob.py +214 -27
  173. toil/utils/toilDestroyCluster.py +11 -6
  174. toil/utils/toilKill.py +8 -3
  175. toil/utils/toilLaunchCluster.py +256 -140
  176. toil/utils/toilMain.py +37 -16
  177. toil/utils/toilRsyncCluster.py +32 -14
  178. toil/utils/toilSshCluster.py +49 -22
  179. toil/utils/toilStats.py +356 -273
  180. toil/utils/toilStatus.py +292 -139
  181. toil/utils/toilUpdateEC2Instances.py +3 -1
  182. toil/version.py +12 -12
  183. toil/wdl/utils.py +5 -5
  184. toil/wdl/wdltoil.py +3913 -1033
  185. toil/worker.py +367 -184
  186. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
  187. toil-8.0.0.dist-info/METADATA +173 -0
  188. toil-8.0.0.dist-info/RECORD +253 -0
  189. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
  190. toil-6.1.0a1.dist-info/METADATA +0 -125
  191. toil-6.1.0a1.dist-info/RECORD +0 -237
  192. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
  193. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
@@ -18,27 +18,21 @@ import math
 import os
 import time
 from collections import defaultdict
-from typing import (TYPE_CHECKING,
-                    Any,
-                    Callable,
-                    Dict,
-                    List,
-                    Optional,
-                    Set,
-                    Tuple,
-                    Union)
-
-from toil.batchSystems.abstractBatchSystem import (AbstractBatchSystem,
-                                                   AbstractScalableBatchSystem,
-                                                   NodeInfo)
+from typing import TYPE_CHECKING, Any, Callable, Optional, Union
+
+from toil.batchSystems.abstractBatchSystem import (
+    AbstractBatchSystem,
+    AbstractScalableBatchSystem,
+    NodeInfo,
+)
 from toil.bus import ClusterDesiredSizeMessage, ClusterSizeMessage
 from toil.common import Config
-from toil.options.common import defaultTargetTime
 from toil.job import JobDescription, ServiceJobDescription
 from toil.lib.conversions import bytes2human, human2bytes
 from toil.lib.retry import old_retry
 from toil.lib.threading import ExceptionalThread
 from toil.lib.throttle import throttle
+from toil.options.common import defaultTargetTime
 from toil.provisioners.abstractProvisioner import AbstractProvisioner, Shape
 
 if TYPE_CHECKING:
@@ -48,18 +42,25 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 
 # Properties of GKE's memory overhead algorithm
-EVICTION_THRESHOLD = human2bytes('100MiB')
-RESERVE_SMALL_LIMIT = human2bytes('1GiB')
-RESERVE_SMALL_AMOUNT = human2bytes('255MiB')
-RESERVE_BREAKPOINTS: List[Union[int, float]] = [human2bytes('4GiB'), human2bytes('8GiB'), human2bytes('16GiB'), human2bytes('128GiB'), math.inf]
+EVICTION_THRESHOLD = human2bytes("100MiB")
+RESERVE_SMALL_LIMIT = human2bytes("1GiB")
+RESERVE_SMALL_AMOUNT = human2bytes("255MiB")
+RESERVE_BREAKPOINTS: list[Union[int, float]] = [
+    human2bytes("4GiB"),
+    human2bytes("8GiB"),
+    human2bytes("16GiB"),
+    human2bytes("128GiB"),
+    math.inf,
+]
 RESERVE_FRACTIONS = [0.25, 0.2, 0.1, 0.06, 0.02]
 
 # Guess of how much disk space on the root volume is used for the OS and essential container images
-OS_SIZE = human2bytes('5G')
+OS_SIZE = human2bytes("5G")
 
 # Define a type for an explanation of why a job can't fit on a node.
 # Consists of a resource name and a constraining value for that resource.
-FailedConstraint = Tuple[str, Union[int, float, bool]]
+FailedConstraint = tuple[str, Union[int, float, bool]]
+
 
 class BinPackedFit:
     """
@@ -80,24 +81,30 @@ class BinPackedFit:
80
81
  :returns: The minimum number of minimal node allocations estimated to be required to run all
81
82
  the jobs in jobShapes.
82
83
  """
83
- nodeReservations: Dict[Shape, List['NodeReservation']]
84
84
 
85
- def __init__(self, nodeShapes: List[Shape], targetTime: float = defaultTargetTime) -> None:
85
+ nodeReservations: dict[Shape, list["NodeReservation"]]
86
+
87
+ def __init__(
88
+ self, nodeShapes: list[Shape], targetTime: float = defaultTargetTime
89
+ ) -> None:
86
90
  self.nodeShapes = sorted(nodeShapes)
87
91
  self.targetTime = targetTime
88
92
  self.nodeReservations = {nodeShape: [] for nodeShape in nodeShapes}
89
93
 
90
- def binPack(self, jobShapes: List[Shape]) -> Dict[Shape, List[FailedConstraint]]:
94
+ def binPack(self, jobShapes: list[Shape]) -> dict[Shape, list[FailedConstraint]]:
91
95
  """
92
96
  Pack a list of jobShapes into the fewest nodes reasonable.
93
-
97
+
94
98
  Can be run multiple times.
95
-
99
+
96
100
  Returns any distinct Shapes that did not fit, mapping to reasons they did not fit.
97
101
  """
98
102
  # TODO: Check for redundancy with batchsystems.mesos.JobQueue() sorting
99
- logger.debug('Running bin packing for node shapes %s and %s job(s).',
100
- self.nodeShapes, len(jobShapes))
103
+ logger.debug(
104
+ "Running bin packing for node shapes %s and %s job(s).",
105
+ self.nodeShapes,
106
+ len(jobShapes),
107
+ )
101
108
  # Sort in descending order from largest to smallest. The FFD like-strategy will pack the
102
109
  # jobs in order from longest to shortest.
103
110
  jobShapes.sort()
@@ -111,11 +118,13 @@ class BinPackedFit:
111
118
  could_not_fit[rejection[0]] = rejection[1]
112
119
  return could_not_fit
113
120
 
114
- def addJobShape(self, jobShape: Shape) -> Optional[Tuple[Shape, List[FailedConstraint]]]:
121
+ def addJobShape(
122
+ self, jobShape: Shape
123
+ ) -> Optional[tuple[Shape, list[FailedConstraint]]]:
115
124
  """
116
125
  Add the job to the first node reservation in which it will fit. (This
117
126
  is the bin-packing aspect).
118
-
127
+
119
128
  Returns the job shape again, and a list of failed constraints, if it did not fit.
120
129
  """
121
130
  chosenNodeShape = None
@@ -126,24 +135,33 @@ class BinPackedFit:
126
135
  break
127
136
 
128
137
  if chosenNodeShape is None:
129
- logger.debug("Couldn't fit job with requirements %s into any nodes in the nodeTypes "
130
- "list.", jobShape)
138
+ logger.debug(
139
+ "Couldn't fit job with requirements %s into any nodes in the nodeTypes "
140
+ "list.",
141
+ jobShape,
142
+ )
131
143
  # Go back and debug why this happened.
132
- fewest_constraints: Optional[List[FailedConstraint]] = None
144
+ fewest_constraints: Optional[list[FailedConstraint]] = None
133
145
  for shape in self.nodeShapes:
134
146
  failures = NodeReservation(nodeShape).get_failed_constraints(jobShape)
135
- if fewest_constraints is None or len(failures) < len(fewest_constraints):
147
+ if fewest_constraints is None or len(failures) < len(
148
+ fewest_constraints
149
+ ):
136
150
  # This was closer to fitting.
137
151
  # TODO: Check the actual constraint values so we don't tell
138
152
  # the user to raise the memory on the smallest machine?
139
153
  fewest_constraints = failures
140
-
141
- return jobShape, fewest_constraints if fewest_constraints is not None else []
154
+
155
+ return jobShape, (
156
+ fewest_constraints if fewest_constraints is not None else []
157
+ )
142
158
 
143
159
  # grab current list of job objects appended to this instance type
144
160
  nodeReservations = self.nodeReservations[chosenNodeShape]
145
161
  for nodeReservation in nodeReservations:
146
- if nodeReservation.attemptToAddJob(jobShape, chosenNodeShape, self.targetTime):
162
+ if nodeReservation.attemptToAddJob(
163
+ jobShape, chosenNodeShape, self.targetTime
164
+ ):
147
165
  # We succeeded adding the job to this node reservation. Now we're done.
148
166
  return None
149
167
 
@@ -160,7 +178,7 @@ class BinPackedFit:
160
178
  reservation = extendThisReservation
161
179
  return None
162
180
 
163
- def getRequiredNodes(self) -> Dict[Shape, int]:
181
+ def getRequiredNodes(self) -> dict[Shape, int]:
164
182
  """Return a dict from node shape to number of nodes required to run the packed jobs."""
165
183
  return {
166
184
  nodeShape: len(self.nodeReservations[nodeShape])
@@ -184,48 +202,72 @@ class NodeReservation:
184
202
  self.nReservation: Optional[NodeReservation] = None
185
203
 
186
204
  def __str__(self) -> str:
187
- return "-------------------\n" \
188
- "Current Reservation\n" \
189
- "-------------------\n" \
190
- "Shape wallTime: %s\n" \
191
- "Shape memory: %s\n" \
192
- "Shape cores: %s\n" \
193
- "Shape disk: %s\n" \
194
- "Shape preempt: %s\n" \
195
- "\n" \
196
- "nReserv wallTime: %s\n" \
197
- "nReserv memory: %s\n" \
198
- "nReserv cores: %s\n" \
199
- "nReserv disk: %s\n" \
200
- "nReserv preempt: %s\n" \
201
- "\n" \
202
- "Time slices: %s\n" \
203
- "\n" % \
204
- (self.shape.wallTime,
205
+ return (
206
+ "-------------------\n"
207
+ "Current Reservation\n"
208
+ "-------------------\n"
209
+ "Shape wallTime: %s\n"
210
+ "Shape memory: %s\n"
211
+ "Shape cores: %s\n"
212
+ "Shape disk: %s\n"
213
+ "Shape preempt: %s\n"
214
+ "\n"
215
+ "nReserv wallTime: %s\n"
216
+ "nReserv memory: %s\n"
217
+ "nReserv cores: %s\n"
218
+ "nReserv disk: %s\n"
219
+ "nReserv preempt: %s\n"
220
+ "\n"
221
+ "Time slices: %s\n"
222
+ "\n"
223
+ % (
224
+ self.shape.wallTime,
205
225
  self.shape.memory,
206
226
  self.shape.cores,
207
227
  self.shape.disk,
208
228
  self.shape.preemptible,
209
- self.nReservation.shape.wallTime if self.nReservation is not None else str(None),
210
- self.nReservation.shape.memory if self.nReservation is not None else str(None),
211
- self.nReservation.shape.cores if self.nReservation is not None else str(None),
212
- self.nReservation.shape.disk if self.nReservation is not None else str(None),
213
- self.nReservation.shape.preemptible if self.nReservation is not None else str(None),
214
- str(len(self.shapes())))
215
-
216
- def get_failed_constraints(self, job_shape: Shape) -> List[FailedConstraint]:
229
+ (
230
+ self.nReservation.shape.wallTime
231
+ if self.nReservation is not None
232
+ else str(None)
233
+ ),
234
+ (
235
+ self.nReservation.shape.memory
236
+ if self.nReservation is not None
237
+ else str(None)
238
+ ),
239
+ (
240
+ self.nReservation.shape.cores
241
+ if self.nReservation is not None
242
+ else str(None)
243
+ ),
244
+ (
245
+ self.nReservation.shape.disk
246
+ if self.nReservation is not None
247
+ else str(None)
248
+ ),
249
+ (
250
+ self.nReservation.shape.preemptible
251
+ if self.nReservation is not None
252
+ else str(None)
253
+ ),
254
+ str(len(self.shapes())),
255
+ )
256
+ )
257
+
258
+ def get_failed_constraints(self, job_shape: Shape) -> list[FailedConstraint]:
217
259
  """
218
260
  Check if a job shape's resource requirements will fit within this allocation.
219
-
261
+
220
262
  If the job does *not* fit, returns the failing constraints: the resources
221
263
  that can't be accomodated, and the limits that were hit.
222
-
264
+
223
265
  If the job *does* fit, returns an empty list.
224
-
266
+
225
267
  Must always agree with fits()! This codepath is slower and used for diagnosis.
226
268
  """
227
-
228
- failures: List[FailedConstraint] = []
269
+
270
+ failures: list[FailedConstraint] = []
229
271
  if job_shape.memory > self.shape.memory:
230
272
  failures.append(("memory", self.shape.memory))
231
273
  if job_shape.cores > self.shape.cores:
@@ -235,15 +277,17 @@ class NodeReservation:
235
277
  if not job_shape.preemptible and self.shape.preemptible:
236
278
  failures.append(("preemptible", self.shape.preemptible))
237
279
  return failures
238
-
280
+
239
281
  def fits(self, jobShape: Shape) -> bool:
240
282
  """Check if a job shape's resource requirements will fit within this allocation."""
241
- return jobShape.memory <= self.shape.memory and \
242
- jobShape.cores <= self.shape.cores and \
243
- jobShape.disk <= self.shape.disk and \
244
- (jobShape.preemptible or not self.shape.preemptible)
283
+ return (
284
+ jobShape.memory <= self.shape.memory
285
+ and jobShape.cores <= self.shape.cores
286
+ and jobShape.disk <= self.shape.disk
287
+ and (jobShape.preemptible or not self.shape.preemptible)
288
+ )
245
289
 
246
- def shapes(self) -> List[Shape]:
290
+ def shapes(self) -> list[Shape]:
247
291
  """Get all time-slice shapes, in order, from this reservation on."""
248
292
  shapes = []
249
293
  curRes: Optional[NodeReservation] = self
@@ -254,11 +298,13 @@ class NodeReservation:
254
298
 
255
299
  def subtract(self, jobShape: Shape) -> None:
256
300
  """Subtract the resources necessary to run a jobShape from the reservation."""
257
- self.shape = Shape(self.shape.wallTime,
258
- self.shape.memory - jobShape.memory,
259
- self.shape.cores - jobShape.cores,
260
- self.shape.disk - jobShape.disk,
261
- self.shape.preemptible)
301
+ self.shape = Shape(
302
+ self.shape.wallTime,
303
+ self.shape.memory - jobShape.memory,
304
+ self.shape.cores - jobShape.cores,
305
+ self.shape.disk - jobShape.disk,
306
+ self.shape.preemptible,
307
+ )
262
308
 
263
309
  def attemptToAddJob(
264
310
  self, jobShape: Shape, nodeShape: Shape, targetTime: float
@@ -286,27 +332,42 @@ class NodeReservation:
286
332
  # does the job time fit in the reservation's remaining time?
287
333
  if availableTime >= jobShape.wallTime:
288
334
  timeSlice: float = 0
289
- while (startingReservation != endingReservation):
335
+ while startingReservation != endingReservation:
290
336
  # removes resources only (NO time) from startingReservation
291
337
  startingReservation.subtract(jobShape) # type: ignore
292
338
  # set aside the timeSlice
293
339
  timeSlice += startingReservation.shape.wallTime # type: ignore
294
340
  startingReservation = startingReservation.nReservation # type: ignore
295
- assert jobShape.wallTime - timeSlice <= startingReservation.shape.wallTime
296
- adjustEndingReservationForJob(endingReservation, jobShape, timeSlice)
341
+ assert (
342
+ jobShape.wallTime - timeSlice
343
+ <= startingReservation.shape.wallTime
344
+ )
345
+ adjustEndingReservationForJob(
346
+ endingReservation, jobShape, timeSlice
347
+ )
297
348
  # Packed the job.
298
349
  return True
299
350
 
300
351
  # If the job would fit, but is longer than the total node allocation
301
352
  # extend the node allocation
302
- elif endingReservation.nReservation == None and startingReservation == self:
353
+ elif (
354
+ endingReservation.nReservation == None
355
+ and startingReservation == self
356
+ ):
303
357
  # Extend the node reservation to accommodate jobShape
304
358
  endingReservation.nReservation = NodeReservation(nodeShape)
305
359
  # can't run the job with the current resources
306
360
  else:
307
- if startingReservationTime + availableTime + endingReservation.shape.wallTime <= targetTime:
361
+ if (
362
+ startingReservationTime
363
+ + availableTime
364
+ + endingReservation.shape.wallTime
365
+ <= targetTime
366
+ ):
308
367
  startingReservation = endingReservation.nReservation
309
- startingReservationTime += availableTime + endingReservation.shape.wallTime
368
+ startingReservationTime += (
369
+ availableTime + endingReservation.shape.wallTime
370
+ )
310
371
  availableTime = 0
311
372
  else:
312
373
  break
@@ -332,7 +393,9 @@ def adjustEndingReservationForJob(
332
393
  """
333
394
  if jobShape.wallTime - wallTime < reservation.shape.wallTime:
334
395
  # This job only partially fills one of the slices. Create a new slice.
335
- reservation.shape, nS = split(reservation.shape, jobShape, jobShape.wallTime - wallTime)
396
+ reservation.shape, nS = split(
397
+ reservation.shape, jobShape, jobShape.wallTime - wallTime
398
+ )
336
399
  nS.nReservation = reservation.nReservation
337
400
  reservation.nReservation = nS
338
401
  else:
@@ -342,30 +405,40 @@ def adjustEndingReservationForJob(
342
405
 
343
406
  def split(
344
407
  nodeShape: Shape, jobShape: Shape, wallTime: float
345
- ) -> Tuple[Shape, NodeReservation]:
408
+ ) -> tuple[Shape, NodeReservation]:
346
409
  """
347
410
  Partition a node allocation into two to fit the job.
348
411
 
349
412
  Returning the modified shape of the node and a new node reservation for
350
413
  the extra time that the job didn't fill.
351
414
  """
352
- return (Shape(wallTime,
353
- nodeShape.memory - jobShape.memory,
354
- nodeShape.cores - jobShape.cores,
355
- nodeShape.disk - jobShape.disk,
356
- nodeShape.preemptible),
357
- NodeReservation(Shape(nodeShape.wallTime - wallTime,
358
- nodeShape.memory,
359
- nodeShape.cores,
360
- nodeShape.disk,
361
- nodeShape.preemptible)))
362
-
363
-
364
- def binPacking(nodeShapes: List[Shape], jobShapes: List[Shape], goalTime: float) -> Tuple[Dict[Shape, int], Dict[Shape, List[FailedConstraint]]]:
415
+ return (
416
+ Shape(
417
+ wallTime,
418
+ nodeShape.memory - jobShape.memory,
419
+ nodeShape.cores - jobShape.cores,
420
+ nodeShape.disk - jobShape.disk,
421
+ nodeShape.preemptible,
422
+ ),
423
+ NodeReservation(
424
+ Shape(
425
+ nodeShape.wallTime - wallTime,
426
+ nodeShape.memory,
427
+ nodeShape.cores,
428
+ nodeShape.disk,
429
+ nodeShape.preemptible,
430
+ )
431
+ ),
432
+ )
433
+
434
+
435
+ def binPacking(
436
+ nodeShapes: list[Shape], jobShapes: list[Shape], goalTime: float
437
+ ) -> tuple[dict[Shape, int], dict[Shape, list[FailedConstraint]]]:
365
438
  """
366
439
  Using the given node shape bins, pack the given job shapes into nodes to
367
440
  get them done in the given amount of time.
368
-
441
+
369
442
  Returns a dict saying how many of each node will be needed, a dict from job
370
443
  shapes that could not fit to reasons why.
371
444
  """
@@ -388,34 +461,37 @@ class ClusterScaler:
388
461
  self.provisioner = provisioner
389
462
  self.leader = leader
390
463
  self.config = config
391
- self.static: Dict[bool, Dict[str, "Node"]] = {}
392
-
464
+ self.static: dict[bool, dict[str, "Node"]] = {}
465
+
393
466
  # If we encounter a Shape of job that we don't think we can run, call
394
467
  # these callbacks with the Shape that didn't fit and the Shapes that
395
468
  # were available.
396
- self.on_too_big: List[Callable[[Shape, List[Shape]], Any]] = []
469
+ self.on_too_big: list[Callable[[Shape, list[Shape]], Any]] = []
397
470
 
398
471
  # Dictionary of job names to their average runtime, used to estimate wall time of queued
399
472
  # jobs for bin-packing
400
- self.jobNameToAvgRuntime: Dict[str, float] = {}
401
- self.jobNameToNumCompleted: Dict[str, int] = {}
473
+ self.jobNameToAvgRuntime: dict[str, float] = {}
474
+ self.jobNameToNumCompleted: dict[str, int] = {}
402
475
  self.totalAvgRuntime = 0.0
403
476
  self.totalJobsCompleted = 0
404
477
 
405
478
  self.targetTime: float = config.targetTime
406
479
  if self.targetTime <= 0:
407
- raise RuntimeError('targetTime (%s) must be a positive integer!' % self.targetTime)
480
+ raise RuntimeError(
481
+ "targetTime (%s) must be a positive integer!" % self.targetTime
482
+ )
408
483
  self.betaInertia = config.betaInertia
409
484
  if not 0.0 <= self.betaInertia <= 0.9:
410
- raise RuntimeError('betaInertia (%f) must be between 0.0 and 0.9!' % self.betaInertia)
411
-
485
+ raise RuntimeError(
486
+ "betaInertia (%f) must be between 0.0 and 0.9!" % self.betaInertia
487
+ )
412
488
 
413
489
  # Pull scaling information from the provisioner.
414
490
  self.nodeShapeToType = provisioner.getAutoscaledInstanceShapes()
415
491
  self.instance_types = list(self.nodeShapeToType.values())
416
492
  self.nodeShapes = list(self.nodeShapeToType.keys())
417
493
 
418
- self.ignoredNodes: Set[str] = set()
494
+ self.ignoredNodes: set[str] = set()
419
495
 
420
496
  # A *deficit* exists when we have more jobs that can run on preemptible
421
497
  # nodes than we have preemptible nodes. In order to not block these jobs,
@@ -426,13 +502,17 @@ class ClusterScaler:
426
502
  # of provisioned preemptible nodes and the number of nodes that were requested.
427
503
  # Then, when provisioning non-preemptible nodes of the same type, we attempt to
428
504
  # make up the deficit.
429
- self.preemptibleNodeDeficit = {instance_type: 0 for instance_type in self.instance_types}
505
+ self.preemptibleNodeDeficit = {
506
+ instance_type: 0 for instance_type in self.instance_types
507
+ }
430
508
 
431
509
  # Keeps track of the last raw (i.e. float, not limited by
432
510
  # max/min nodes) estimates of the number of nodes needed for
433
511
  # each node shape. NB: we start with an estimate of 0, so
434
512
  # scaling up is smoothed as well.
435
- self.previousWeightedEstimate = {nodeShape: 0.0 for nodeShape in self.nodeShapes}
513
+ self.previousWeightedEstimate = {
514
+ nodeShape: 0.0 for nodeShape in self.nodeShapes
515
+ }
436
516
 
437
517
  assert len(self.nodeShapes) > 0
438
518
 
@@ -454,26 +534,38 @@ class ClusterScaler:
454
534
  self.nodeShapes.sort()
455
535
 
456
536
  # Nodes might not actually provide all the resources of their nominal shapes
457
- self.node_shapes_after_overhead = self.nodeShapes if config.assume_zero_overhead else [self._reserve_overhead(s) for s in self.nodeShapes]
458
- self.without_overhead = {k: v for k, v in zip(self.node_shapes_after_overhead, self.nodeShapes)}
537
+ self.node_shapes_after_overhead = (
538
+ self.nodeShapes
539
+ if config.assume_zero_overhead
540
+ else [self._reserve_overhead(s) for s in self.nodeShapes]
541
+ )
542
+ self.without_overhead = {
543
+ k: v for k, v in zip(self.node_shapes_after_overhead, self.nodeShapes)
544
+ }
459
545
 
460
- #Node shape to number of currently provisioned nodes
461
- totalNodes: Dict[Shape, int] = defaultdict(int)
462
- if isinstance(leader.batchSystem, AbstractScalableBatchSystem) and leader.provisioner:
546
+ # Node shape to number of currently provisioned nodes
547
+ totalNodes: dict[Shape, int] = defaultdict(int)
548
+ if (
549
+ isinstance(leader.batchSystem, AbstractScalableBatchSystem)
550
+ and leader.provisioner
551
+ ):
463
552
  for preemptible in (True, False):
464
- nodes: List["Node"] = []
553
+ nodes: list["Node"] = []
465
554
  for nodeShape, instance_type in self.nodeShapeToType.items():
466
- nodes_thisType = leader.provisioner.getProvisionedWorkers(instance_type=instance_type,
467
- preemptible=preemptible)
555
+ nodes_thisType = leader.provisioner.getProvisionedWorkers(
556
+ instance_type=instance_type, preemptible=preemptible
557
+ )
468
558
  totalNodes[nodeShape] += len(nodes_thisType)
469
559
  nodes.extend(nodes_thisType)
470
560
 
471
561
  self.setStaticNodes(nodes, preemptible)
472
562
 
473
- logger.debug('Starting with the following nodes in the cluster: %s' % totalNodes)
563
+ logger.debug(
564
+ "Starting with the following nodes in the cluster: %s" % totalNodes
565
+ )
474
566
 
475
567
  if not sum(config.maxNodes) > 0:
476
- raise RuntimeError('Not configured to create nodes of any type.')
568
+ raise RuntimeError("Not configured to create nodes of any type.")
477
569
 
478
570
  def _round(self, number: float) -> int:
479
571
  """
@@ -529,7 +621,7 @@ class ClusterScaler:
529
621
  # TODO: Figure out if the disk is an OS disk of a scratch disk
530
622
  smaller.disk -= self._disk_overhead(smaller.disk)
531
623
 
532
- logger.debug('Node shape %s can hold jobs of shape %s', full_node, smaller)
624
+ logger.debug("Node shape %s can hold jobs of shape %s", full_node, smaller)
533
625
 
534
626
  return smaller
535
627
 
@@ -558,12 +650,21 @@ class ClusterScaler:
558
650
  # since the previous breakpoint, like a progressive income tax.
559
651
  limit = min(breakpoint, memory_bytes)
560
652
  reservation = fraction * (limit - accounted)
561
- logger.debug('Reserve %s of memory between %s and %s', bytes2human(reservation), bytes2human(accounted), bytes2human(limit))
653
+ logger.debug(
654
+ "Reserve %s of memory between %s and %s",
655
+ bytes2human(reservation),
656
+ bytes2human(accounted),
657
+ bytes2human(limit),
658
+ )
562
659
  reserved += reservation
563
660
  accounted = limit
564
661
  if accounted >= memory_bytes:
565
662
  break
566
- logger.debug('Reserved %s/%s memory for overhead', bytes2human(reserved), bytes2human(memory_bytes))
663
+ logger.debug(
664
+ "Reserved %s/%s memory for overhead",
665
+ bytes2human(reserved),
666
+ bytes2human(memory_bytes),
667
+ )
567
668
 
568
669
  return int(reserved) + EVICTION_THRESHOLD
569
670
 
@@ -579,15 +680,20 @@ class ClusterScaler:
579
680
 
580
681
  if disk_bytes <= disk_needed:
581
682
  # We don't think we can actually use any of this disk
582
- logger.warning('All %sB of disk on a node type are likely to be needed by the OS! The node probably cannot do any useful work!', bytes2human(disk_bytes))
683
+ logger.warning(
684
+ "All %sB of disk on a node type are likely to be needed by the OS! The node probably cannot do any useful work!",
685
+ bytes2human(disk_bytes),
686
+ )
583
687
  return disk_bytes
584
688
 
585
689
  if disk_needed * 2 > disk_bytes:
586
- logger.warning('A node type has only %sB disk, of which more than half are expected to be used by the OS. Consider using a larger --nodeStorage', bytes2human(disk_bytes))
690
+ logger.warning(
691
+ "A node type has only %sB disk, of which more than half are expected to be used by the OS. Consider using a larger --nodeStorage",
692
+ bytes2human(disk_bytes),
693
+ )
587
694
 
588
695
  return disk_needed
589
696
 
590
-
591
697
  def getAverageRuntime(self, jobName: str, service: bool = False) -> float:
592
698
  if service:
593
699
  # We short-circuit service jobs and assume that they will
@@ -599,15 +705,15 @@ class ClusterScaler:
599
705
  # be running at once for any actual work to get done.
600
706
  return self.targetTime * 24 + 3600
601
707
  if jobName in self.jobNameToAvgRuntime:
602
- #Have seen jobs of this type before, so estimate
603
- #the runtime based on average of previous jobs of this type
708
+ # Have seen jobs of this type before, so estimate
709
+ # the runtime based on average of previous jobs of this type
604
710
  return self.jobNameToAvgRuntime[jobName]
605
711
  elif self.totalAvgRuntime > 0:
606
- #Haven't seen this job yet, so estimate its runtime as
607
- #the average runtime of all completed jobs
712
+ # Haven't seen this job yet, so estimate its runtime as
713
+ # the average runtime of all completed jobs
608
714
  return self.totalAvgRuntime
609
715
  else:
610
- #Have no information whatsoever
716
+ # Have no information whatsoever
611
717
  return 1.0
612
718
 
613
719
  def addCompletedJob(self, job: JobDescription, wallTime: int) -> None:
@@ -618,21 +724,25 @@ class ClusterScaler:
618
724
  :param int wallTime: The wall-time taken to complete the job in seconds.
619
725
  """
620
726
 
621
- #Adjust average runtimes to include this job.
727
+ # Adjust average runtimes to include this job.
622
728
  if job.jobName in self.jobNameToAvgRuntime:
623
729
  prevAvg = self.jobNameToAvgRuntime[job.jobName]
624
730
  prevNum = self.jobNameToNumCompleted[job.jobName]
625
- self.jobNameToAvgRuntime[job.jobName] = float(prevAvg*prevNum + wallTime)/(prevNum + 1)
731
+ self.jobNameToAvgRuntime[job.jobName] = float(
732
+ prevAvg * prevNum + wallTime
733
+ ) / (prevNum + 1)
626
734
  self.jobNameToNumCompleted[job.jobName] += 1
627
735
  else:
628
736
  self.jobNameToAvgRuntime[job.jobName] = wallTime
629
737
  self.jobNameToNumCompleted[job.jobName] = 1
630
738
 
631
739
  self.totalJobsCompleted += 1
632
- self.totalAvgRuntime = float(self.totalAvgRuntime * (self.totalJobsCompleted - 1) + \
633
- wallTime)/self.totalJobsCompleted
740
+ self.totalAvgRuntime = (
741
+ float(self.totalAvgRuntime * (self.totalJobsCompleted - 1) + wallTime)
742
+ / self.totalJobsCompleted
743
+ )
634
744
 
635
- def setStaticNodes(self, nodes: List["Node"], preemptible: bool) -> None:
745
+ def setStaticNodes(self, nodes: list["Node"], preemptible: bool) -> None:
636
746
  """
637
747
  Used to track statically provisioned nodes. This method must be called
638
748
  before any auto-scaled nodes are provisioned.
@@ -642,12 +752,12 @@ class ClusterScaler:
642
752
 
643
753
  :param nodes: list of Node objects
644
754
  """
645
- prefix = 'non-' if not preemptible else ''
755
+ prefix = "non-" if not preemptible else ""
646
756
  logger.debug("Adding %s to %spreemptible static nodes", nodes, prefix)
647
757
  if nodes is not None:
648
- self.static[preemptible] = {node.privateIP : node for node in nodes}
758
+ self.static[preemptible] = {node.privateIP: node for node in nodes}
649
759
 
650
- def getStaticNodes(self, preemptible: bool) -> Dict[str, "Node"]:
760
+ def getStaticNodes(self, preemptible: bool) -> dict[str, "Node"]:
651
761
  """
652
762
  Returns nodes set in setStaticNodes().
653
763
 
@@ -662,14 +772,17 @@ class ClusterScaler:
662
772
 
663
773
  Returns an integer.
664
774
  """
665
- weightedEstimate = (1 - self.betaInertia) * estimatedNodeCount + \
666
- self.betaInertia * self.previousWeightedEstimate[nodeShape]
775
+ weightedEstimate = (
776
+ 1 - self.betaInertia
777
+ ) * estimatedNodeCount + self.betaInertia * self.previousWeightedEstimate[
778
+ nodeShape
779
+ ]
667
780
  self.previousWeightedEstimate[nodeShape] = weightedEstimate
668
781
  return self._round(weightedEstimate)
669
782
 
670
783
  def getEstimatedNodeCounts(
671
- self, queuedJobShapes: List[Shape], currentNodeCounts: Dict[Shape, int]
672
- ) -> Tuple[Dict[Shape, int], Dict[Shape, List[FailedConstraint]]]:
784
+ self, queuedJobShapes: list[Shape], currentNodeCounts: dict[Shape, int]
785
+ ) -> tuple[dict[Shape, int], dict[Shape, list[FailedConstraint]]]:
673
786
  """
674
787
  Given the resource requirements of queued jobs and the current size of the cluster.
675
788
 
@@ -682,21 +795,30 @@ class ClusterScaler:
682
795
  nodesToRunQueuedJobs, could_not_fit = binPacking(
683
796
  jobShapes=queuedJobShapes,
684
797
  nodeShapes=self.node_shapes_after_overhead,
685
- goalTime=self.targetTime
798
+ goalTime=self.targetTime,
686
799
  )
687
-
800
+
688
801
  # Then translate back to get results in terms of full nodes without overhead.
689
- nodesToRunQueuedJobs = {self.without_overhead[k]: v for k, v in nodesToRunQueuedJobs.items()}
802
+ nodesToRunQueuedJobs = {
803
+ self.without_overhead[k]: v for k, v in nodesToRunQueuedJobs.items()
804
+ }
690
805
 
691
806
  estimatedNodeCounts = {}
692
807
  for nodeShape in self.nodeShapes:
693
808
  instance_type = self.nodeShapeToType[nodeShape]
694
809
 
695
- logger.debug(f"Nodes of type {instance_type} to run queued jobs: {nodesToRunQueuedJobs[nodeShape]}")
810
+ logger.debug(
811
+ f"Nodes of type {instance_type} to run queued jobs: {nodesToRunQueuedJobs[nodeShape]}"
812
+ )
696
813
  # Actual calculation of the estimated number of nodes required
697
- estimatedNodeCount = 0 if nodesToRunQueuedJobs[nodeShape] == 0 \
814
+ estimatedNodeCount = (
815
+ 0
816
+ if nodesToRunQueuedJobs[nodeShape] == 0
698
817
  else max(1, self._round(nodesToRunQueuedJobs[nodeShape]))
699
- logger.debug("Estimating %i nodes of shape %s" % (estimatedNodeCount, nodeShape))
818
+ )
819
+ logger.debug(
820
+ "Estimating %i nodes of shape %s" % (estimatedNodeCount, nodeShape)
821
+ )
700
822
 
701
823
  # Use inertia parameter to smooth out fluctuations according to an exponentially
702
824
  # weighted moving average.
@@ -710,37 +832,56 @@ class ClusterScaler:
710
832
  # The number of nodes we provision as compensation for missing preemptible
711
833
  # nodes is the product of the deficit (the number of preemptible nodes we did
712
834
  # _not_ allocate) and configuration preference.
713
- compensationNodes = self._round(self.preemptibleNodeDeficit[instance_type] * compensation)
835
+ compensationNodes = self._round(
836
+ self.preemptibleNodeDeficit[instance_type] * compensation
837
+ )
714
838
  if compensationNodes > 0:
715
- logger.debug('Adding %d non-preemptible nodes of type %s to compensate for a '
716
- 'deficit of %d preemptible ones.', compensationNodes,
717
- instance_type,
718
- self.preemptibleNodeDeficit[instance_type])
839
+ logger.debug(
840
+ "Adding %d non-preemptible nodes of type %s to compensate for a "
841
+ "deficit of %d preemptible ones.",
842
+ compensationNodes,
843
+ instance_type,
844
+ self.preemptibleNodeDeficit[instance_type],
845
+ )
719
846
  estimatedNodeCount += compensationNodes
720
847
 
721
848
  # Tell everyone how big the cluster is
722
- logger.debug("Currently %i nodes of type %s in cluster" % (currentNodeCounts[nodeShape],
723
- instance_type))
724
- self.leader.toilState.bus.publish(ClusterSizeMessage(instance_type, currentNodeCounts[nodeShape]))
725
- self.leader.toilState.bus.publish(ClusterDesiredSizeMessage(instance_type, estimatedNodeCount))
849
+ logger.debug(
850
+ "Currently %i nodes of type %s in cluster"
851
+ % (currentNodeCounts[nodeShape], instance_type)
852
+ )
853
+ self.leader.toilState.bus.publish(
854
+ ClusterSizeMessage(instance_type, currentNodeCounts[nodeShape])
855
+ )
856
+ self.leader.toilState.bus.publish(
857
+ ClusterDesiredSizeMessage(instance_type, estimatedNodeCount)
858
+ )
726
859
 
727
860
  # Bound number using the max and min node parameters
728
861
  if estimatedNodeCount > self.maxNodes[nodeShape]:
729
- logger.debug('Limiting the estimated number of necessary %s (%s) to the '
730
- 'configured maximum (%s).', instance_type,
731
- estimatedNodeCount,
732
- self.maxNodes[nodeShape])
862
+ logger.debug(
863
+ "Limiting the estimated number of necessary %s (%s) to the "
864
+ "configured maximum (%s).",
865
+ instance_type,
866
+ estimatedNodeCount,
867
+ self.maxNodes[nodeShape],
868
+ )
733
869
  estimatedNodeCount = self.maxNodes[nodeShape]
734
870
  elif estimatedNodeCount < self.minNodes[nodeShape]:
735
- logger.debug('Raising the estimated number of necessary %s (%s) to the '
736
- 'configured minimum (%s).', instance_type,
737
- estimatedNodeCount,
738
- self.minNodes[nodeShape])
871
+ logger.debug(
872
+ "Raising the estimated number of necessary %s (%s) to the "
873
+ "configured minimum (%s).",
874
+ instance_type,
875
+ estimatedNodeCount,
876
+ self.minNodes[nodeShape],
877
+ )
739
878
  estimatedNodeCount = self.minNodes[nodeShape]
740
879
  estimatedNodeCounts[nodeShape] = estimatedNodeCount
741
880
  return estimatedNodeCounts, could_not_fit
742
881
 
743
- def updateClusterSize(self, estimatedNodeCounts: Dict[Shape, int]) -> Dict[Shape, int]:
882
+ def updateClusterSize(
883
+ self, estimatedNodeCounts: dict[Shape, int]
884
+ ) -> dict[Shape, int]:
744
885
  """
745
886
  Given the desired and current size of the cluster, attempts to launch/remove instances to get to the desired size.
746
887
 
@@ -752,21 +893,26 @@ class ClusterScaler:
752
893
  for nodeShape, estimatedNodeCount in estimatedNodeCounts.items():
753
894
  instance_type = self.nodeShapeToType[nodeShape]
754
895
 
755
- newNodeCount = self.setNodeCount(instance_type, estimatedNodeCount, preemptible=nodeShape.preemptible)
896
+ newNodeCount = self.setNodeCount(
897
+ instance_type, estimatedNodeCount, preemptible=nodeShape.preemptible
898
+ )
756
899
  # If we were scaling up a preemptible node type and failed to meet
757
900
  # our target, we will attempt to compensate for the deficit while scaling
758
901
  # non-preemptible nodes of this type.
759
902
  if nodeShape.preemptible:
760
903
  if newNodeCount < estimatedNodeCount:
761
904
  deficit = estimatedNodeCount - newNodeCount
762
- logger.debug('Preemptible scaler detected deficit of %d nodes of type %s.' % (deficit, instance_type))
905
+ logger.debug(
906
+ "Preemptible scaler detected deficit of %d nodes of type %s."
907
+ % (deficit, instance_type)
908
+ )
763
909
  self.preemptibleNodeDeficit[instance_type] = deficit
764
910
  else:
765
911
  self.preemptibleNodeDeficit[instance_type] = 0
766
912
  newNodeCounts[nodeShape] = newNodeCount
767
913
 
768
- #Attempt to terminate any nodes that we previously designated for
769
- #termination, but which still had workers running.
914
+ # Attempt to terminate any nodes that we previously designated for
915
+ # termination, but which still had workers running.
770
916
  self._terminateIgnoredNodes()
771
917
  return newNodeCounts
772
918
 
@@ -800,18 +946,29 @@ class ClusterScaler:
800
946
  actual cluster size at the time this method returns.
801
947
  """
802
948
  if not isinstance(self.leader.batchSystem, AbstractScalableBatchSystem):
803
- raise RuntimeError('Non-scalable batch system abusing a scalable-only function.')
949
+ raise RuntimeError(
950
+ "Non-scalable batch system abusing a scalable-only function."
951
+ )
804
952
  for attempt in old_retry(predicate=self.provisioner.retryPredicate):
805
953
  with attempt:
806
954
  nodes = self.getNodes(preemptible)
807
955
  logger.debug("Cluster contains %i instances" % len(nodes))
808
956
 
809
- nodes = {node: nodes[node] for node in nodes if node.nodeType == instance_type}
810
- ignoredNodes = [node for node in nodes if node.privateIP in self.ignoredNodes]
957
+ nodes = {
958
+ node: nodes[node]
959
+ for node in nodes
960
+ if node.nodeType == instance_type
961
+ }
962
+ ignoredNodes = [
963
+ node for node in nodes if node.privateIP in self.ignoredNodes
964
+ ]
811
965
  numIgnoredNodes = len(ignoredNodes)
812
966
  numCurrentNodes = len(nodes)
813
- logger.debug("Cluster contains %i instances of type %s (%i ignored and draining jobs until "
814
- "they can be safely terminated)" % (numCurrentNodes, instance_type, numIgnoredNodes))
967
+ logger.debug(
968
+ "Cluster contains %i instances of type %s (%i ignored and draining jobs until "
969
+ "they can be safely terminated)"
970
+ % (numCurrentNodes, instance_type, numIgnoredNodes)
971
+ )
815
972
  if not force:
816
973
  delta = numNodes - (numCurrentNodes - numIgnoredNodes)
817
974
  else:
@@ -819,38 +976,59 @@ class ClusterScaler:
819
976
  if delta > 0 and numIgnoredNodes > 0:
820
977
  # We can un-ignore a few nodes to compensate for the additional nodes we want.
821
978
  numNodesToUnignore = min(delta, numIgnoredNodes)
822
- logger.debug('Unignoring %i nodes because we want to scale back up again.' % numNodesToUnignore)
979
+ logger.debug(
980
+ "Unignoring %i nodes because we want to scale back up again."
981
+ % numNodesToUnignore
982
+ )
823
983
  delta -= numNodesToUnignore
824
984
 
825
985
  for node in ignoredNodes[:numNodesToUnignore]:
826
986
  self.ignoredNodes.remove(node.privateIP)
827
987
  self.leader.batchSystem.unignoreNode(node.privateIP)
828
988
  if delta > 0:
829
- logger.info('Adding %i %s nodes to get to desired cluster size of %i.',
830
- delta,
831
- 'preemptible' if preemptible else 'non-preemptible',
832
- numNodes)
833
- numNodes = numCurrentNodes + self._addNodes(instance_type, numNodes=delta,
834
- preemptible=preemptible)
989
+ logger.info(
990
+ "Adding %i %s nodes to get to desired cluster size of %i.",
991
+ delta,
992
+ "preemptible" if preemptible else "non-preemptible",
993
+ numNodes,
994
+ )
995
+ numNodes = numCurrentNodes + self._addNodes(
996
+ instance_type, numNodes=delta, preemptible=preemptible
997
+ )
835
998
  elif delta < 0:
836
- logger.info('Removing %i %s nodes to get to desired cluster size of %i.', -delta, 'preemptible' if preemptible else 'non-preemptible', numNodes)
837
- numNodes = numCurrentNodes - self._removeNodes(nodes,
838
- instance_type=instance_type,
839
- num_nodes=-delta,
840
- preemptible=preemptible,
841
- force=force)
999
+ logger.info(
1000
+ "Removing %i %s nodes to get to desired cluster size of %i.",
1001
+ -delta,
1002
+ "preemptible" if preemptible else "non-preemptible",
1003
+ numNodes,
1004
+ )
1005
+ numNodes = numCurrentNodes - self._removeNodes(
1006
+ nodes,
1007
+ instance_type=instance_type,
1008
+ num_nodes=-delta,
1009
+ preemptible=preemptible,
1010
+ force=force,
1011
+ )
842
1012
  elif force:
843
- logger.debug('Cluster already at desired size of %i. Nothing to do.', numNodes)
1013
+ logger.debug(
1014
+ "Cluster already at desired size of %i. Nothing to do.",
1015
+ numNodes,
1016
+ )
844
1017
  else:
845
- logger.debug('Cluster (minus ignored nodes) already at desired size of %i. Nothing to do.', numNodes)
1018
+ logger.debug(
1019
+ "Cluster (minus ignored nodes) already at desired size of %i. Nothing to do.",
1020
+ numNodes,
1021
+ )
846
1022
  return numNodes
847
1023
 
848
1024
  def _addNodes(self, instance_type: str, numNodes: int, preemptible: bool) -> int:
849
- return self.provisioner.addNodes(nodeTypes={instance_type}, numNodes=numNodes, preemptible=preemptible)
1025
+ return self.provisioner.addNodes(
1026
+ nodeTypes={instance_type}, numNodes=numNodes, preemptible=preemptible
1027
+ )
850
1028
 
851
1029
  def _removeNodes(
852
1030
  self,
853
- nodes: Dict["Node", NodeInfo],
1031
+ nodes: dict["Node", NodeInfo],
854
1032
  instance_type: str,
855
1033
  num_nodes: int,
856
1034
  preemptible: bool = False,
@@ -867,17 +1045,18 @@ class ClusterScaler:
867
1045
  nodes = self.getNodes(preemptible)
868
1046
  # Filter down to nodes of the correct node type
869
1047
 
870
- nodes = {node: nodes[node] for node in nodes if
871
- node.nodeType == instance_type}
1048
+ nodes = {
1049
+ node: nodes[node] for node in nodes if node.nodeType == instance_type
1050
+ }
872
1051
 
873
1052
  filtered_nodes = self.filter_out_static_nodes(nodes, preemptible)
874
1053
  filtered_nodes = filtered_nodes[:num_nodes]
875
1054
 
876
1055
  # Join nodes and instances on private IP address.
877
- logger.debug('Nodes considered to terminate: %s', ' '.join(map(str, nodes)))
1056
+ logger.debug("Nodes considered to terminate: %s", " ".join(map(str, nodes)))
878
1057
 
879
1058
  # Tell the batch system to stop sending jobs to these nodes
880
- for (node, nodeInfo) in filtered_nodes:
1059
+ for node, nodeInfo in filtered_nodes:
881
1060
  self.ignoredNodes.add(node.privateIP)
882
1061
  self.leader.batchSystem.ignoreNode(node.privateIP)
883
1062
 
@@ -886,8 +1065,11 @@ class ClusterScaler:
886
1065
  # will be terminated in _removeIgnoredNodes later on
887
1066
  # once all jobs have finished, but they will be ignored by
888
1067
  # the batch system and cluster scaler from now on
889
- filtered_nodes = [(node, nodeInfo) for (node, nodeInfo) in filtered_nodes if
890
- nodeInfo and nodeInfo.workers < 1]
1068
+ filtered_nodes = [
1069
+ (node, nodeInfo)
1070
+ for (node, nodeInfo) in filtered_nodes
1071
+ if nodeInfo and nodeInfo.workers < 1
1072
+ ]
891
1073
  nodes_to_terminate = [node for (node, nodeInfo) in filtered_nodes]
892
1074
  for node in nodes_to_terminate:
893
1075
  if node.privateIP in self.ignoredNodes:
@@ -895,10 +1077,12 @@ class ClusterScaler:
895
1077
  self.leader.batchSystem.unignoreNode(node.privateIP)
896
1078
  else:
897
1079
  # Without load info all we can do is sort instances by time left in billing cycle.
898
- nodes_to_terminate = sorted(nodes.keys(), key=lambda x: x.remainingBillingInterval())
1080
+ nodes_to_terminate = sorted(
1081
+ nodes.keys(), key=lambda x: x.remainingBillingInterval()
1082
+ )
899
1083
  nodes_to_terminate = nodes_to_terminate[:num_nodes]
900
1084
  number_terminated = len(nodes_to_terminate)
901
- logger.debug('Terminating %i instance(s).', number_terminated)
1085
+ logger.debug("Terminating %i instance(s).", number_terminated)
902
1086
  for node in nodes_to_terminate:
903
1087
  if node.privateIP in self.ignoredNodes:
904
1088
  # TODO: Why are we undoing what was just done above???
@@ -912,7 +1096,9 @@ class ClusterScaler:
912
1096
  but which still have workers running.
913
1097
  """
914
1098
  if not isinstance(self.leader.batchSystem, AbstractScalableBatchSystem):
915
- raise RuntimeError('Non-scalable batch system abusing a scalable-only function.')
1099
+ raise RuntimeError(
1100
+ "Non-scalable batch system abusing a scalable-only function."
1101
+ )
916
1102
 
917
1103
  # start with a dictionary of all nodes and filter down
918
1104
  nodes = self.getNodes()
@@ -926,10 +1112,18 @@ class ClusterScaler:
926
1112
  self.ignoredNodes.remove(ip)
927
1113
  self.leader.batchSystem.unignoreNode(ip)
928
1114
 
929
- logger.debug("There are %i nodes being ignored by the batch system, "
930
- "checking if they can be terminated" % len(self.ignoredNodes))
931
- nodes = {node: info for node, info in nodes.items() if node.privateIP in self.ignoredNodes}
932
- nodes = {node: info for node, info in nodes.items() if info and info.workers < 1}
1115
+ logger.debug(
1116
+ "There are %i nodes being ignored by the batch system, "
1117
+ "checking if they can be terminated" % len(self.ignoredNodes)
1118
+ )
1119
+ nodes = {
1120
+ node: info
1121
+ for node, info in nodes.items()
1122
+ if node.privateIP in self.ignoredNodes
1123
+ }
1124
+ nodes = {
1125
+ node: info for node, info in nodes.items() if info and info.workers < 1
1126
+ }
933
1127
  nodes_to_terminate = list(nodes.keys())
934
1128
 
935
1129
  for node in nodes_to_terminate:
@@ -938,25 +1132,32 @@ class ClusterScaler:
938
1132
  self.provisioner.terminateNodes(nodes_to_terminate)
939
1133
 
940
1134
  def filter_out_static_nodes(
941
- self,
942
- nodes: Dict["Node", NodeInfo],
943
- preemptible: bool = False) -> List[Tuple["Node", NodeInfo]]:
1135
+ self, nodes: dict["Node", NodeInfo], preemptible: bool = False
1136
+ ) -> list[tuple["Node", NodeInfo]]:
944
1137
  filtered_nodes = []
945
1138
  for node, nodeInfo in nodes.items():
946
1139
  if node:
947
- non = 'non-' if not preemptible else ''
1140
+ non = "non-" if not preemptible else ""
948
1141
  if node.privateIP in self.getStaticNodes(preemptible):
949
1142
  # we don't want to automatically terminate any statically provisioned nodes
950
- logger.debug(f'Found {node.privateIP} in {non}preemptible static nodes')
1143
+ logger.debug(
1144
+ f"Found {node.privateIP} in {non}preemptible static nodes"
1145
+ )
951
1146
  else:
952
- logger.debug(f'Did not find {node.privateIP} in {non}preemptible static nodes')
1147
+ logger.debug(
1148
+ f"Did not find {node.privateIP} in {non}preemptible static nodes"
1149
+ )
953
1150
  filtered_nodes.append((node, nodeInfo))
954
1151
  # Sort nodes by number of workers and time left in billing cycle
955
- filtered_nodes.sort(key=lambda node_nodeInfo: (
956
- node_nodeInfo[1].workers if node_nodeInfo[1] else 1, node_nodeInfo[0].remainingBillingInterval()))
1152
+ filtered_nodes.sort(
1153
+ key=lambda node_nodeInfo: (
1154
+ node_nodeInfo[1].workers if node_nodeInfo[1] else 1,
1155
+ node_nodeInfo[0].remainingBillingInterval(),
1156
+ )
1157
+ )
957
1158
  return filtered_nodes
958
1159
 
959
- def getNodes(self, preemptible: Optional[bool] = None) -> Dict["Node", NodeInfo]:
1160
+ def getNodes(self, preemptible: Optional[bool] = None) -> dict["Node", NodeInfo]:
960
1161
  """
961
1162
  Returns a dictionary mapping node identifiers of preemptible or non-preemptible nodes to
962
1163
  NodeInfo objects, one for each node.
@@ -968,25 +1169,31 @@ class ClusterScaler:
968
1169
  If None, all nodes will be returned.
969
1170
  """
970
1171
  if not isinstance(self.leader.batchSystem, AbstractScalableBatchSystem):
971
- raise RuntimeError('Non-scalable batch system abusing a scalable-only function.')
1172
+ raise RuntimeError(
1173
+ "Non-scalable batch system abusing a scalable-only function."
1174
+ )
972
1175
  # nodes seen within the last 600 seconds (10 minutes)
973
1176
  recent_nodes = self.leader.batchSystem.getNodes(preemptible, timeout=600)
974
1177
  # all available nodes
975
1178
  all_nodes = self.leader.batchSystem.getNodes(preemptible)
976
1179
  # nodes that are supposedly doing something
977
- provisioned_nodes = self.provisioner.getProvisionedWorkers(preemptible=preemptible)
1180
+ provisioned_nodes = self.provisioner.getProvisionedWorkers(
1181
+ preemptible=preemptible
1182
+ )
978
1183
 
979
1184
  if len(recent_nodes) != len(provisioned_nodes):
980
1185
  logger.debug("Consolidating state between mesos and provisioner")
981
1186
 
982
- nodeToInfo: Dict["Node", NodeInfo] = {}
1187
+ nodeToInfo: dict["Node", NodeInfo] = {}
983
1188
  # fixme: what happens if awsFilterImpairedNodes is used?
984
1189
  # if this assertion is false it means that user-managed nodes are being
985
1190
  # used that are outside the provisioner's control
986
1191
  # this would violate many basic assumptions in autoscaling so it currently not allowed
987
1192
  for node, ip in ((node, node.privateIP) for node in provisioned_nodes):
988
1193
  if ip not in recent_nodes:
989
- logger.debug("Worker node at %s is not reporting executor information", ip)
1194
+ logger.debug(
1195
+ "Worker node at %s is not reporting executor information", ip
1196
+ )
990
1197
 
991
1198
  # get up-to-date information about the node, if available
992
1199
  info = all_nodes.get(ip)
@@ -1009,9 +1216,15 @@ class ClusterScaler:
  #
  # In all 3 situations it's safe to fake executor info with 0 workers,
  # since in all cases there are no workers running.
- info = NodeInfo(coresTotal=1, coresUsed=0, requestedCores=0,
- memoryTotal=1, memoryUsed=0, requestedMemory=0,
- workers=0)
+ info = NodeInfo(
+ coresTotal=1,
+ coresUsed=0,
+ requestedCores=0,
+ memoryTotal=1,
+ memoryUsed=0,
+ requestedMemory=0,
+ workers=0,
+ )
  else:
  # mesos knows about the ip & we have up-to-date information - easy!
  info = recent_nodes[ip]
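
The two hunks above boil down to a fallback: prefer fresh executor data for a node, and otherwise fake a NodeInfo with zero workers so the scaler can still reason about it. The sketch below simplifies the diffed logic to a single lookup and uses SimpleNodeInfo as a hypothetical stand-in for Toil's NodeInfo.

    # Simplified sketch of the getNodes() fallback; stand-in type only.
    from dataclasses import dataclass


    @dataclass
    class SimpleNodeInfo:
        coresTotal: int
        coresUsed: int
        requestedCores: int
        memoryTotal: int
        memoryUsed: int
        requestedMemory: int
        workers: int


    def info_for(ip: str, recent: dict[str, SimpleNodeInfo]) -> SimpleNodeInfo:
        # Prefer fresh data; otherwise fake an idle node with zero workers,
        # as the diffed code does when no executor information is reported.
        if ip in recent:
            return recent[ip]
        return SimpleNodeInfo(
            coresTotal=1, coresUsed=0, requestedCores=0,
            memoryTotal=1, memoryUsed=0, requestedMemory=0,
            workers=0,
        )


    if __name__ == "__main__":
        recent = {"10.0.0.9": SimpleNodeInfo(8, 4, 4, 32, 16, 16, workers=2)}
        print(info_for("10.0.0.9", recent).workers)   # -> 2
        print(info_for("10.0.0.10", recent).workers)  # -> 0 (placeholder)
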
@@ -1020,40 +1233,55 @@ class ClusterScaler:
  return nodeToInfo
 
  def shutDown(self) -> None:
- logger.debug('Forcing provisioner to reduce cluster size to zero.')
+ logger.debug("Forcing provisioner to reduce cluster size to zero.")
  for nodeShape in self.nodeShapes:
  preemptible = nodeShape.preemptible
  instance_type = self.nodeShapeToType[nodeShape]
- self.setNodeCount(instance_type=instance_type, numNodes=0, preemptible=preemptible, force=True)
+ self.setNodeCount(
+ instance_type=instance_type,
+ numNodes=0,
+ preemptible=preemptible,
+ force=True,
+ )
+
 
  class JobTooBigError(Exception):
  """
  Raised in the scaler thread when a job cannot fit in any available node
  type and is likely to lock up the workflow.
  """
-
- def __init__(self, job: Optional[JobDescription] = None, shape: Optional[Shape] = None, constraints: Optional[List[FailedConstraint]] = None):
+
+ def __init__(
+ self,
+ job: Optional[JobDescription] = None,
+ shape: Optional[Shape] = None,
+ constraints: Optional[list[FailedConstraint]] = None,
+ ):
  """
  Make a JobTooBigError.
-
+
  Can have a job, the job's shape, and the limiting resources and amounts. All are optional.
  """
  self.job = job
  self.shape = shape
  self.constraints = constraints if constraints is not None else []
-
+
  parts = [
  f"The job {self.job}" if self.job else "A job",
  f" with shape {self.shape}" if self.shape else "",
- " is too big for any available node type."
+ " is too big for any available node type.",
  ]
-
+
  if self.constraints:
  parts.append(" It could have fit if it only needed ")
- parts.append(", ".join([f"{limit} {resource}" for resource, limit in self.constraints]))
- parts.append(".")
-
- self.msg = ''.join(parts)
+ parts.append(
+ ", ".join(
+ [f"{limit} {resource}" for resource, limit in self.constraints]
+ )
+ )
+ parts.append(".")
+
+ self.msg = "".join(parts)
  super().__init__()
 
  def __str__(self) -> str:
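
The reflowed JobTooBigError.__init__ above assembles its message from optional parts plus a list of failed constraints. The standalone sketch below shows the same assembly; representing each constraint as a plain (resource, limit) tuple is an assumption for illustration, since Toil uses its own FailedConstraint type.

    # Sketch of the JobTooBigError message assembly; (resource, limit) tuples
    # stand in for Toil's FailedConstraint objects.
    from typing import Optional


    def build_message(
        job: Optional[str] = None,
        shape: Optional[str] = None,
        constraints: Optional[list[tuple[str, int]]] = None,
    ) -> str:
        constraints = constraints if constraints is not None else []
        parts = [
            f"The job {job}" if job else "A job",
            f" with shape {shape}" if shape else "",
            " is too big for any available node type.",
        ]
        if constraints:
            parts.append(" It could have fit if it only needed ")
            parts.append(
                ", ".join(f"{limit} {resource}" for resource, limit in constraints)
            )
            parts.append(".")
        return "".join(parts)


    if __name__ == "__main__":
        print(build_message(job="sort_reads", constraints=[("memory", 8), ("cores", 4)]))
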
@@ -1062,6 +1290,7 @@ class JobTooBigError(Exception):
  """
  return self.msg
 
+
  class ScalerThread(ExceptionalThread):
  """
  A thread that automatically scales the number of either preemptible or non-preemptible worker
@@ -1077,10 +1306,17 @@ class ScalerThread(ExceptionalThread):
  is made, else the size of the cluster is adapted. The beta factor is an inertia parameter
  that prevents continual fluctuations in the number of nodes.
  """
- def __init__(self, provisioner: AbstractProvisioner, leader: "Leader", config: Config, stop_on_exception: bool = False) -> None:
- super().__init__(name='scaler')
+
+ def __init__(
+ self,
+ provisioner: AbstractProvisioner,
+ leader: "Leader",
+ config: Config,
+ stop_on_exception: bool = False,
+ ) -> None:
+ super().__init__(name="scaler")
  self.scaler = ClusterScaler(provisioner, leader, config)
-
+
  # Indicates that the scaling thread should shutdown
  self.stop = False
  # Indicates that we should stop the thread if we encounter an error.
@@ -1090,13 +1326,13 @@ class ScalerThread(ExceptionalThread):
  self.stats = None
  if config.clusterStats:
  logger.debug("Starting up cluster statistics...")
- self.stats = ClusterStats(leader.config.clusterStats,
- leader.batchSystem,
- provisioner.clusterName)
+ self.stats = ClusterStats(
+ leader.config.clusterStats, leader.batchSystem, provisioner.clusterName
+ )
  for preemptible in [True, False]:
  self.stats.startStats(preemptible=preemptible)
  logger.debug("...Cluster stats started.")
-
+
  def check(self) -> None:
  """
  Attempt to join any existing scaler threads that may have died or finished.
@@ -1121,20 +1357,27 @@ class ScalerThread(ExceptionalThread):
 
  def tryRun(self) -> None:
  if self.scaler.leader.provisioner is None:
- raise RuntimeError('No provisioner found for a scaling cluster '
- '(cannot access "getProvisionedWorkers").')
+ raise RuntimeError(
+ "No provisioner found for a scaling cluster "
+ '(cannot access "getProvisionedWorkers").'
+ )
  while not self.stop:
  with throttle(self.scaler.config.scaleInterval):
  try:
  queuedJobs = self.scaler.leader.getJobs()
  queuedJobShapes = [
- Shape(wallTime=self.scaler.getAverageRuntime(
- jobName=job.jobName,
- service=isinstance(job, ServiceJobDescription)),
+ Shape(
+ wallTime=self.scaler.getAverageRuntime(
+ jobName=job.jobName,
+ service=isinstance(job, ServiceJobDescription),
+ ),
  memory=job.memory,
  cores=job.cores,
  disk=job.disk,
- preemptible=job.preemptible) for job in queuedJobs]
+ preemptible=job.preemptible,
+ )
+ for job in queuedJobs
+ ]
  currentNodeCounts = {}
  for nodeShape in self.scaler.nodeShapes:
  instance_type = self.scaler.nodeShapeToType[nodeShape]
@@ -1144,14 +1387,16 @@ class ScalerThread(ExceptionalThread):
  preemptible=nodeShape.preemptible,
  )
  )
- estimatedNodeCounts, could_not_fit = self.scaler.getEstimatedNodeCounts(
- queuedJobShapes, currentNodeCounts
+ estimatedNodeCounts, could_not_fit = (
+ self.scaler.getEstimatedNodeCounts(
+ queuedJobShapes, currentNodeCounts
+ )
  )
  self.scaler.updateClusterSize(estimatedNodeCounts)
  if self.stats:
  self.stats.checkStats()
-
- if len(could_not_fit) != 0:
+
+ if len(could_not_fit) != 0:
  # If we have any jobs left over that we couldn't fit, complain.
  bad_job: Optional[JobDescription] = None
  bad_shape: Optional[Shape] = None
@@ -1164,39 +1409,49 @@ class ScalerThread(ExceptionalThread):
 
  if bad_shape is None:
  # If we can't find an offending job, grab an arbitrary offending shape.
  bad_shape = next(iter(could_not_fit))
-
- raise JobTooBigError(job=bad_job, shape=bad_shape, constraints=could_not_fit[bad_shape])
-
+
+ raise JobTooBigError(
+ job=bad_job,
+ shape=bad_shape,
+ constraints=could_not_fit[bad_shape],
+ )
+
  except:
  if self.stop_on_exception:
  logger.critical("Stopping ScalerThread due to an error.")
  raise
  else:
- logger.exception("Exception encountered in scaler thread. Making a best-effort "
- "attempt to keep going, but things may go wrong from now on.")
+ logger.exception(
+ "Exception encountered in scaler thread. Making a best-effort "
+ "attempt to keep going, but things may go wrong from now on."
+ )
  self.scaler.shutDown()
 
+
  class ClusterStats:
  def __init__(
  self, path: str, batchSystem: AbstractBatchSystem, clusterName: Optional[str]
  ) -> None:
  logger.debug("Initializing cluster statistics")
- self.stats: Dict[str, Dict[str, List[Dict[str, Any]]]] = {}
- self.statsThreads: List[ExceptionalThread] = []
+ self.stats: dict[str, dict[str, list[dict[str, Any]]]] = {}
+ self.statsThreads: list[ExceptionalThread] = []
  self.statsPath = path
  self.stop = False
  self.clusterName = clusterName
  self.batchSystem = batchSystem
- self.scaleable = isinstance(self.batchSystem, AbstractScalableBatchSystem) \
- if batchSystem else False
+ self.scaleable = (
+ isinstance(self.batchSystem, AbstractScalableBatchSystem)
+ if batchSystem
+ else False
+ )
 
  def shutDownStats(self) -> None:
  if self.stop:
  return
 
  def getFileName() -> str:
- extension = '.json'
- file = '%s-stats' % self.clusterName
+ extension = ".json"
+ file = "%s-stats" % self.clusterName
  counter = 0
  while True:
  suffix = str(counter).zfill(3) + extension
@@ -1204,12 +1459,13 @@ class ClusterStats:
  if not os.path.exists(fullName):
  return fullName
  counter += 1
+
  if self.statsPath and self.scaleable:
  self.stop = True
  for thread in self.statsThreads:
  thread.join()
  fileName = getFileName()
- with open(fileName, 'w') as f:
+ with open(fileName, "w") as f:
  json.dump(self.stats, f)
 
  def startStats(self, preemptible: bool) -> None:
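
The getFileName helper touched across the two hunks above picks a stats file name that does not collide with an existing file: a zero-padded counter is appended until an unused path is found. The sketch below reproduces that loop; only the counter/zfill/exists logic is taken from the diff, and how the directory, base name, and suffix are joined is an assumption here.

    # Sketch of the collision-avoiding stats file naming; path joining assumed.
    import os


    def next_stats_file(stats_dir: str, cluster_name: str) -> str:
        extension = ".json"
        base = "%s-stats" % cluster_name
        counter = 0
        while True:
            suffix = str(counter).zfill(3) + extension
            full_name = os.path.join(stats_dir, base + suffix)  # assumed join
            if not os.path.exists(full_name):
                return full_name
            counter += 1


    if __name__ == "__main__":
        print(next_stats_file(".", "my-cluster"))
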
@@ -1223,22 +1479,26 @@ class ClusterStats:
  thread.join(timeout=0)
 
  def _gatherStats(self, preemptible: bool) -> None:
- def toDict(nodeInfo: NodeInfo) -> Dict[str, Any]:
+ def toDict(nodeInfo: NodeInfo) -> dict[str, Any]:
  # convert NodeInfo object to dict to improve JSON output
- return dict(memory=nodeInfo.memoryUsed,
- cores=nodeInfo.coresUsed,
- memoryTotal=nodeInfo.memoryTotal,
- coresTotal=nodeInfo.coresTotal,
- requestedCores=nodeInfo.requestedCores,
- requestedMemory=nodeInfo.requestedMemory,
- workers=nodeInfo.workers,
- time=time.time() # add time stamp
- )
+ return dict(
+ memory=nodeInfo.memoryUsed,
+ cores=nodeInfo.coresUsed,
+ memoryTotal=nodeInfo.memoryTotal,
+ coresTotal=nodeInfo.coresTotal,
+ requestedCores=nodeInfo.requestedCores,
+ requestedMemory=nodeInfo.requestedMemory,
+ workers=nodeInfo.workers,
+ time=time.time(), # add time stamp
+ )
+
  if self.scaleable:
  logger.debug("Starting to gather statistics")
- stats: Dict[str, List[Dict[str, Any]]] = {}
+ stats: dict[str, list[dict[str, Any]]] = {}
  if not isinstance(self.batchSystem, AbstractScalableBatchSystem):
- raise RuntimeError('Non-scalable batch system abusing a scalable-only function.')
+ raise RuntimeError(
+ "Non-scalable batch system abusing a scalable-only function."
+ )
  try:
  while not self.stop:
  nodeInfo = self.batchSystem.getNodes(preemptible)
@@ -1255,6 +1515,8 @@ class ClusterStats:
  stats[nodeIP] = [nodeStatsDict]
  time.sleep(60)
  finally:
- threadName = 'Preemptible' if preemptible else 'Non-preemptible'
- logger.debug('%s provisioner stats thread shut down successfully.', threadName)
+ threadName = "Preemptible" if preemptible else "Non-preemptible"
+ logger.debug(
+ "%s provisioner stats thread shut down successfully.", threadName
+ )
  self.stats[threadName] = stats
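
The _gatherStats hunks above poll the batch system, convert each NodeInfo to a timestamped dict via toDict(), and keep a per-IP history. A minimal sketch of that accumulation follows; the append-when-already-seen branch is assumed from context, since only the timestamped dict and the "new IP starts a fresh list" branch appear verbatim in the diff.

    # Sketch of the per-node stats accumulation; the append branch is assumed.
    import time
    from typing import Any


    def record_sample(
        stats: dict[str, list[dict[str, Any]]], node_ip: str, sample: dict[str, Any]
    ) -> None:
        sample = dict(sample, time=time.time())  # add a time stamp, as toDict() does
        if node_ip in stats:
            stats[node_ip].append(sample)  # assumed branch
        else:
            stats[node_ip] = [sample]


    if __name__ == "__main__":
        history: dict[str, list[dict[str, Any]]] = {}
        record_sample(history, "10.0.0.5", {"cores": 2, "memory": 4_000_000_000})
        record_sample(history, "10.0.0.5", {"cores": 1, "memory": 2_000_000_000})
        print(len(history["10.0.0.5"]))  # -> 2
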