toil-6.1.0a1-py3-none-any.whl → toil-8.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
|
@@ -18,27 +18,21 @@ import math
|
|
|
18
18
|
import os
|
|
19
19
|
import time
|
|
20
20
|
from collections import defaultdict
|
|
21
|
-
from typing import
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
Tuple,
|
|
29
|
-
Union)
|
|
30
|
-
|
|
31
|
-
from toil.batchSystems.abstractBatchSystem import (AbstractBatchSystem,
|
|
32
|
-
AbstractScalableBatchSystem,
|
|
33
|
-
NodeInfo)
|
|
21
|
+
from typing import TYPE_CHECKING, Any, Callable, Optional, Union
|
|
22
|
+
|
|
23
|
+
from toil.batchSystems.abstractBatchSystem import (
|
|
24
|
+
AbstractBatchSystem,
|
|
25
|
+
AbstractScalableBatchSystem,
|
|
26
|
+
NodeInfo,
|
|
27
|
+
)
|
|
34
28
|
from toil.bus import ClusterDesiredSizeMessage, ClusterSizeMessage
|
|
35
29
|
from toil.common import Config
|
|
36
|
-
from toil.options.common import defaultTargetTime
|
|
37
30
|
from toil.job import JobDescription, ServiceJobDescription
|
|
38
31
|
from toil.lib.conversions import bytes2human, human2bytes
|
|
39
32
|
from toil.lib.retry import old_retry
|
|
40
33
|
from toil.lib.threading import ExceptionalThread
|
|
41
34
|
from toil.lib.throttle import throttle
|
|
35
|
+
from toil.options.common import defaultTargetTime
|
|
42
36
|
from toil.provisioners.abstractProvisioner import AbstractProvisioner, Shape
|
|
43
37
|
|
|
44
38
|
if TYPE_CHECKING:
|
|
@@ -48,18 +42,25 @@ if TYPE_CHECKING:
|
|
|
48
42
|
logger = logging.getLogger(__name__)
|
|
49
43
|
|
|
50
44
|
# Properties of GKE's memory overhead algorithm
|
|
51
|
-
EVICTION_THRESHOLD = human2bytes(
|
|
52
|
-
RESERVE_SMALL_LIMIT = human2bytes(
|
|
53
|
-
RESERVE_SMALL_AMOUNT = human2bytes(
|
|
54
|
-
RESERVE_BREAKPOINTS:
|
|
45
|
+
EVICTION_THRESHOLD = human2bytes("100MiB")
|
|
46
|
+
RESERVE_SMALL_LIMIT = human2bytes("1GiB")
|
|
47
|
+
RESERVE_SMALL_AMOUNT = human2bytes("255MiB")
|
|
48
|
+
RESERVE_BREAKPOINTS: list[Union[int, float]] = [
|
|
49
|
+
human2bytes("4GiB"),
|
|
50
|
+
human2bytes("8GiB"),
|
|
51
|
+
human2bytes("16GiB"),
|
|
52
|
+
human2bytes("128GiB"),
|
|
53
|
+
math.inf,
|
|
54
|
+
]
|
|
55
55
|
RESERVE_FRACTIONS = [0.25, 0.2, 0.1, 0.06, 0.02]
|
|
56
56
|
|
|
57
57
|
# Guess of how much disk space on the root volume is used for the OS and essential container images
|
|
58
|
-
OS_SIZE = human2bytes(
|
|
58
|
+
OS_SIZE = human2bytes("5G")
|
|
59
59
|
|
|
60
60
|
# Define a type for an explanation of why a job can't fit on a node.
|
|
61
61
|
# Consists of a resource name and a constraining value for that resource.
|
|
62
|
-
FailedConstraint =
|
|
62
|
+
FailedConstraint = tuple[str, Union[int, float, bool]]
|
|
63
|
+
|
|
63
64
|
|
|
64
65
|
class BinPackedFit:
|
|
65
66
|
"""
|
|
@@ -80,24 +81,30 @@ class BinPackedFit:
|
|
|
80
81
|
:returns: The minimum number of minimal node allocations estimated to be required to run all
|
|
81
82
|
the jobs in jobShapes.
|
|
82
83
|
"""
|
|
83
|
-
nodeReservations: Dict[Shape, List['NodeReservation']]
|
|
84
84
|
|
|
85
|
-
|
|
85
|
+
nodeReservations: dict[Shape, list["NodeReservation"]]
|
|
86
|
+
|
|
87
|
+
def __init__(
|
|
88
|
+
self, nodeShapes: list[Shape], targetTime: float = defaultTargetTime
|
|
89
|
+
) -> None:
|
|
86
90
|
self.nodeShapes = sorted(nodeShapes)
|
|
87
91
|
self.targetTime = targetTime
|
|
88
92
|
self.nodeReservations = {nodeShape: [] for nodeShape in nodeShapes}
|
|
89
93
|
|
|
90
|
-
def binPack(self, jobShapes:
|
|
94
|
+
def binPack(self, jobShapes: list[Shape]) -> dict[Shape, list[FailedConstraint]]:
|
|
91
95
|
"""
|
|
92
96
|
Pack a list of jobShapes into the fewest nodes reasonable.
|
|
93
|
-
|
|
97
|
+
|
|
94
98
|
Can be run multiple times.
|
|
95
|
-
|
|
99
|
+
|
|
96
100
|
Returns any distinct Shapes that did not fit, mapping to reasons they did not fit.
|
|
97
101
|
"""
|
|
98
102
|
# TODO: Check for redundancy with batchsystems.mesos.JobQueue() sorting
|
|
99
|
-
logger.debug(
|
|
100
|
-
|
|
103
|
+
logger.debug(
|
|
104
|
+
"Running bin packing for node shapes %s and %s job(s).",
|
|
105
|
+
self.nodeShapes,
|
|
106
|
+
len(jobShapes),
|
|
107
|
+
)
|
|
101
108
|
# Sort in descending order from largest to smallest. The FFD like-strategy will pack the
|
|
102
109
|
# jobs in order from longest to shortest.
|
|
103
110
|
jobShapes.sort()
|
|
@@ -111,11 +118,13 @@ class BinPackedFit:
|
|
|
111
118
|
could_not_fit[rejection[0]] = rejection[1]
|
|
112
119
|
return could_not_fit
|
|
113
120
|
|
|
114
|
-
def addJobShape(
|
|
121
|
+
def addJobShape(
|
|
122
|
+
self, jobShape: Shape
|
|
123
|
+
) -> Optional[tuple[Shape, list[FailedConstraint]]]:
|
|
115
124
|
"""
|
|
116
125
|
Add the job to the first node reservation in which it will fit. (This
|
|
117
126
|
is the bin-packing aspect).
|
|
118
|
-
|
|
127
|
+
|
|
119
128
|
Returns the job shape again, and a list of failed constraints, if it did not fit.
|
|
120
129
|
"""
|
|
121
130
|
chosenNodeShape = None
|
|
@@ -126,24 +135,33 @@ class BinPackedFit:
|
|
|
126
135
|
break
|
|
127
136
|
|
|
128
137
|
if chosenNodeShape is None:
|
|
129
|
-
logger.debug(
|
|
130
|
-
|
|
138
|
+
logger.debug(
|
|
139
|
+
"Couldn't fit job with requirements %s into any nodes in the nodeTypes "
|
|
140
|
+
"list.",
|
|
141
|
+
jobShape,
|
|
142
|
+
)
|
|
131
143
|
# Go back and debug why this happened.
|
|
132
|
-
fewest_constraints: Optional[
|
|
144
|
+
fewest_constraints: Optional[list[FailedConstraint]] = None
|
|
133
145
|
for shape in self.nodeShapes:
|
|
134
146
|
failures = NodeReservation(nodeShape).get_failed_constraints(jobShape)
|
|
135
|
-
if fewest_constraints is None or len(failures) < len(
|
|
147
|
+
if fewest_constraints is None or len(failures) < len(
|
|
148
|
+
fewest_constraints
|
|
149
|
+
):
|
|
136
150
|
# This was closer to fitting.
|
|
137
151
|
# TODO: Check the actual constraint values so we don't tell
|
|
138
152
|
# the user to raise the memory on the smallest machine?
|
|
139
153
|
fewest_constraints = failures
|
|
140
|
-
|
|
141
|
-
return jobShape,
|
|
154
|
+
|
|
155
|
+
return jobShape, (
|
|
156
|
+
fewest_constraints if fewest_constraints is not None else []
|
|
157
|
+
)
|
|
142
158
|
|
|
143
159
|
# grab current list of job objects appended to this instance type
|
|
144
160
|
nodeReservations = self.nodeReservations[chosenNodeShape]
|
|
145
161
|
for nodeReservation in nodeReservations:
|
|
146
|
-
if nodeReservation.attemptToAddJob(
|
|
162
|
+
if nodeReservation.attemptToAddJob(
|
|
163
|
+
jobShape, chosenNodeShape, self.targetTime
|
|
164
|
+
):
|
|
147
165
|
# We succeeded adding the job to this node reservation. Now we're done.
|
|
148
166
|
return None
|
|
149
167
|
|
|
@@ -160,7 +178,7 @@ class BinPackedFit:
|
|
|
160
178
|
reservation = extendThisReservation
|
|
161
179
|
return None
|
|
162
180
|
|
|
163
|
-
def getRequiredNodes(self) ->
|
|
181
|
+
def getRequiredNodes(self) -> dict[Shape, int]:
|
|
164
182
|
"""Return a dict from node shape to number of nodes required to run the packed jobs."""
|
|
165
183
|
return {
|
|
166
184
|
nodeShape: len(self.nodeReservations[nodeShape])
|
|
@@ -184,48 +202,72 @@ class NodeReservation:
|
|
|
184
202
|
self.nReservation: Optional[NodeReservation] = None
|
|
185
203
|
|
|
186
204
|
def __str__(self) -> str:
|
|
187
|
-
return
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
+
return (
|
|
206
|
+
"-------------------\n"
|
|
207
|
+
"Current Reservation\n"
|
|
208
|
+
"-------------------\n"
|
|
209
|
+
"Shape wallTime: %s\n"
|
|
210
|
+
"Shape memory: %s\n"
|
|
211
|
+
"Shape cores: %s\n"
|
|
212
|
+
"Shape disk: %s\n"
|
|
213
|
+
"Shape preempt: %s\n"
|
|
214
|
+
"\n"
|
|
215
|
+
"nReserv wallTime: %s\n"
|
|
216
|
+
"nReserv memory: %s\n"
|
|
217
|
+
"nReserv cores: %s\n"
|
|
218
|
+
"nReserv disk: %s\n"
|
|
219
|
+
"nReserv preempt: %s\n"
|
|
220
|
+
"\n"
|
|
221
|
+
"Time slices: %s\n"
|
|
222
|
+
"\n"
|
|
223
|
+
% (
|
|
224
|
+
self.shape.wallTime,
|
|
205
225
|
self.shape.memory,
|
|
206
226
|
self.shape.cores,
|
|
207
227
|
self.shape.disk,
|
|
208
228
|
self.shape.preemptible,
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
229
|
+
(
|
|
230
|
+
self.nReservation.shape.wallTime
|
|
231
|
+
if self.nReservation is not None
|
|
232
|
+
else str(None)
|
|
233
|
+
),
|
|
234
|
+
(
|
|
235
|
+
self.nReservation.shape.memory
|
|
236
|
+
if self.nReservation is not None
|
|
237
|
+
else str(None)
|
|
238
|
+
),
|
|
239
|
+
(
|
|
240
|
+
self.nReservation.shape.cores
|
|
241
|
+
if self.nReservation is not None
|
|
242
|
+
else str(None)
|
|
243
|
+
),
|
|
244
|
+
(
|
|
245
|
+
self.nReservation.shape.disk
|
|
246
|
+
if self.nReservation is not None
|
|
247
|
+
else str(None)
|
|
248
|
+
),
|
|
249
|
+
(
|
|
250
|
+
self.nReservation.shape.preemptible
|
|
251
|
+
if self.nReservation is not None
|
|
252
|
+
else str(None)
|
|
253
|
+
),
|
|
254
|
+
str(len(self.shapes())),
|
|
255
|
+
)
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
def get_failed_constraints(self, job_shape: Shape) -> list[FailedConstraint]:
|
|
217
259
|
"""
|
|
218
260
|
Check if a job shape's resource requirements will fit within this allocation.
|
|
219
|
-
|
|
261
|
+
|
|
220
262
|
If the job does *not* fit, returns the failing constraints: the resources
|
|
221
263
|
that can't be accomodated, and the limits that were hit.
|
|
222
|
-
|
|
264
|
+
|
|
223
265
|
If the job *does* fit, returns an empty list.
|
|
224
|
-
|
|
266
|
+
|
|
225
267
|
Must always agree with fits()! This codepath is slower and used for diagnosis.
|
|
226
268
|
"""
|
|
227
|
-
|
|
228
|
-
failures:
|
|
269
|
+
|
|
270
|
+
failures: list[FailedConstraint] = []
|
|
229
271
|
if job_shape.memory > self.shape.memory:
|
|
230
272
|
failures.append(("memory", self.shape.memory))
|
|
231
273
|
if job_shape.cores > self.shape.cores:
|
|
@@ -235,15 +277,17 @@ class NodeReservation:
|
|
|
235
277
|
if not job_shape.preemptible and self.shape.preemptible:
|
|
236
278
|
failures.append(("preemptible", self.shape.preemptible))
|
|
237
279
|
return failures
|
|
238
|
-
|
|
280
|
+
|
|
239
281
|
def fits(self, jobShape: Shape) -> bool:
|
|
240
282
|
"""Check if a job shape's resource requirements will fit within this allocation."""
|
|
241
|
-
return
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
283
|
+
return (
|
|
284
|
+
jobShape.memory <= self.shape.memory
|
|
285
|
+
and jobShape.cores <= self.shape.cores
|
|
286
|
+
and jobShape.disk <= self.shape.disk
|
|
287
|
+
and (jobShape.preemptible or not self.shape.preemptible)
|
|
288
|
+
)
|
|
245
289
|
|
|
246
|
-
def shapes(self) ->
|
|
290
|
+
def shapes(self) -> list[Shape]:
|
|
247
291
|
"""Get all time-slice shapes, in order, from this reservation on."""
|
|
248
292
|
shapes = []
|
|
249
293
|
curRes: Optional[NodeReservation] = self
|
|
@@ -254,11 +298,13 @@ class NodeReservation:
|
|
|
254
298
|
|
|
255
299
|
def subtract(self, jobShape: Shape) -> None:
|
|
256
300
|
"""Subtract the resources necessary to run a jobShape from the reservation."""
|
|
257
|
-
self.shape = Shape(
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
301
|
+
self.shape = Shape(
|
|
302
|
+
self.shape.wallTime,
|
|
303
|
+
self.shape.memory - jobShape.memory,
|
|
304
|
+
self.shape.cores - jobShape.cores,
|
|
305
|
+
self.shape.disk - jobShape.disk,
|
|
306
|
+
self.shape.preemptible,
|
|
307
|
+
)
|
|
262
308
|
|
|
263
309
|
def attemptToAddJob(
|
|
264
310
|
self, jobShape: Shape, nodeShape: Shape, targetTime: float
|
|
@@ -286,27 +332,42 @@ class NodeReservation:
|
|
|
286
332
|
# does the job time fit in the reservation's remaining time?
|
|
287
333
|
if availableTime >= jobShape.wallTime:
|
|
288
334
|
timeSlice: float = 0
|
|
289
|
-
while
|
|
335
|
+
while startingReservation != endingReservation:
|
|
290
336
|
# removes resources only (NO time) from startingReservation
|
|
291
337
|
startingReservation.subtract(jobShape) # type: ignore
|
|
292
338
|
# set aside the timeSlice
|
|
293
339
|
timeSlice += startingReservation.shape.wallTime # type: ignore
|
|
294
340
|
startingReservation = startingReservation.nReservation # type: ignore
|
|
295
|
-
assert
|
|
296
|
-
|
|
341
|
+
assert (
|
|
342
|
+
jobShape.wallTime - timeSlice
|
|
343
|
+
<= startingReservation.shape.wallTime
|
|
344
|
+
)
|
|
345
|
+
adjustEndingReservationForJob(
|
|
346
|
+
endingReservation, jobShape, timeSlice
|
|
347
|
+
)
|
|
297
348
|
# Packed the job.
|
|
298
349
|
return True
|
|
299
350
|
|
|
300
351
|
# If the job would fit, but is longer than the total node allocation
|
|
301
352
|
# extend the node allocation
|
|
302
|
-
elif
|
|
353
|
+
elif (
|
|
354
|
+
endingReservation.nReservation == None
|
|
355
|
+
and startingReservation == self
|
|
356
|
+
):
|
|
303
357
|
# Extend the node reservation to accommodate jobShape
|
|
304
358
|
endingReservation.nReservation = NodeReservation(nodeShape)
|
|
305
359
|
# can't run the job with the current resources
|
|
306
360
|
else:
|
|
307
|
-
if
|
|
361
|
+
if (
|
|
362
|
+
startingReservationTime
|
|
363
|
+
+ availableTime
|
|
364
|
+
+ endingReservation.shape.wallTime
|
|
365
|
+
<= targetTime
|
|
366
|
+
):
|
|
308
367
|
startingReservation = endingReservation.nReservation
|
|
309
|
-
startingReservationTime +=
|
|
368
|
+
startingReservationTime += (
|
|
369
|
+
availableTime + endingReservation.shape.wallTime
|
|
370
|
+
)
|
|
310
371
|
availableTime = 0
|
|
311
372
|
else:
|
|
312
373
|
break
|
|
@@ -332,7 +393,9 @@ def adjustEndingReservationForJob(
|
|
|
332
393
|
"""
|
|
333
394
|
if jobShape.wallTime - wallTime < reservation.shape.wallTime:
|
|
334
395
|
# This job only partially fills one of the slices. Create a new slice.
|
|
335
|
-
reservation.shape, nS = split(
|
|
396
|
+
reservation.shape, nS = split(
|
|
397
|
+
reservation.shape, jobShape, jobShape.wallTime - wallTime
|
|
398
|
+
)
|
|
336
399
|
nS.nReservation = reservation.nReservation
|
|
337
400
|
reservation.nReservation = nS
|
|
338
401
|
else:
|
|
@@ -342,30 +405,40 @@ def adjustEndingReservationForJob(
|
|
|
342
405
|
|
|
343
406
|
def split(
|
|
344
407
|
nodeShape: Shape, jobShape: Shape, wallTime: float
|
|
345
|
-
) ->
|
|
408
|
+
) -> tuple[Shape, NodeReservation]:
|
|
346
409
|
"""
|
|
347
410
|
Partition a node allocation into two to fit the job.
|
|
348
411
|
|
|
349
412
|
Returning the modified shape of the node and a new node reservation for
|
|
350
413
|
the extra time that the job didn't fill.
|
|
351
414
|
"""
|
|
352
|
-
return (
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
415
|
+
return (
|
|
416
|
+
Shape(
|
|
417
|
+
wallTime,
|
|
418
|
+
nodeShape.memory - jobShape.memory,
|
|
419
|
+
nodeShape.cores - jobShape.cores,
|
|
420
|
+
nodeShape.disk - jobShape.disk,
|
|
421
|
+
nodeShape.preemptible,
|
|
422
|
+
),
|
|
423
|
+
NodeReservation(
|
|
424
|
+
Shape(
|
|
425
|
+
nodeShape.wallTime - wallTime,
|
|
426
|
+
nodeShape.memory,
|
|
427
|
+
nodeShape.cores,
|
|
428
|
+
nodeShape.disk,
|
|
429
|
+
nodeShape.preemptible,
|
|
430
|
+
)
|
|
431
|
+
),
|
|
432
|
+
)
|
|
433
|
+
|
|
434
|
+
|
|
435
|
+
def binPacking(
|
|
436
|
+
nodeShapes: list[Shape], jobShapes: list[Shape], goalTime: float
|
|
437
|
+
) -> tuple[dict[Shape, int], dict[Shape, list[FailedConstraint]]]:
|
|
365
438
|
"""
|
|
366
439
|
Using the given node shape bins, pack the given job shapes into nodes to
|
|
367
440
|
get them done in the given amount of time.
|
|
368
|
-
|
|
441
|
+
|
|
369
442
|
Returns a dict saying how many of each node will be needed, a dict from job
|
|
370
443
|
shapes that could not fit to reasons why.
|
|
371
444
|
"""
|
|
@@ -388,34 +461,37 @@ class ClusterScaler:
|
|
|
388
461
|
self.provisioner = provisioner
|
|
389
462
|
self.leader = leader
|
|
390
463
|
self.config = config
|
|
391
|
-
self.static:
|
|
392
|
-
|
|
464
|
+
self.static: dict[bool, dict[str, "Node"]] = {}
|
|
465
|
+
|
|
393
466
|
# If we encounter a Shape of job that we don't think we can run, call
|
|
394
467
|
# these callbacks with the Shape that didn't fit and the Shapes that
|
|
395
468
|
# were available.
|
|
396
|
-
self.on_too_big:
|
|
469
|
+
self.on_too_big: list[Callable[[Shape, list[Shape]], Any]] = []
|
|
397
470
|
|
|
398
471
|
# Dictionary of job names to their average runtime, used to estimate wall time of queued
|
|
399
472
|
# jobs for bin-packing
|
|
400
|
-
self.jobNameToAvgRuntime:
|
|
401
|
-
self.jobNameToNumCompleted:
|
|
473
|
+
self.jobNameToAvgRuntime: dict[str, float] = {}
|
|
474
|
+
self.jobNameToNumCompleted: dict[str, int] = {}
|
|
402
475
|
self.totalAvgRuntime = 0.0
|
|
403
476
|
self.totalJobsCompleted = 0
|
|
404
477
|
|
|
405
478
|
self.targetTime: float = config.targetTime
|
|
406
479
|
if self.targetTime <= 0:
|
|
407
|
-
raise RuntimeError(
|
|
480
|
+
raise RuntimeError(
|
|
481
|
+
"targetTime (%s) must be a positive integer!" % self.targetTime
|
|
482
|
+
)
|
|
408
483
|
self.betaInertia = config.betaInertia
|
|
409
484
|
if not 0.0 <= self.betaInertia <= 0.9:
|
|
410
|
-
raise RuntimeError(
|
|
411
|
-
|
|
485
|
+
raise RuntimeError(
|
|
486
|
+
"betaInertia (%f) must be between 0.0 and 0.9!" % self.betaInertia
|
|
487
|
+
)
|
|
412
488
|
|
|
413
489
|
# Pull scaling information from the provisioner.
|
|
414
490
|
self.nodeShapeToType = provisioner.getAutoscaledInstanceShapes()
|
|
415
491
|
self.instance_types = list(self.nodeShapeToType.values())
|
|
416
492
|
self.nodeShapes = list(self.nodeShapeToType.keys())
|
|
417
493
|
|
|
418
|
-
self.ignoredNodes:
|
|
494
|
+
self.ignoredNodes: set[str] = set()
|
|
419
495
|
|
|
420
496
|
# A *deficit* exists when we have more jobs that can run on preemptible
|
|
421
497
|
# nodes than we have preemptible nodes. In order to not block these jobs,
|
|
@@ -426,13 +502,17 @@ class ClusterScaler:
|
|
|
426
502
|
# of provisioned preemptible nodes and the number of nodes that were requested.
|
|
427
503
|
# Then, when provisioning non-preemptible nodes of the same type, we attempt to
|
|
428
504
|
# make up the deficit.
|
|
429
|
-
self.preemptibleNodeDeficit = {
|
|
505
|
+
self.preemptibleNodeDeficit = {
|
|
506
|
+
instance_type: 0 for instance_type in self.instance_types
|
|
507
|
+
}
|
|
430
508
|
|
|
431
509
|
# Keeps track of the last raw (i.e. float, not limited by
|
|
432
510
|
# max/min nodes) estimates of the number of nodes needed for
|
|
433
511
|
# each node shape. NB: we start with an estimate of 0, so
|
|
434
512
|
# scaling up is smoothed as well.
|
|
435
|
-
self.previousWeightedEstimate = {
|
|
513
|
+
self.previousWeightedEstimate = {
|
|
514
|
+
nodeShape: 0.0 for nodeShape in self.nodeShapes
|
|
515
|
+
}
|
|
436
516
|
|
|
437
517
|
assert len(self.nodeShapes) > 0
|
|
438
518
|
|
|
@@ -454,26 +534,38 @@ class ClusterScaler:
|
|
|
454
534
|
self.nodeShapes.sort()
|
|
455
535
|
|
|
456
536
|
# Nodes might not actually provide all the resources of their nominal shapes
|
|
457
|
-
self.node_shapes_after_overhead =
|
|
458
|
-
|
|
537
|
+
self.node_shapes_after_overhead = (
|
|
538
|
+
self.nodeShapes
|
|
539
|
+
if config.assume_zero_overhead
|
|
540
|
+
else [self._reserve_overhead(s) for s in self.nodeShapes]
|
|
541
|
+
)
|
|
542
|
+
self.without_overhead = {
|
|
543
|
+
k: v for k, v in zip(self.node_shapes_after_overhead, self.nodeShapes)
|
|
544
|
+
}
|
|
459
545
|
|
|
460
|
-
#Node shape to number of currently provisioned nodes
|
|
461
|
-
totalNodes:
|
|
462
|
-
if
|
|
546
|
+
# Node shape to number of currently provisioned nodes
|
|
547
|
+
totalNodes: dict[Shape, int] = defaultdict(int)
|
|
548
|
+
if (
|
|
549
|
+
isinstance(leader.batchSystem, AbstractScalableBatchSystem)
|
|
550
|
+
and leader.provisioner
|
|
551
|
+
):
|
|
463
552
|
for preemptible in (True, False):
|
|
464
|
-
nodes:
|
|
553
|
+
nodes: list["Node"] = []
|
|
465
554
|
for nodeShape, instance_type in self.nodeShapeToType.items():
|
|
466
|
-
nodes_thisType = leader.provisioner.getProvisionedWorkers(
|
|
467
|
-
|
|
555
|
+
nodes_thisType = leader.provisioner.getProvisionedWorkers(
|
|
556
|
+
instance_type=instance_type, preemptible=preemptible
|
|
557
|
+
)
|
|
468
558
|
totalNodes[nodeShape] += len(nodes_thisType)
|
|
469
559
|
nodes.extend(nodes_thisType)
|
|
470
560
|
|
|
471
561
|
self.setStaticNodes(nodes, preemptible)
|
|
472
562
|
|
|
473
|
-
logger.debug(
|
|
563
|
+
logger.debug(
|
|
564
|
+
"Starting with the following nodes in the cluster: %s" % totalNodes
|
|
565
|
+
)
|
|
474
566
|
|
|
475
567
|
if not sum(config.maxNodes) > 0:
|
|
476
|
-
raise RuntimeError(
|
|
568
|
+
raise RuntimeError("Not configured to create nodes of any type.")
|
|
477
569
|
|
|
478
570
|
def _round(self, number: float) -> int:
|
|
479
571
|
"""
|
|
@@ -529,7 +621,7 @@ class ClusterScaler:
|
|
|
529
621
|
# TODO: Figure out if the disk is an OS disk of a scratch disk
|
|
530
622
|
smaller.disk -= self._disk_overhead(smaller.disk)
|
|
531
623
|
|
|
532
|
-
logger.debug(
|
|
624
|
+
logger.debug("Node shape %s can hold jobs of shape %s", full_node, smaller)
|
|
533
625
|
|
|
534
626
|
return smaller
|
|
535
627
|
|
|
@@ -558,12 +650,21 @@ class ClusterScaler:
|
|
|
558
650
|
# since the previous breakpoint, like a progressive income tax.
|
|
559
651
|
limit = min(breakpoint, memory_bytes)
|
|
560
652
|
reservation = fraction * (limit - accounted)
|
|
561
|
-
logger.debug(
|
|
653
|
+
logger.debug(
|
|
654
|
+
"Reserve %s of memory between %s and %s",
|
|
655
|
+
bytes2human(reservation),
|
|
656
|
+
bytes2human(accounted),
|
|
657
|
+
bytes2human(limit),
|
|
658
|
+
)
|
|
562
659
|
reserved += reservation
|
|
563
660
|
accounted = limit
|
|
564
661
|
if accounted >= memory_bytes:
|
|
565
662
|
break
|
|
566
|
-
logger.debug(
|
|
663
|
+
logger.debug(
|
|
664
|
+
"Reserved %s/%s memory for overhead",
|
|
665
|
+
bytes2human(reserved),
|
|
666
|
+
bytes2human(memory_bytes),
|
|
667
|
+
)
|
|
567
668
|
|
|
568
669
|
return int(reserved) + EVICTION_THRESHOLD
|
|
569
670
|
|
|
@@ -579,15 +680,20 @@ class ClusterScaler:
|
|
|
579
680
|
|
|
580
681
|
if disk_bytes <= disk_needed:
|
|
581
682
|
# We don't think we can actually use any of this disk
|
|
582
|
-
logger.warning(
|
|
683
|
+
logger.warning(
|
|
684
|
+
"All %sB of disk on a node type are likely to be needed by the OS! The node probably cannot do any useful work!",
|
|
685
|
+
bytes2human(disk_bytes),
|
|
686
|
+
)
|
|
583
687
|
return disk_bytes
|
|
584
688
|
|
|
585
689
|
if disk_needed * 2 > disk_bytes:
|
|
586
|
-
logger.warning(
|
|
690
|
+
logger.warning(
|
|
691
|
+
"A node type has only %sB disk, of which more than half are expected to be used by the OS. Consider using a larger --nodeStorage",
|
|
692
|
+
bytes2human(disk_bytes),
|
|
693
|
+
)
|
|
587
694
|
|
|
588
695
|
return disk_needed
|
|
589
696
|
|
|
590
|
-
|
|
591
697
|
def getAverageRuntime(self, jobName: str, service: bool = False) -> float:
|
|
592
698
|
if service:
|
|
593
699
|
# We short-circuit service jobs and assume that they will
|
|
@@ -599,15 +705,15 @@ class ClusterScaler:
|
|
|
599
705
|
# be running at once for any actual work to get done.
|
|
600
706
|
return self.targetTime * 24 + 3600
|
|
601
707
|
if jobName in self.jobNameToAvgRuntime:
|
|
602
|
-
#Have seen jobs of this type before, so estimate
|
|
603
|
-
#the runtime based on average of previous jobs of this type
|
|
708
|
+
# Have seen jobs of this type before, so estimate
|
|
709
|
+
# the runtime based on average of previous jobs of this type
|
|
604
710
|
return self.jobNameToAvgRuntime[jobName]
|
|
605
711
|
elif self.totalAvgRuntime > 0:
|
|
606
|
-
#Haven't seen this job yet, so estimate its runtime as
|
|
607
|
-
#the average runtime of all completed jobs
|
|
712
|
+
# Haven't seen this job yet, so estimate its runtime as
|
|
713
|
+
# the average runtime of all completed jobs
|
|
608
714
|
return self.totalAvgRuntime
|
|
609
715
|
else:
|
|
610
|
-
#Have no information whatsoever
|
|
716
|
+
# Have no information whatsoever
|
|
611
717
|
return 1.0
|
|
612
718
|
|
|
613
719
|
def addCompletedJob(self, job: JobDescription, wallTime: int) -> None:
|
|
@@ -618,21 +724,25 @@ class ClusterScaler:
|
|
|
618
724
|
:param int wallTime: The wall-time taken to complete the job in seconds.
|
|
619
725
|
"""
|
|
620
726
|
|
|
621
|
-
#Adjust average runtimes to include this job.
|
|
727
|
+
# Adjust average runtimes to include this job.
|
|
622
728
|
if job.jobName in self.jobNameToAvgRuntime:
|
|
623
729
|
prevAvg = self.jobNameToAvgRuntime[job.jobName]
|
|
624
730
|
prevNum = self.jobNameToNumCompleted[job.jobName]
|
|
625
|
-
self.jobNameToAvgRuntime[job.jobName] = float(
|
|
731
|
+
self.jobNameToAvgRuntime[job.jobName] = float(
|
|
732
|
+
prevAvg * prevNum + wallTime
|
|
733
|
+
) / (prevNum + 1)
|
|
626
734
|
self.jobNameToNumCompleted[job.jobName] += 1
|
|
627
735
|
else:
|
|
628
736
|
self.jobNameToAvgRuntime[job.jobName] = wallTime
|
|
629
737
|
self.jobNameToNumCompleted[job.jobName] = 1
|
|
630
738
|
|
|
631
739
|
self.totalJobsCompleted += 1
|
|
632
|
-
self.totalAvgRuntime =
|
|
633
|
-
|
|
740
|
+
self.totalAvgRuntime = (
|
|
741
|
+
float(self.totalAvgRuntime * (self.totalJobsCompleted - 1) + wallTime)
|
|
742
|
+
/ self.totalJobsCompleted
|
|
743
|
+
)
|
|
634
744
|
|
|
635
|
-
def setStaticNodes(self, nodes:
|
|
745
|
+
def setStaticNodes(self, nodes: list["Node"], preemptible: bool) -> None:
|
|
636
746
|
"""
|
|
637
747
|
Used to track statically provisioned nodes. This method must be called
|
|
638
748
|
before any auto-scaled nodes are provisioned.
|
|
@@ -642,12 +752,12 @@ class ClusterScaler:
|
|
|
642
752
|
|
|
643
753
|
:param nodes: list of Node objects
|
|
644
754
|
"""
|
|
645
|
-
prefix =
|
|
755
|
+
prefix = "non-" if not preemptible else ""
|
|
646
756
|
logger.debug("Adding %s to %spreemptible static nodes", nodes, prefix)
|
|
647
757
|
if nodes is not None:
|
|
648
|
-
self.static[preemptible] = {node.privateIP
|
|
758
|
+
self.static[preemptible] = {node.privateIP: node for node in nodes}
|
|
649
759
|
|
|
650
|
-
def getStaticNodes(self, preemptible: bool) ->
|
|
760
|
+
def getStaticNodes(self, preemptible: bool) -> dict[str, "Node"]:
|
|
651
761
|
"""
|
|
652
762
|
Returns nodes set in setStaticNodes().
|
|
653
763
|
|
|
@@ -662,14 +772,17 @@ class ClusterScaler:
|
|
|
662
772
|
|
|
663
773
|
Returns an integer.
|
|
664
774
|
"""
|
|
665
|
-
weightedEstimate = (
|
|
666
|
-
|
|
775
|
+
weightedEstimate = (
|
|
776
|
+
1 - self.betaInertia
|
|
777
|
+
) * estimatedNodeCount + self.betaInertia * self.previousWeightedEstimate[
|
|
778
|
+
nodeShape
|
|
779
|
+
]
|
|
667
780
|
self.previousWeightedEstimate[nodeShape] = weightedEstimate
|
|
668
781
|
return self._round(weightedEstimate)
|
|
669
782
|
|
|
670
783
|
def getEstimatedNodeCounts(
|
|
671
|
-
self, queuedJobShapes:
|
|
672
|
-
) ->
|
|
784
|
+
self, queuedJobShapes: list[Shape], currentNodeCounts: dict[Shape, int]
|
|
785
|
+
) -> tuple[dict[Shape, int], dict[Shape, list[FailedConstraint]]]:
|
|
673
786
|
"""
|
|
674
787
|
Given the resource requirements of queued jobs and the current size of the cluster.
|
|
675
788
|
|
|
@@ -682,21 +795,30 @@ class ClusterScaler:
|
|
|
682
795
|
nodesToRunQueuedJobs, could_not_fit = binPacking(
|
|
683
796
|
jobShapes=queuedJobShapes,
|
|
684
797
|
nodeShapes=self.node_shapes_after_overhead,
|
|
685
|
-
goalTime=self.targetTime
|
|
798
|
+
goalTime=self.targetTime,
|
|
686
799
|
)
|
|
687
|
-
|
|
800
|
+
|
|
688
801
|
# Then translate back to get results in terms of full nodes without overhead.
|
|
689
|
-
nodesToRunQueuedJobs = {
|
|
802
|
+
nodesToRunQueuedJobs = {
|
|
803
|
+
self.without_overhead[k]: v for k, v in nodesToRunQueuedJobs.items()
|
|
804
|
+
}
|
|
690
805
|
|
|
691
806
|
estimatedNodeCounts = {}
|
|
692
807
|
for nodeShape in self.nodeShapes:
|
|
693
808
|
instance_type = self.nodeShapeToType[nodeShape]
|
|
694
809
|
|
|
695
|
-
logger.debug(
|
|
810
|
+
logger.debug(
|
|
811
|
+
f"Nodes of type {instance_type} to run queued jobs: {nodesToRunQueuedJobs[nodeShape]}"
|
|
812
|
+
)
|
|
696
813
|
# Actual calculation of the estimated number of nodes required
|
|
697
|
-
estimatedNodeCount =
|
|
814
|
+
estimatedNodeCount = (
|
|
815
|
+
0
|
|
816
|
+
if nodesToRunQueuedJobs[nodeShape] == 0
|
|
698
817
|
else max(1, self._round(nodesToRunQueuedJobs[nodeShape]))
|
|
699
|
-
|
|
818
|
+
)
|
|
819
|
+
logger.debug(
|
|
820
|
+
"Estimating %i nodes of shape %s" % (estimatedNodeCount, nodeShape)
|
|
821
|
+
)
|
|
700
822
|
|
|
701
823
|
# Use inertia parameter to smooth out fluctuations according to an exponentially
|
|
702
824
|
# weighted moving average.
|
|
@@ -710,37 +832,56 @@ class ClusterScaler:
|
|
|
710
832
|
# The number of nodes we provision as compensation for missing preemptible
|
|
711
833
|
# nodes is the product of the deficit (the number of preemptible nodes we did
|
|
712
834
|
# _not_ allocate) and configuration preference.
|
|
713
|
-
compensationNodes = self._round(
|
|
835
|
+
compensationNodes = self._round(
|
|
836
|
+
self.preemptibleNodeDeficit[instance_type] * compensation
|
|
837
|
+
)
|
|
714
838
|
if compensationNodes > 0:
|
|
715
|
-
logger.debug(
|
|
716
|
-
|
|
717
|
-
|
|
718
|
-
|
|
839
|
+
logger.debug(
|
|
840
|
+
"Adding %d non-preemptible nodes of type %s to compensate for a "
|
|
841
|
+
"deficit of %d preemptible ones.",
|
|
842
|
+
compensationNodes,
|
|
843
|
+
instance_type,
|
|
844
|
+
self.preemptibleNodeDeficit[instance_type],
|
|
845
|
+
)
|
|
719
846
|
estimatedNodeCount += compensationNodes
|
|
720
847
|
|
|
721
848
|
# Tell everyone how big the cluster is
|
|
722
|
-
logger.debug(
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
|
|
849
|
+
logger.debug(
|
|
850
|
+
"Currently %i nodes of type %s in cluster"
|
|
851
|
+
% (currentNodeCounts[nodeShape], instance_type)
|
|
852
|
+
)
|
|
853
|
+
self.leader.toilState.bus.publish(
|
|
854
|
+
ClusterSizeMessage(instance_type, currentNodeCounts[nodeShape])
|
|
855
|
+
)
|
|
856
|
+
self.leader.toilState.bus.publish(
|
|
857
|
+
ClusterDesiredSizeMessage(instance_type, estimatedNodeCount)
|
|
858
|
+
)
|
|
726
859
|
|
|
727
860
|
# Bound number using the max and min node parameters
|
|
728
861
|
if estimatedNodeCount > self.maxNodes[nodeShape]:
|
|
729
|
-
logger.debug(
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
|
|
862
|
+
logger.debug(
|
|
863
|
+
"Limiting the estimated number of necessary %s (%s) to the "
|
|
864
|
+
"configured maximum (%s).",
|
|
865
|
+
instance_type,
|
|
866
|
+
estimatedNodeCount,
|
|
867
|
+
self.maxNodes[nodeShape],
|
|
868
|
+
)
|
|
733
869
|
estimatedNodeCount = self.maxNodes[nodeShape]
|
|
734
870
|
elif estimatedNodeCount < self.minNodes[nodeShape]:
|
|
735
|
-
logger.debug(
|
|
736
|
-
|
|
737
|
-
|
|
738
|
-
|
|
871
|
+
logger.debug(
|
|
872
|
+
"Raising the estimated number of necessary %s (%s) to the "
|
|
873
|
+
"configured minimum (%s).",
|
|
874
|
+
instance_type,
|
|
875
|
+
estimatedNodeCount,
|
|
876
|
+
self.minNodes[nodeShape],
|
|
877
|
+
)
|
|
739
878
|
estimatedNodeCount = self.minNodes[nodeShape]
|
|
740
879
|
estimatedNodeCounts[nodeShape] = estimatedNodeCount
|
|
741
880
|
return estimatedNodeCounts, could_not_fit
|
|
742
881
|
|
|
743
|
-
def updateClusterSize(
|
|
882
|
+
def updateClusterSize(
|
|
883
|
+
self, estimatedNodeCounts: dict[Shape, int]
|
|
884
|
+
) -> dict[Shape, int]:
|
|
744
885
|
"""
|
|
745
886
|
Given the desired and current size of the cluster, attempts to launch/remove instances to get to the desired size.
|
|
746
887
|
|
|
@@ -752,21 +893,26 @@ class ClusterScaler:
|
|
|
752
893
|
for nodeShape, estimatedNodeCount in estimatedNodeCounts.items():
|
|
753
894
|
instance_type = self.nodeShapeToType[nodeShape]
|
|
754
895
|
|
|
755
|
-
newNodeCount = self.setNodeCount(
|
|
896
|
+
newNodeCount = self.setNodeCount(
|
|
897
|
+
instance_type, estimatedNodeCount, preemptible=nodeShape.preemptible
|
|
898
|
+
)
|
|
756
899
|
# If we were scaling up a preemptible node type and failed to meet
|
|
757
900
|
# our target, we will attempt to compensate for the deficit while scaling
|
|
758
901
|
# non-preemptible nodes of this type.
|
|
759
902
|
if nodeShape.preemptible:
|
|
760
903
|
if newNodeCount < estimatedNodeCount:
|
|
761
904
|
deficit = estimatedNodeCount - newNodeCount
|
|
762
|
-
logger.debug(
|
|
905
|
+
logger.debug(
|
|
906
|
+
"Preemptible scaler detected deficit of %d nodes of type %s."
|
|
907
|
+
% (deficit, instance_type)
|
|
908
|
+
)
|
|
763
909
|
self.preemptibleNodeDeficit[instance_type] = deficit
|
|
764
910
|
else:
|
|
765
911
|
self.preemptibleNodeDeficit[instance_type] = 0
|
|
766
912
|
newNodeCounts[nodeShape] = newNodeCount
|
|
767
913
|
|
|
768
|
-
#Attempt to terminate any nodes that we previously designated for
|
|
769
|
-
#termination, but which still had workers running.
|
|
914
|
+
# Attempt to terminate any nodes that we previously designated for
|
|
915
|
+
# termination, but which still had workers running.
|
|
770
916
|
self._terminateIgnoredNodes()
|
|
771
917
|
return newNodeCounts
|
|
772
918
|
|
|
@@ -800,18 +946,29 @@ class ClusterScaler:
|
|
|
800
946
|
actual cluster size at the time this method returns.
|
|
801
947
|
"""
|
|
802
948
|
if not isinstance(self.leader.batchSystem, AbstractScalableBatchSystem):
|
|
803
|
-
raise RuntimeError(
|
|
949
|
+
raise RuntimeError(
|
|
950
|
+
"Non-scalable batch system abusing a scalable-only function."
|
|
951
|
+
)
|
|
804
952
|
for attempt in old_retry(predicate=self.provisioner.retryPredicate):
|
|
805
953
|
with attempt:
|
|
806
954
|
nodes = self.getNodes(preemptible)
|
|
807
955
|
logger.debug("Cluster contains %i instances" % len(nodes))
|
|
808
956
|
|
|
809
|
-
nodes = {
|
|
810
|
-
|
|
957
|
+
nodes = {
|
|
958
|
+
node: nodes[node]
|
|
959
|
+
for node in nodes
|
|
960
|
+
if node.nodeType == instance_type
|
|
961
|
+
}
|
|
962
|
+
ignoredNodes = [
|
|
963
|
+
node for node in nodes if node.privateIP in self.ignoredNodes
|
|
964
|
+
]
|
|
811
965
|
numIgnoredNodes = len(ignoredNodes)
|
|
812
966
|
numCurrentNodes = len(nodes)
|
|
813
|
-
logger.debug(
|
|
814
|
-
|
|
967
|
+
logger.debug(
|
|
968
|
+
"Cluster contains %i instances of type %s (%i ignored and draining jobs until "
|
|
969
|
+
"they can be safely terminated)"
|
|
970
|
+
% (numCurrentNodes, instance_type, numIgnoredNodes)
|
|
971
|
+
)
|
|
815
972
|
if not force:
|
|
816
973
|
delta = numNodes - (numCurrentNodes - numIgnoredNodes)
|
|
817
974
|
else:
|
|
@@ -819,38 +976,59 @@ class ClusterScaler:
|
|
|
819
976
|
if delta > 0 and numIgnoredNodes > 0:
|
|
820
977
|
# We can un-ignore a few nodes to compensate for the additional nodes we want.
|
|
821
978
|
numNodesToUnignore = min(delta, numIgnoredNodes)
|
|
822
|
-
logger.debug(
|
|
979
|
+
logger.debug(
|
|
980
|
+
"Unignoring %i nodes because we want to scale back up again."
|
|
981
|
+
% numNodesToUnignore
|
|
982
|
+
)
|
|
823
983
|
delta -= numNodesToUnignore
|
|
824
984
|
|
|
825
985
|
for node in ignoredNodes[:numNodesToUnignore]:
|
|
826
986
|
self.ignoredNodes.remove(node.privateIP)
|
|
827
987
|
self.leader.batchSystem.unignoreNode(node.privateIP)
|
|
828
988
|
if delta > 0:
|
|
829
|
-
logger.info(
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
|
|
833
|
-
|
|
834
|
-
|
|
989
|
+
logger.info(
|
|
990
|
+
"Adding %i %s nodes to get to desired cluster size of %i.",
|
|
991
|
+
delta,
|
|
992
|
+
"preemptible" if preemptible else "non-preemptible",
|
|
993
|
+
numNodes,
|
|
994
|
+
)
|
|
995
|
+
numNodes = numCurrentNodes + self._addNodes(
|
|
996
|
+
instance_type, numNodes=delta, preemptible=preemptible
|
|
997
|
+
)
|
|
835
998
|
elif delta < 0:
|
|
836
|
-
logger.info(
|
|
837
|
-
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
999
|
+
logger.info(
|
|
1000
|
+
"Removing %i %s nodes to get to desired cluster size of %i.",
|
|
1001
|
+
-delta,
|
|
1002
|
+
"preemptible" if preemptible else "non-preemptible",
|
|
1003
|
+
numNodes,
|
|
1004
|
+
)
|
|
1005
|
+
numNodes = numCurrentNodes - self._removeNodes(
|
|
1006
|
+
nodes,
|
|
1007
|
+
instance_type=instance_type,
|
|
1008
|
+
num_nodes=-delta,
|
|
1009
|
+
preemptible=preemptible,
|
|
1010
|
+
force=force,
|
|
1011
|
+
)
|
|
842
1012
|
elif force:
|
|
843
|
-
logger.debug(
|
|
1013
|
+
logger.debug(
|
|
1014
|
+
"Cluster already at desired size of %i. Nothing to do.",
|
|
1015
|
+
numNodes,
|
|
1016
|
+
)
|
|
844
1017
|
else:
|
|
845
|
-
logger.debug(
|
|
1018
|
+
logger.debug(
|
|
1019
|
+
"Cluster (minus ignored nodes) already at desired size of %i. Nothing to do.",
|
|
1020
|
+
numNodes,
|
|
1021
|
+
)
|
|
846
1022
|
return numNodes
|
|
847
1023
|
|
|
848
1024
|
def _addNodes(self, instance_type: str, numNodes: int, preemptible: bool) -> int:
|
|
849
|
-
return self.provisioner.addNodes(
|
|
1025
|
+
return self.provisioner.addNodes(
|
|
1026
|
+
nodeTypes={instance_type}, numNodes=numNodes, preemptible=preemptible
|
|
1027
|
+
)
|
|
850
1028
|
|
|
851
1029
|
def _removeNodes(
|
|
852
1030
|
self,
|
|
853
|
-
nodes:
|
|
1031
|
+
nodes: dict["Node", NodeInfo],
|
|
854
1032
|
instance_type: str,
|
|
855
1033
|
num_nodes: int,
|
|
856
1034
|
preemptible: bool = False,
|
|
@@ -867,17 +1045,18 @@ class ClusterScaler:
|
|
|
867
1045
|
nodes = self.getNodes(preemptible)
|
|
868
1046
|
# Filter down to nodes of the correct node type
|
|
869
1047
|
|
|
870
|
-
nodes = {
|
|
871
|
-
|
|
1048
|
+
nodes = {
|
|
1049
|
+
node: nodes[node] for node in nodes if node.nodeType == instance_type
|
|
1050
|
+
}
|
|
872
1051
|
|
|
873
1052
|
filtered_nodes = self.filter_out_static_nodes(nodes, preemptible)
|
|
874
1053
|
filtered_nodes = filtered_nodes[:num_nodes]
|
|
875
1054
|
|
|
876
1055
|
# Join nodes and instances on private IP address.
|
|
877
|
-
logger.debug(
|
|
1056
|
+
logger.debug("Nodes considered to terminate: %s", " ".join(map(str, nodes)))
|
|
878
1057
|
|
|
879
1058
|
# Tell the batch system to stop sending jobs to these nodes
|
|
880
|
-
for
|
|
1059
|
+
for node, nodeInfo in filtered_nodes:
|
|
881
1060
|
self.ignoredNodes.add(node.privateIP)
|
|
882
1061
|
self.leader.batchSystem.ignoreNode(node.privateIP)
|
|
883
1062
|
|
|
@@ -886,8 +1065,11 @@ class ClusterScaler:
|
|
|
886
1065
|
# will be terminated in _removeIgnoredNodes later on
|
|
887
1066
|
# once all jobs have finished, but they will be ignored by
|
|
888
1067
|
# the batch system and cluster scaler from now on
|
|
889
|
-
filtered_nodes = [
|
|
890
|
-
|
|
1068
|
+
filtered_nodes = [
|
|
1069
|
+
(node, nodeInfo)
|
|
1070
|
+
for (node, nodeInfo) in filtered_nodes
|
|
1071
|
+
if nodeInfo and nodeInfo.workers < 1
|
|
1072
|
+
]
|
|
891
1073
|
nodes_to_terminate = [node for (node, nodeInfo) in filtered_nodes]
|
|
892
1074
|
for node in nodes_to_terminate:
|
|
893
1075
|
if node.privateIP in self.ignoredNodes:
|
|
@@ -895,10 +1077,12 @@ class ClusterScaler:
|
|
|
895
1077
|
self.leader.batchSystem.unignoreNode(node.privateIP)
|
|
896
1078
|
else:
|
|
897
1079
|
# Without load info all we can do is sort instances by time left in billing cycle.
|
|
898
|
-
nodes_to_terminate = sorted(
|
|
1080
|
+
nodes_to_terminate = sorted(
|
|
1081
|
+
nodes.keys(), key=lambda x: x.remainingBillingInterval()
|
|
1082
|
+
)
|
|
899
1083
|
nodes_to_terminate = nodes_to_terminate[:num_nodes]
|
|
900
1084
|
number_terminated = len(nodes_to_terminate)
|
|
901
|
-
logger.debug(
|
|
1085
|
+
logger.debug("Terminating %i instance(s).", number_terminated)
|
|
902
1086
|
for node in nodes_to_terminate:
|
|
903
1087
|
if node.privateIP in self.ignoredNodes:
|
|
904
1088
|
# TODO: Why are we undoing what was just done above???
|
|
@@ -912,7 +1096,9 @@ class ClusterScaler:
|
|
|
912
1096
|
but which still have workers running.
|
|
913
1097
|
"""
|
|
914
1098
|
if not isinstance(self.leader.batchSystem, AbstractScalableBatchSystem):
|
|
915
|
-
raise RuntimeError(
|
|
1099
|
+
raise RuntimeError(
|
|
1100
|
+
"Non-scalable batch system abusing a scalable-only function."
|
|
1101
|
+
)
|
|
916
1102
|
|
|
917
1103
|
# start with a dictionary of all nodes and filter down
|
|
918
1104
|
nodes = self.getNodes()
|
|
@@ -926,10 +1112,18 @@ class ClusterScaler:
|
|
|
926
1112
|
self.ignoredNodes.remove(ip)
|
|
927
1113
|
self.leader.batchSystem.unignoreNode(ip)
|
|
928
1114
|
|
|
929
|
-
logger.debug(
|
|
930
|
-
|
|
931
|
-
|
|
932
|
-
|
|
1115
|
+
logger.debug(
|
|
1116
|
+
"There are %i nodes being ignored by the batch system, "
|
|
1117
|
+
"checking if they can be terminated" % len(self.ignoredNodes)
|
|
1118
|
+
)
|
|
1119
|
+
nodes = {
|
|
1120
|
+
node: info
|
|
1121
|
+
for node, info in nodes.items()
|
|
1122
|
+
if node.privateIP in self.ignoredNodes
|
|
1123
|
+
}
|
|
1124
|
+
nodes = {
|
|
1125
|
+
node: info for node, info in nodes.items() if info and info.workers < 1
|
|
1126
|
+
}
|
|
933
1127
|
nodes_to_terminate = list(nodes.keys())
|
|
934
1128
|
|
|
935
1129
|
for node in nodes_to_terminate:
|
|
@@ -938,25 +1132,32 @@ class ClusterScaler:
|
|
|
938
1132
|
self.provisioner.terminateNodes(nodes_to_terminate)
|
|
939
1133
|
|
|
940
1134
|
def filter_out_static_nodes(
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
preemptible: bool = False) -> List[Tuple["Node", NodeInfo]]:
|
|
1135
|
+
self, nodes: dict["Node", NodeInfo], preemptible: bool = False
|
|
1136
|
+
) -> list[tuple["Node", NodeInfo]]:
|
|
944
1137
|
filtered_nodes = []
|
|
945
1138
|
for node, nodeInfo in nodes.items():
|
|
946
1139
|
if node:
|
|
947
|
-
non =
|
|
1140
|
+
non = "non-" if not preemptible else ""
|
|
948
1141
|
if node.privateIP in self.getStaticNodes(preemptible):
|
|
949
1142
|
# we don't want to automatically terminate any statically provisioned nodes
|
|
950
|
-
logger.debug(
|
|
1143
|
+
logger.debug(
|
|
1144
|
+
f"Found {node.privateIP} in {non}preemptible static nodes"
|
|
1145
|
+
)
|
|
951
1146
|
else:
|
|
952
|
-
logger.debug(
|
|
1147
|
+
logger.debug(
|
|
1148
|
+
f"Did not find {node.privateIP} in {non}preemptible static nodes"
|
|
1149
|
+
)
|
|
953
1150
|
filtered_nodes.append((node, nodeInfo))
|
|
954
1151
|
# Sort nodes by number of workers and time left in billing cycle
|
|
955
|
-
filtered_nodes.sort(
|
|
956
|
-
|
|
1152
|
+
filtered_nodes.sort(
|
|
1153
|
+
key=lambda node_nodeInfo: (
|
|
1154
|
+
node_nodeInfo[1].workers if node_nodeInfo[1] else 1,
|
|
1155
|
+
node_nodeInfo[0].remainingBillingInterval(),
|
|
1156
|
+
)
|
|
1157
|
+
)
|
|
957
1158
|
return filtered_nodes
|
|
958
1159
|
|
|
959
|
-
def getNodes(self, preemptible: Optional[bool] = None) ->
|
|
1160
|
+
def getNodes(self, preemptible: Optional[bool] = None) -> dict["Node", NodeInfo]:
|
|
960
1161
|
"""
|
|
961
1162
|
Returns a dictionary mapping node identifiers of preemptible or non-preemptible nodes to
|
|
962
1163
|
NodeInfo objects, one for each node.
|
|
@@ -968,25 +1169,31 @@ class ClusterScaler:
|
|
|
968
1169
|
If None, all nodes will be returned.
|
|
969
1170
|
"""
|
|
970
1171
|
if not isinstance(self.leader.batchSystem, AbstractScalableBatchSystem):
|
|
971
|
-
raise RuntimeError(
|
|
1172
|
+
raise RuntimeError(
|
|
1173
|
+
"Non-scalable batch system abusing a scalable-only function."
|
|
1174
|
+
)
|
|
972
1175
|
# nodes seen within the last 600 seconds (10 minutes)
|
|
973
1176
|
recent_nodes = self.leader.batchSystem.getNodes(preemptible, timeout=600)
|
|
974
1177
|
# all available nodes
|
|
975
1178
|
all_nodes = self.leader.batchSystem.getNodes(preemptible)
|
|
976
1179
|
# nodes that are supposedly doing something
|
|
977
|
-
provisioned_nodes = self.provisioner.getProvisionedWorkers(
|
|
1180
|
+
provisioned_nodes = self.provisioner.getProvisionedWorkers(
|
|
1181
|
+
preemptible=preemptible
|
|
1182
|
+
)
|
|
978
1183
|
|
|
979
1184
|
if len(recent_nodes) != len(provisioned_nodes):
|
|
980
1185
|
logger.debug("Consolidating state between mesos and provisioner")
|
|
981
1186
|
|
|
982
|
-
nodeToInfo:
|
|
1187
|
+
nodeToInfo: dict["Node", NodeInfo] = {}
|
|
983
1188
|
# fixme: what happens if awsFilterImpairedNodes is used?
|
|
984
1189
|
# if this assertion is false it means that user-managed nodes are being
|
|
985
1190
|
# used that are outside the provisioner's control
|
|
986
1191
|
# this would violate many basic assumptions in autoscaling so it currently not allowed
|
|
987
1192
|
for node, ip in ((node, node.privateIP) for node in provisioned_nodes):
|
|
988
1193
|
if ip not in recent_nodes:
|
|
989
|
-
logger.debug(
|
|
1194
|
+
logger.debug(
|
|
1195
|
+
"Worker node at %s is not reporting executor information", ip
|
|
1196
|
+
)
|
|
990
1197
|
|
|
991
1198
|
# get up-to-date information about the node, if available
|
|
992
1199
|
info = all_nodes.get(ip)
|
|
@@ -1009,9 +1216,15 @@ class ClusterScaler:
|
|
|
1009
1216
|
#
|
|
1010
1217
|
# In all 3 situations it's safe to fake executor info with 0 workers,
|
|
1011
1218
|
# since in all cases there are no workers running.
|
|
1012
|
-
info = NodeInfo(
|
|
1013
|
-
|
|
1014
|
-
|
|
1219
|
+
info = NodeInfo(
|
|
1220
|
+
coresTotal=1,
|
|
1221
|
+
coresUsed=0,
|
|
1222
|
+
requestedCores=0,
|
|
1223
|
+
memoryTotal=1,
|
|
1224
|
+
memoryUsed=0,
|
|
1225
|
+
requestedMemory=0,
|
|
1226
|
+
workers=0,
|
|
1227
|
+
)
|
|
1015
1228
|
else:
|
|
1016
1229
|
# mesos knows about the ip & we have up-to-date information - easy!
|
|
1017
1230
|
info = recent_nodes[ip]
|
|
@@ -1020,40 +1233,55 @@ class ClusterScaler:
|
|
|
1020
1233
|
return nodeToInfo
|
|
1021
1234
|
|
|
1022
1235
|
def shutDown(self) -> None:
|
|
1023
|
-
logger.debug(
|
|
1236
|
+
logger.debug("Forcing provisioner to reduce cluster size to zero.")
|
|
1024
1237
|
for nodeShape in self.nodeShapes:
|
|
1025
1238
|
preemptible = nodeShape.preemptible
|
|
1026
1239
|
instance_type = self.nodeShapeToType[nodeShape]
|
|
1027
|
-
self.setNodeCount(
|
|
1240
|
+
self.setNodeCount(
|
|
1241
|
+
instance_type=instance_type,
|
|
1242
|
+
numNodes=0,
|
|
1243
|
+
preemptible=preemptible,
|
|
1244
|
+
force=True,
|
|
1245
|
+
)
|
|
1246
|
+
|
|
1028
1247
|
|
|
1029
1248
|
class JobTooBigError(Exception):
|
|
1030
1249
|
"""
|
|
1031
1250
|
Raised in the scaler thread when a job cannot fit in any available node
|
|
1032
1251
|
type and is likely to lock up the workflow.
|
|
1033
1252
|
"""
|
|
1034
|
-
|
|
1035
|
-
def __init__(
|
|
1253
|
+
|
|
1254
|
+
def __init__(
|
|
1255
|
+
self,
|
|
1256
|
+
job: Optional[JobDescription] = None,
|
|
1257
|
+
shape: Optional[Shape] = None,
|
|
1258
|
+
constraints: Optional[list[FailedConstraint]] = None,
|
|
1259
|
+
):
|
|
1036
1260
|
"""
|
|
1037
1261
|
Make a JobTooBigError.
|
|
1038
|
-
|
|
1262
|
+
|
|
1039
1263
|
Can have a job, the job's shape, and the limiting resources and amounts. All are optional.
|
|
1040
1264
|
"""
|
|
1041
1265
|
self.job = job
|
|
1042
1266
|
self.shape = shape
|
|
1043
1267
|
self.constraints = constraints if constraints is not None else []
|
|
1044
|
-
|
|
1268
|
+
|
|
1045
1269
|
parts = [
|
|
1046
1270
|
f"The job {self.job}" if self.job else "A job",
|
|
1047
1271
|
f" with shape {self.shape}" if self.shape else "",
|
|
1048
|
-
" is too big for any available node type."
|
|
1272
|
+
" is too big for any available node type.",
|
|
1049
1273
|
]
|
|
1050
|
-
|
|
1274
|
+
|
|
1051
1275
|
if self.constraints:
|
|
1052
1276
|
parts.append(" It could have fit if it only needed ")
|
|
1053
|
-
parts.append(
|
|
1054
|
-
|
|
1055
|
-
|
|
1056
|
-
|
|
1277
|
+
parts.append(
|
|
1278
|
+
", ".join(
|
|
1279
|
+
[f"{limit} {resource}" for resource, limit in self.constraints]
|
|
1280
|
+
)
|
|
1281
|
+
)
|
|
1282
|
+
parts.append(".")
|
|
1283
|
+
|
|
1284
|
+
self.msg = "".join(parts)
|
|
1057
1285
|
super().__init__()
|
|
1058
1286
|
|
|
1059
1287
|
def __str__(self) -> str:
|
|
@@ -1062,6 +1290,7 @@ class JobTooBigError(Exception):
|
|
|
1062
1290
|
"""
|
|
1063
1291
|
return self.msg
|
|
1064
1292
|
|
|
1293
|
+
|
|
1065
1294
|
class ScalerThread(ExceptionalThread):
|
|
1066
1295
|
"""
|
|
1067
1296
|
A thread that automatically scales the number of either preemptible or non-preemptible worker
|
|
@@ -1077,10 +1306,17 @@ class ScalerThread(ExceptionalThread):
     is made, else the size of the cluster is adapted. The beta factor is an inertia parameter
     that prevents continual fluctuations in the number of nodes.
     """
-
-
+
+    def __init__(
+        self,
+        provisioner: AbstractProvisioner,
+        leader: "Leader",
+        config: Config,
+        stop_on_exception: bool = False,
+    ) -> None:
+        super().__init__(name="scaler")
         self.scaler = ClusterScaler(provisioner, leader, config)
-
+
         # Indicates that the scaling thread should shutdown
         self.stop = False
         # Indicates that we should stop the thread if we encounter an error.
@@ -1090,13 +1326,13 @@ class ScalerThread(ExceptionalThread):
         self.stats = None
         if config.clusterStats:
             logger.debug("Starting up cluster statistics...")
-            self.stats = ClusterStats(
-
-
+            self.stats = ClusterStats(
+                leader.config.clusterStats, leader.batchSystem, provisioner.clusterName
+            )
             for preemptible in [True, False]:
                 self.stats.startStats(preemptible=preemptible)
             logger.debug("...Cluster stats started.")
-
+
     def check(self) -> None:
         """
         Attempt to join any existing scaler threads that may have died or finished.
@@ -1121,20 +1357,27 @@ class ScalerThread(ExceptionalThread):
 
     def tryRun(self) -> None:
         if self.scaler.leader.provisioner is None:
-            raise RuntimeError(
-
+            raise RuntimeError(
+                "No provisioner found for a scaling cluster "
+                '(cannot access "getProvisionedWorkers").'
+            )
         while not self.stop:
             with throttle(self.scaler.config.scaleInterval):
                 try:
                     queuedJobs = self.scaler.leader.getJobs()
                     queuedJobShapes = [
-                        Shape(
-
-
+                        Shape(
+                            wallTime=self.scaler.getAverageRuntime(
+                                jobName=job.jobName,
+                                service=isinstance(job, ServiceJobDescription),
+                            ),
                             memory=job.memory,
                             cores=job.cores,
                             disk=job.disk,
-                            preemptible=job.preemptible
+                            preemptible=job.preemptible,
+                        )
+                        for job in queuedJobs
+                    ]
                     currentNodeCounts = {}
                     for nodeShape in self.scaler.nodeShapes:
                         instance_type = self.scaler.nodeShapeToType[nodeShape]
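The comprehension above turns each queued job description into one bin-packing `Shape` (wall time from the scaler's runtime history, plus the job's memory, cores, disk, and preemptibility). A rough, self-contained sketch of that mapping, using stand-in dataclasses rather than toil's real `Shape` and `JobDescription` classes and a hypothetical runtime estimate:

```python
# Rough sketch of the per-job Shape construction in tryRun above.
# SimpleShape/SimpleJob are stand-ins for toil's Shape and JobDescription;
# average_runtime() is a hypothetical placeholder for getAverageRuntime().
from dataclasses import dataclass


@dataclass(frozen=True)
class SimpleShape:
    wallTime: float
    memory: int
    cores: float
    disk: int
    preemptible: bool


@dataclass
class SimpleJob:
    jobName: str
    memory: int
    cores: float
    disk: int
    preemptible: bool


def average_runtime(jobName: str) -> float:
    return 3600.0  # hypothetical historical average, in seconds


queued = [SimpleJob("map_reads", 2 * 1024**3, 1.0, 10 * 1024**3, True)]
queued_shapes = [
    SimpleShape(
        wallTime=average_runtime(job.jobName),
        memory=job.memory,
        cores=job.cores,
        disk=job.disk,
        preemptible=job.preemptible,
    )
    for job in queued
]
print(queued_shapes)
```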
@@ -1144,14 +1387,16 @@ class ScalerThread(ExceptionalThread):
                                 preemptible=nodeShape.preemptible,
                             )
                         )
-                    estimatedNodeCounts, could_not_fit =
-
+                    estimatedNodeCounts, could_not_fit = (
+                        self.scaler.getEstimatedNodeCounts(
+                            queuedJobShapes, currentNodeCounts
+                        )
                     )
                     self.scaler.updateClusterSize(estimatedNodeCounts)
                     if self.stats:
                         self.stats.checkStats()
-
-                    if len(could_not_fit) != 0:
+
+                    if len(could_not_fit) != 0:
                         # If we have any jobs left over that we couldn't fit, complain.
                         bad_job: Optional[JobDescription] = None
                         bad_shape: Optional[Shape] = None
@@ -1164,39 +1409,49 @@ class ScalerThread(ExceptionalThread):
                         if bad_shape is None:
                             # If we can't find an offending job, grab an arbitrary offending shape.
                             bad_shape = next(iter(could_not_fit))
-
-                        raise JobTooBigError(
-
+
+                        raise JobTooBigError(
+                            job=bad_job,
+                            shape=bad_shape,
+                            constraints=could_not_fit[bad_shape],
+                        )
+
                 except:
                     if self.stop_on_exception:
                         logger.critical("Stopping ScalerThread due to an error.")
                         raise
                     else:
-                        logger.exception(
-
+                        logger.exception(
+                            "Exception encountered in scaler thread. Making a best-effort "
+                            "attempt to keep going, but things may go wrong from now on."
+                        )
         self.scaler.shutDown()
 
+
 class ClusterStats:
     def __init__(
         self, path: str, batchSystem: AbstractBatchSystem, clusterName: Optional[str]
     ) -> None:
         logger.debug("Initializing cluster statistics")
-        self.stats:
-        self.statsThreads:
+        self.stats: dict[str, dict[str, list[dict[str, Any]]]] = {}
+        self.statsThreads: list[ExceptionalThread] = []
         self.statsPath = path
         self.stop = False
         self.clusterName = clusterName
         self.batchSystem = batchSystem
-        self.scaleable =
-
+        self.scaleable = (
+            isinstance(self.batchSystem, AbstractScalableBatchSystem)
+            if batchSystem
+            else False
+        )
 
     def shutDownStats(self) -> None:
        if self.stop:
            return
 
         def getFileName() -> str:
-            extension =
-            file =
+            extension = ".json"
+            file = "%s-stats" % self.clusterName
             counter = 0
             while True:
                 suffix = str(counter).zfill(3) + extension
@@ -1204,12 +1459,13 @@ class ClusterStats:
                 if not os.path.exists(fullName):
                     return fullName
                 counter += 1
+
         if self.statsPath and self.scaleable:
             self.stop = True
             for thread in self.statsThreads:
                 thread.join()
             fileName = getFileName()
-            with open(fileName,
+            with open(fileName, "w") as f:
                 json.dump(self.stats, f)
 
     def startStats(self, preemptible: bool) -> None:
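The completed `getFileName` above picks the first unused, zero-padded `<clusterName>-statsNNN.json` name before the gathered stats are dumped. A standalone sketch of that naming scheme (the directory and cluster name are hypothetical, and joining the stats path with the prefix and suffix is inferred from the surrounding context):

```python
# Standalone sketch of the stats-file naming used by shutDownStats/getFileName above.
# The stats directory and cluster name here are hypothetical example values.
import os


def next_stats_file(stats_path: str, cluster_name: str) -> str:
    extension = ".json"
    prefix = "%s-stats" % cluster_name
    counter = 0
    while True:
        # Try my-cluster-stats000.json, my-cluster-stats001.json, ... until one is free.
        suffix = str(counter).zfill(3) + extension
        full_name = os.path.join(stats_path, prefix + suffix)
        if not os.path.exists(full_name):
            return full_name
        counter += 1


print(next_stats_file("/tmp", "my-cluster"))  # e.g. /tmp/my-cluster-stats000.json
```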
@@ -1223,22 +1479,26 @@ class ClusterStats:
             thread.join(timeout=0)
 
     def _gatherStats(self, preemptible: bool) -> None:
-        def toDict(nodeInfo: NodeInfo) ->
+        def toDict(nodeInfo: NodeInfo) -> dict[str, Any]:
             # convert NodeInfo object to dict to improve JSON output
-            return dict(
-
-
-
-
-
-
-
+            return dict(
+                memory=nodeInfo.memoryUsed,
+                cores=nodeInfo.coresUsed,
+                memoryTotal=nodeInfo.memoryTotal,
+                coresTotal=nodeInfo.coresTotal,
+                requestedCores=nodeInfo.requestedCores,
+                requestedMemory=nodeInfo.requestedMemory,
+                workers=nodeInfo.workers,
+                time=time.time(),  # add time stamp
+            )
+
         if self.scaleable:
             logger.debug("Starting to gather statistics")
-            stats:
+            stats: dict[str, list[dict[str, Any]]] = {}
             if not isinstance(self.batchSystem, AbstractScalableBatchSystem):
-                raise RuntimeError(
+                raise RuntimeError(
+                    "Non-scalable batch system abusing a scalable-only function."
+                )
             try:
                 while not self.stop:
                     nodeInfo = self.batchSystem.getNodes(preemptible)
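Given the new annotations (`self.stats: dict[str, dict[str, list[dict[str, Any]]]]` and the per-IP `stats` dict) and the fields filled in by `toDict` above, the JSON written at shutdown nests thread name → node IP → list of samples; the per-thread key (e.g. "Preemptible") is assigned in the final hunk below. A hypothetical example of that structure (the IP, numbers, and timestamp are made up):

```python
# Hypothetical example of the structure that json.dump(self.stats, f) writes,
# following the type annotations and toDict() fields shown in this diff.
example_stats = {
    "Preemptible": {
        "10.0.0.12": [
            {
                "memory": 6.4e9,          # memoryUsed
                "cores": 3.5,             # coresUsed
                "memoryTotal": 8.0e9,
                "coresTotal": 4,
                "requestedCores": 4,
                "requestedMemory": 7.0e9,
                "workers": 2,
                "time": 1700000000.0,     # time.time() stamp added by toDict
            }
        ]
    }
}
```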
@@ -1255,6 +1515,8 @@ class ClusterStats:
                                 stats[nodeIP] = [nodeStatsDict]
                     time.sleep(60)
             finally:
-                threadName =
-                logger.debug(
+                threadName = "Preemptible" if preemptible else "Non-preemptible"
+                logger.debug(
+                    "%s provisioner stats thread shut down successfully.", threadName
+                )
                 self.stats[threadName] = stats