toil 7.0.0__py3-none-any.whl → 8.1.0b1__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
- toil/__init__.py +124 -86
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +137 -77
- toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
- toil/batchSystems/awsBatch.py +237 -128
- toil/batchSystems/cleanup_support.py +22 -16
- toil/batchSystems/contained_executor.py +30 -26
- toil/batchSystems/gridengine.py +85 -49
- toil/batchSystems/htcondor.py +164 -87
- toil/batchSystems/kubernetes.py +622 -386
- toil/batchSystems/local_support.py +17 -12
- toil/batchSystems/lsf.py +132 -79
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +288 -149
- toil/batchSystems/mesos/executor.py +77 -49
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +39 -29
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +293 -123
- toil/batchSystems/slurm.py +651 -155
- toil/batchSystems/torque.py +46 -32
- toil/bus.py +141 -73
- toil/common.py +784 -397
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1137 -534
- toil/cwl/utils.py +17 -22
- toil/deferred.py +62 -41
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +88 -57
- toil/fileStores/cachingFileStore.py +711 -247
- toil/fileStores/nonCachingFileStore.py +113 -75
- toil/job.py +1031 -349
- toil/jobStores/abstractJobStore.py +387 -243
- toil/jobStores/aws/jobStore.py +772 -412
- toil/jobStores/aws/utils.py +161 -109
- toil/jobStores/conftest.py +1 -0
- toil/jobStores/fileJobStore.py +289 -151
- toil/jobStores/googleJobStore.py +137 -70
- toil/jobStores/utils.py +36 -15
- toil/leader.py +614 -269
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +55 -28
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +204 -58
- toil/lib/aws/utils.py +290 -213
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +83 -49
- toil/lib/docker.py +131 -103
- toil/lib/dockstore.py +379 -0
- toil/lib/ec2.py +322 -209
- toil/lib/ec2nodes.py +174 -105
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +4 -2
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/history.py +1271 -0
- toil/lib/history_submission.py +681 -0
- toil/lib/humanize.py +6 -2
- toil/lib/io.py +121 -12
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +83 -18
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +19 -7
- toil/lib/retry.py +125 -87
- toil/lib/threading.py +282 -80
- toil/lib/throttle.py +15 -14
- toil/lib/trs.py +390 -0
- toil/lib/web.py +38 -0
- toil/options/common.py +850 -402
- toil/options/cwl.py +185 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +70 -19
- toil/provisioners/__init__.py +111 -46
- toil/provisioners/abstractProvisioner.py +322 -157
- toil/provisioners/aws/__init__.py +62 -30
- toil/provisioners/aws/awsProvisioner.py +980 -627
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +283 -180
- toil/provisioners/node.py +147 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +127 -61
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +84 -55
- toil/server/utils.py +56 -31
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +183 -65
- toil/test/__init__.py +263 -179
- toil/test/batchSystems/batchSystemTest.py +438 -195
- toil/test/batchSystems/batch_system_plugin_test.py +18 -7
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +265 -49
- toil/test/cactus/test_cactus_integration.py +20 -22
- toil/test/cwl/conftest.py +39 -0
- toil/test/cwl/cwlTest.py +375 -72
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/optional-file.cwl +18 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/docs/scriptsTest.py +60 -34
- toil/test/jobStores/jobStoreTest.py +412 -235
- toil/test/lib/aws/test_iam.py +116 -48
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +57 -49
- toil/test/lib/test_history.py +212 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/lib/test_trs.py +161 -0
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/options.py +7 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +81 -42
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +140 -100
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +33 -26
- toil/test/src/environmentTest.py +20 -10
- toil/test/src/fileStoreTest.py +538 -271
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +32 -17
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +120 -70
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +6 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +33 -16
- toil/test/utils/toilDebugTest.py +70 -58
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +239 -102
- toil/test/wdl/wdltoil_test.py +789 -148
- toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
- toil/toilState.py +52 -26
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +85 -25
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +251 -145
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +27 -14
- toil/utils/toilSshCluster.py +45 -22
- toil/utils/toilStats.py +75 -36
- toil/utils/toilStatus.py +226 -119
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +6 -6
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3528 -1053
- toil/worker.py +370 -149
- toil-8.1.0b1.dist-info/METADATA +178 -0
- toil-8.1.0b1.dist-info/RECORD +259 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/WHEEL +1 -1
- toil-7.0.0.dist-info/METADATA +0 -158
- toil-7.0.0.dist-info/RECORD +0 -244
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/LICENSE +0 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/entry_points.txt +0 -0
- {toil-7.0.0.dist-info → toil-8.1.0b1.dist-info}/top_level.txt +0 -0
@@ -20,7 +20,7 @@ import tempfile
 import textwrap
 from abc import ABC, abstractmethod
 from functools import total_ordering
-from typing import Any,
+from typing import Any, Optional, Union
 from urllib.parse import quote
 from uuid import uuid4
 
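Note: the narrowed import reflects a move, visible throughout the hunks below, from the `typing` aliases (Dict, List, Set, Tuple) to the built-in generics accepted since Python 3.9, so only Any, Optional and Union still come from `typing`. A schematic before/after sketch with a hypothetical function name, not the actual Toil signatures:

    from typing import Optional

    # Pre-3.9 style:
    #   from typing import Dict, List, Set
    #   def pick_node_types(node_types: Set[str]) -> List[str]: ...

    # Built-in generics, matching the style of the new annotations below:
    def pick_node_types(node_types: set[str], spot_bid: Optional[float] = None) -> list[str]:
        return sorted(node_types)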
@@ -55,6 +55,7 @@ class Shape:
     The memory and disk attributes store the number of bytes required by a job (or provided by a
     node) in RAM or on disk (SSD or HDD), respectively.
     """
+
     def __init__(
         self,
         wallTime: Union[int, float],
@@ -70,11 +71,13 @@ class Shape:
         self.preemptible = preemptible

     def __eq__(self, other: Any) -> bool:
-        return (
-
-
-
-
+        return (
+            self.wallTime == other.wallTime
+            and self.memory == other.memory
+            and self.cores == other.cores
+            and self.disk == other.disk
+            and self.preemptible == other.preemptible
+        )

     def greater_than(self, other: Any) -> bool:
         if self.preemptible < other.preemptible:
@@ -104,12 +107,13 @@ class Shape:
         return self.greater_than(other)

     def __repr__(self) -> str:
-        return "Shape(wallTime=%s, memory=%s, cores=%s, disk=%s, preemptible=%s)" %
-
-
-
-
-
+        return "Shape(wallTime=%s, memory=%s, cores=%s, disk=%s, preemptible=%s)" % (
+            self.wallTime,
+            self.memory,
+            self.cores,
+            self.disk,
+            self.preemptible,
+        )

     def __str__(self) -> str:
         return self.__repr__()
@@ -117,17 +121,14 @@ class Shape:
     def __hash__(self) -> int:
         # Since we replaced __eq__ we need to replace __hash__ as well.
         return hash(
-            (self.wallTime,
-
-             self.cores,
-             self.disk,
-             self.preemptible))
+            (self.wallTime, self.memory, self.cores, self.disk, self.preemptible)
+        )


 class AbstractProvisioner(ABC):
     """Interface for provisioning worker nodes to use in a Toil cluster."""

-    LEADER_HOME_DIR =
+    LEADER_HOME_DIR = "/root/"  # home directory in the Toil appliance on an instance
     cloud: str = None

     def __init__(
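Note: the Shape hunks above are formatting-only, but they embody a value-object contract that is easy to misread in fragments: because the class overrides __eq__, it must also override __hash__ over the same field tuple so that equal shapes hash identically (the comment in the hunk says as much). A minimal standalone sketch of that pattern, using a hypothetical class rather than the Toil implementation:

    class NodeShape:
        def __init__(self, wall_time, memory, cores, disk, preemptible):
            self.wall_time = wall_time
            self.memory = memory
            self.cores = cores
            self.disk = disk
            self.preemptible = preemptible

        def _key(self):
            # Single source of truth for both equality and hashing.
            return (self.wall_time, self.memory, self.cores, self.disk, self.preemptible)

        def __eq__(self, other):
            return isinstance(other, NodeShape) and self._key() == other._key()

        def __hash__(self):
            # Defining __eq__ suppresses the inherited __hash__, so it must be
            # restored over the same tuple used for equality.
            return hash(self._key())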
@@ -136,8 +137,8 @@ class AbstractProvisioner(ABC):
         clusterType: Optional[str] = "mesos",
         zone: Optional[str] = None,
         nodeStorage: int = 50,
-        nodeStorageOverrides: Optional[
-        enable_fuse: bool = False
+        nodeStorageOverrides: Optional[list[str]] = None,
+        enable_fuse: bool = False,
     ) -> None:
         """
         Initialize provisioner.
@@ -161,7 +162,7 @@ class AbstractProvisioner(ABC):
         self._nodeStorage = nodeStorage
         self._nodeStorageOverrides = {}
         for override in nodeStorageOverrides or []:
-            nodeShape, storageOverride = override.split(
+            nodeShape, storageOverride = override.split(":")
             self._nodeStorageOverrides[nodeShape] = int(storageOverride)
         self._leaderPrivateIP: Optional[str] = None
         # This will hold an SSH public key for Mesos clusters, or the
@@ -179,7 +180,7 @@ class AbstractProvisioner(ABC):
         self.readClusterSettings()

     @abstractmethod
-    def supportedClusterTypes(self) ->
+    def supportedClusterTypes(self) -> set[str]:
         """
         Get all the cluster types that this provisioner implementation
         supports.
@@ -245,12 +246,14 @@ class AbstractProvisioner(ABC):
         :param leader: Node to pull credentials from, if not the current machine.
         """

-        if self.clusterType ==
+        if self.clusterType == "mesos":
             # We're using a Mesos cluster, so set up SSH from leader to workers.
             self._leaderWorkerAuthentication = self._setSSH(leader=leader)
-        elif self.clusterType ==
+        elif self.clusterType == "kubernetes":
             # We're using a Kubernetes cluster.
-            self._leaderWorkerAuthentication = self._getKubernetesJoiningInfo(
+            self._leaderWorkerAuthentication = self._getKubernetesJoiningInfo(
+                leader=leader
+            )

     def _clearLeaderWorkerAuthentication(self):
         """
@@ -277,16 +280,22 @@ class AbstractProvisioner(ABC):

         # To work locally or remotely we need to do all our setup work as one
         # big bash -c
-        command = [
-
-
+        command = [
+            "bash",
+            "-c",
+            (
+                "set -e; if [ ! -e /root/.sshSuccess ] ; "
+                'then ssh-keygen -f /root/.ssh/id_rsa -t rsa -N ""; '
+                "touch /root/.sshSuccess; fi; chmod 700 /root/.ssh;"
+            ),
+        ]

         if leader is None:
             # Run locally
             subprocess.check_call(command)

             # Grab from local file
-            with open(
+            with open("/root/.ssh/id_rsa.pub") as f:
                 leaderPublicKey = f.read()
         else:
             # Run remotely
@@ -294,20 +303,20 @@ class AbstractProvisioner(ABC):

             # Grab from remote file
             with tempfile.TemporaryDirectory() as tmpdir:
-                localFile = os.path.join(tmpdir,
-                leader.extractFile(
+                localFile = os.path.join(tmpdir, "id_rsa.pub")
+                leader.extractFile("/root/.ssh/id_rsa.pub", localFile, "toil_leader")

                 with open(localFile) as f:
                     leaderPublicKey = f.read()

         # Drop the key type and keep just the key data
-        leaderPublicKey = leaderPublicKey.split(
+        leaderPublicKey = leaderPublicKey.split(" ")[1]

         # confirm it really is an RSA public key
-        assert leaderPublicKey.startswith(
+        assert leaderPublicKey.startswith("AAAAB3NzaC1yc2E"), leaderPublicKey
         return leaderPublicKey

-    def _getKubernetesJoiningInfo(self, leader: Node = None) ->
+    def _getKubernetesJoiningInfo(self, leader: Node = None) -> dict[str, str]:
         """
         Get the Kubernetes joining info created when Kubernetes was set up on
         this node, which is the leader, or on a different specified Node.
@@ -327,22 +336,24 @@ class AbstractProvisioner(ABC):
             # This info is always supposed to be set up before the Toil appliance
             # starts, and mounted in at the same path as on the host. So we just go
             # read it.
-            with open(
+            with open("/etc/kubernetes/worker.ini") as f:
                 config.read_file(f)
         else:
             # Grab from remote file
             with tempfile.TemporaryDirectory() as tmpdir:
-                localFile = os.path.join(tmpdir,
-                leader.extractFile(
+                localFile = os.path.join(tmpdir, "worker.ini")
+                leader.extractFile(
+                    "/etc/kubernetes/worker.ini", localFile, "toil_leader"
+                )

                 with open(localFile) as f:
                     config.read_file(f)

         # Grab everything out of the default section where our setup script put
         # it.
-        return dict(config[
+        return dict(config["DEFAULT"])

-    def setAutoscaledNodeTypes(self, nodeTypes:
+    def setAutoscaledNodeTypes(self, nodeTypes: list[tuple[set[str], Optional[float]]]):
         """
         Set node types, shapes and spot bids for Toil-managed autoscaling.
         :param nodeTypes: A list of node types, as parsed with parse_node_types.
@@ -375,13 +386,13 @@ class AbstractProvisioner(ABC):
         """
         return len(self.getAutoscaledInstanceShapes()) > 0

-    def getAutoscaledInstanceShapes(self) ->
+    def getAutoscaledInstanceShapes(self) -> dict[Shape, str]:
         """
         Get all the node shapes and their named instance types that the Toil
         autoscaler should manage.
         """

-        if hasattr(self,
+        if hasattr(self, "_shape_to_instance_type"):
             # We have had Toil-managed autoscaling set up
             return dict(self._shape_to_instance_type)
         else:
@@ -418,7 +429,7 @@ class AbstractProvisioner(ABC):
     @abstractmethod
     def addNodes(
         self,
-        nodeTypes:
+        nodeTypes: set[str],
         numNodes: int,
         preemptible: bool,
         spotBid: Optional[float] = None,
@@ -433,7 +444,9 @@ class AbstractProvisioner(ABC):
         """
         raise NotImplementedError

-    def addManagedNodes(
+    def addManagedNodes(
+        self, nodeTypes: set[str], minNodes, maxNodes, preemptible, spotBid=None
+    ) -> None:
         """
         Add a group of managed nodes of the given type, up to the given maximum.
         The nodes will automatically be launched and terminated depending on cluster load.
@@ -448,10 +461,12 @@ class AbstractProvisioner(ABC):
         """

         # Not available by default
-        raise ManagedNodesNotSupportedException(
+        raise ManagedNodesNotSupportedException(
+            "Managed nodes not supported by this provisioner"
+        )

     @abstractmethod
-    def terminateNodes(self, nodes:
+    def terminateNodes(self, nodes: list[Node]) -> None:
         """
         Terminate the nodes represented by given Node objects

@@ -467,7 +482,9 @@ class AbstractProvisioner(ABC):
         raise NotImplementedError

     @abstractmethod
-    def getProvisionedWorkers(
+    def getProvisionedWorkers(
+        self, instance_type: Optional[str] = None, preemptible: Optional[bool] = None
+    ) -> list[Node]:
         """
         Gets all nodes, optionally of the given instance type or
         preemptability, from the provisioner. Includes both static and
@@ -514,7 +531,14 @@ class AbstractProvisioner(ABC):
             # Holds strings like "ssh-rsa actualKeyData" for keys to authorize (independently of cloud provider's system)
             self.sshPublicKeys = []

-        def addFile(
+        def addFile(
+            self,
+            path: str,
+            filesystem: str = "root",
+            mode: Union[str, int] = "0755",
+            contents: str = "",
+            append: bool = False,
+        ):
             """
             Make a file on the instance with the given filesystem, mode, and contents.

@@ -526,16 +550,21 @@ class AbstractProvisioner(ABC):
                 mode = int(mode, 8)
             assert isinstance(mode, int)

-            contents =
+            contents = "data:," + quote(contents.encode("utf-8"))

-            ignition_file = {
+            ignition_file = {
+                "path": path,
+                "filesystem": filesystem,
+                "mode": mode,
+                "contents": {"source": contents},
+            }

             if append:
                 ignition_file["append"] = append

             self.files.append(ignition_file)

-        def addUnit(self, name: str, enabled: bool = True, contents: str =
+        def addUnit(self, name: str, enabled: bool = True, contents: str = ""):
             """
             Make a systemd unit on the instance with the given name (including
             .service), and content. Units will be enabled by default.
@@ -546,7 +575,7 @@ class AbstractProvisioner(ABC):
             journalctl -xe
             """

-            self.units.append({
+            self.units.append({"name": name, "enabled": enabled, "contents": contents})

         def addSSHRSAKey(self, keyData: str):
             """
@@ -563,30 +592,19 @@ class AbstractProvisioner(ABC):
             # Define the base config. We're using Flatcar's v2.2.0 fork
             # See: https://github.com/kinvolk/ignition/blob/flatcar-master/doc/configuration-v2_2.md
             config = {
-
-
-                },
-                'storage': {
-                    'files': self.files
-                },
-                'systemd': {
-                    'units': self.units
-                }
+                "ignition": {"version": "2.2.0"},
+                "storage": {"files": self.files},
+                "systemd": {"units": self.units},
             }

             if len(self.sshPublicKeys) > 0:
                 # Add SSH keys if needed
-                config[
-
-                    {
-                        'name': 'core',
-                        'sshAuthorizedKeys': self.sshPublicKeys
-                    }
-                ]
+                config["passwd"] = {
+                    "users": [{"name": "core", "sshAuthorizedKeys": self.sshPublicKeys}]
                 }

             # Serialize as JSON
-            return json.dumps(config, separators=(
+            return json.dumps(config, separators=(",", ":"))

     def getBaseInstanceConfiguration(self) -> InstanceConfiguration:
         """
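Note: the InstanceConfiguration hunks above only re-quote existing logic: addFile() percent-encodes each file body into a `data:` URL, and toIgnitionConfig() collects the files, units and optional SSH keys into one compact Ignition v2.2.0 JSON document. A simplified standalone sketch of that output shape (not the Toil class itself):

    import json
    from urllib.parse import quote

    def to_ignition(files, units, ssh_keys):
        # files is a list of (path, body) pairs; body is percent-encoded into a data: URL.
        storage_files = [
            {
                "path": path,
                "filesystem": "root",
                "mode": 0o644,
                "contents": {"source": "data:," + quote(body.encode("utf-8"))},
            }
            for path, body in files
        ]
        config = {
            "ignition": {"version": "2.2.0"},
            "storage": {"files": storage_files},
            "systemd": {"units": units},
        }
        if ssh_keys:
            config["passwd"] = {"users": [{"name": "core", "sshAuthorizedKeys": ssh_keys}]}
        return json.dumps(config, separators=(",", ":"))

    print(to_ignition([("/etc/coreos/update.conf", "GROUP=stable\nREBOOT_STRATEGY=off\n")], [], []))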
@@ -596,10 +614,16 @@ class AbstractProvisioner(ABC):
         config = self.InstanceConfiguration()

         # We set Flatcar's update reboot strategy to off
-        config.addFile(
+        config.addFile(
+            "/etc/coreos/update.conf",
+            mode="0644",
+            contents=textwrap.dedent(
+                """\
             GROUP=stable
             REBOOT_STRATEGY=off
-        """
+        """
+            ),
+        )

         # Then we have volume mounting. That always happens.
         self.addVolumesService(config)
@@ -621,7 +645,10 @@ class AbstractProvisioner(ABC):
         #
         # TODO: check what kind of instance this is, and what ephemeral volumes
         # *should* be there, and declaratively RAID and mount them.
-        config.addFile(
+        config.addFile(
+            "/home/core/volumes.sh",
+            contents=textwrap.dedent(
+                """\
             #!/bin/bash
             set -x
             ephemeral_count=0
@@ -684,9 +711,14 @@ class AbstractProvisioner(ABC):
             sudo mkdir -p /var/$directory
             sudo mount --bind /mnt/ephemeral/var/$directory /var/$directory
             done
-        """
+        """
+            ),
+        )
         # TODO: Make this retry?
-        config.addUnit(
+        config.addUnit(
+            "volume-mounting.service",
+            contents=textwrap.dedent(
+                """\
             [Unit]
             Description=mounts ephemeral volumes & bind mounts toil directories
             Before=docker.service
@@ -698,14 +730,19 @@ class AbstractProvisioner(ABC):

             [Install]
             WantedBy=multi-user.target
-        """
+        """
+            ),
+        )

     def addNodeExporterService(self, config: InstanceConfiguration):
         """
         Add the node exporter service for Prometheus to an instance configuration.
         """

-        config.addUnit(
+        config.addUnit(
+            "node-exporter.service",
+            contents=textwrap.dedent(
+                """\
             [Unit]
             Description=node-exporter container
             After=docker.service
@@ -728,12 +765,20 @@ class AbstractProvisioner(ABC):

             [Install]
             WantedBy=multi-user.target
-
+        """
+            ),
+        )

     def toil_service_env_options(self) -> str:
         return "-e TMPDIR=/var/tmp"

-    def add_toil_service(
+    def add_toil_service(
+        self,
+        config: InstanceConfiguration,
+        role: str,
+        keyPath: str = None,
+        preemptible: bool = False,
+    ):
         """
         Add the Toil leader or worker service to an instance configuration.

@@ -750,46 +795,59 @@ class AbstractProvisioner(ABC):
         # transferred. The waitForKey.sh script loops on the new VM until it finds the keyPath file, then it starts the
         # mesos-agent. If there are multiple keys to be transferred, then the last one to be transferred must be
         # set to keyPath.
-        MESOS_LOG_DIR =
-        LEADER_DOCKER_ARGS =
+        MESOS_LOG_DIR = "--log_dir=/var/lib/mesos "
+        LEADER_DOCKER_ARGS = "--registry=in_memory --cluster={name}"
         # --no-systemd_enable_support is necessary in Ubuntu 16.04 (otherwise,
         # Mesos attempts to contact systemd but can't find its run file)
-        WORKER_DOCKER_ARGS =
-
-        if self.clusterType ==
-            if role ==
-                entryPoint =
-                entryPointArgs = MESOS_LOG_DIR + LEADER_DOCKER_ARGS.format(
-
-
-
-
+        WORKER_DOCKER_ARGS = "--work_dir=/var/lib/mesos --master={ip}:5050 --attributes=preemptible:{preemptible} --no-hostname_lookup --no-systemd_enable_support"
+
+        if self.clusterType == "mesos":
+            if role == "leader":
+                entryPoint = "mesos-master"
+                entryPointArgs = MESOS_LOG_DIR + LEADER_DOCKER_ARGS.format(
+                    name=self.clusterName
+                )
+            elif role == "worker":
+                entryPoint = "mesos-agent"
+                entryPointArgs = MESOS_LOG_DIR + WORKER_DOCKER_ARGS.format(
+                    ip=self._leaderPrivateIP, preemptible=preemptible
+                )
             else:
                 raise RuntimeError("Unknown role %s" % role)
-        elif self.clusterType ==
-            if role ==
+        elif self.clusterType == "kubernetes":
+            if role == "leader":
                 # We need *an* entry point or the leader container will finish
                 # and go away, and thus not be available to take user logins.
-                entryPoint =
-                entryPointArgs =
+                entryPoint = "sleep"
+                entryPointArgs = "infinity"
             else:
-                raise RuntimeError(
-
+                raise RuntimeError(
+                    "Toil service not needed for %s nodes in a %s cluster",
+                    role,
+                    self.clusterType,
+                )
         else:
-            raise RuntimeError(
+            raise RuntimeError(
+                "Toil service not needed in a %s cluster", self.clusterType
+            )

         if keyPath:
-            entryPointArgs = keyPath +
+            entryPointArgs = keyPath + " " + entryPointArgs
             entryPoint = "waitForKey.sh"
         customDockerInitCommand = customDockerInitCmd()
         if customDockerInitCommand:
-            entryPointArgs = " ".join(
+            entryPointArgs = " ".join(
+                ["'" + customDockerInitCommand + "'", entryPoint, entryPointArgs]
+            )
             entryPoint = "customDockerInit.sh"

         # Set up the service. Make sure to make it default to using the
         # actually-big temp directory of /var/tmp (see
         # https://systemd.io/TEMPORARY_DIRECTORIES/).
-        config.addUnit(
+        config.addUnit(
+            f"toil-{role}.service",
+            contents=textwrap.dedent(
+                f"""\
             [Unit]
             Description=toil-{role} container
             After=docker.service
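Note: the add_toil_service hunk above fills in the Docker argument templates: the cluster type and role pick an entry point, str.format substitutes the leader IP and preemptibility into the Mesos agent arguments, and a key transfer or custom init command wraps the entry point again. A small worked example with stand-in values, not taken from a real cluster:

    WORKER_DOCKER_ARGS = (
        "--work_dir=/var/lib/mesos --master={ip}:5050 "
        "--attributes=preemptible:{preemptible} "
        "--no-hostname_lookup --no-systemd_enable_support"
    )

    entry_point = "mesos-agent"
    entry_point_args = WORKER_DOCKER_ARGS.format(ip="10.0.0.5", preemptible=False)

    # With a key to wait for, waitForKey.sh becomes the entry point and the key
    # path is prepended to the arguments, mirroring the hunk above.
    key_path = "/var/lib/toil/worker.key"  # hypothetical path
    entry_point_args = key_path + " " + entry_point_args
    entry_point = "waitForKey.sh"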
@@ -828,9 +886,11 @@ class AbstractProvisioner(ABC):

             [Install]
             WantedBy=multi-user.target
-
+        """
+            ),
+        )

-    def getKubernetesValues(self, architecture: str =
+    def getKubernetesValues(self, architecture: str = "amd64"):
         """
         Returns a dict of Kubernetes component versions and paths for formatting into Kubernetes-related templates.
         """
@@ -857,10 +917,14 @@ class AbstractProvisioner(ABC):
             METRICS_API_VERSION="v0.3.7",
             CLUSTER_NAME=self.clusterName,
             # YAML line that tells the Kubelet to use a cloud provider, if we need one.
-            CLOUD_PROVIDER_SPEC=(
+            CLOUD_PROVIDER_SPEC=(
+                ("cloud-provider: " + cloud_provider) if cloud_provider else ""
+            ),
         )

-    def addKubernetesServices(
+    def addKubernetesServices(
+        self, config: InstanceConfiguration, architecture: str = "amd64"
+    ):
         """
         Add installing Kubernetes and Kubeadm and setting up the Kubelet to run when configured to an instance configuration.
         The same process applies to leaders and workers.
@@ -869,7 +933,10 @@ class AbstractProvisioner(ABC):
         values = self.getKubernetesValues(architecture)

         # We're going to ship the Kubelet service from Kubernetes' release pipeline via cloud-config
-        config.addUnit(
+        config.addUnit(
+            "kubelet.service",
+            contents=textwrap.dedent(
+                """\
             # This came from https://raw.githubusercontent.com/kubernetes/release/v0.4.0/cmd/kubepkg/templates/latest/deb/kubelet/lib/systemd/system/kubelet.service
             # It has been modified to replace /usr/bin with {DOWNLOAD_DIR}
             # License: https://raw.githubusercontent.com/kubernetes/release/v0.4.0/LICENSE
@@ -888,11 +955,16 @@ class AbstractProvisioner(ABC):

             [Install]
             WantedBy=multi-user.target
-
+        """
+            ).format(**values),
+        )

         # It needs this config file
-        config.addFile(
-
+        config.addFile(
+            "/etc/systemd/system/kubelet.service.d/10-kubeadm.conf",
+            mode="0644",
+            contents=textwrap.dedent(
+                """\
             # This came from https://raw.githubusercontent.com/kubernetes/release/v0.4.0/cmd/kubepkg/templates/latest/deb/kubeadm/10-kubeadm.conf
             # It has been modified to replace /usr/bin with {DOWNLOAD_DIR}
             # License: https://raw.githubusercontent.com/kubernetes/release/v0.4.0/LICENSE
@@ -908,7 +980,9 @@ class AbstractProvisioner(ABC):
             EnvironmentFile=-/etc/default/kubelet
             ExecStart=
             ExecStart={DOWNLOAD_DIR}/kubelet $KUBELET_KUBECONFIG_ARGS $KUBELET_CONFIG_ARGS $KUBELET_KUBEADM_ARGS $KUBELET_EXTRA_ARGS
-
+        """
+            ).format(**values),
+        )

         # Before we let the kubelet try to start, we have to actually download it (and kubeadm)
         # We set up this service so it can restart on failure despite not
@@ -919,7 +993,10 @@ class AbstractProvisioner(ABC):
         # restarts work if the script fails. We also use a condition which
         # treats the service as successful and skips it if it made a file to
         # say it already ran.
-        config.addFile(
+        config.addFile(
+            "/home/core/install-kubernetes.sh",
+            contents=textwrap.dedent(
+                """\
             #!/usr/bin/env bash
             set -e
             FLAG_FILE="{SETUP_STATE_DIR}/install-kubernetes.done"
@@ -938,8 +1015,13 @@ class AbstractProvisioner(ABC):

             mkdir -p "{SETUP_STATE_DIR}"
             touch "$FLAG_FILE"
-
-
+        """
+            ).format(**values),
+        )
+        config.addUnit(
+            "install-kubernetes.service",
+            contents=textwrap.dedent(
+                """\
             [Unit]
             Description=base Kubernetes installation
             Wants=network-online.target
@@ -957,12 +1039,14 @@ class AbstractProvisioner(ABC):
             [Install]
             WantedBy=multi-user.target
             RequiredBy=kubelet.service
-
+        """
+            ).format(**values),
+        )

         # Now we should have the kubeadm command, and the bootlooping kubelet
         # waiting for kubeadm to configure it.

-    def getKubernetesAutoscalerSetupCommands(self, values:
+    def getKubernetesAutoscalerSetupCommands(self, values: dict[str, str]) -> str:
         """
         Return Bash commands that set up the Kubernetes cluster autoscaler for
         provisioning from the environment supported by this provisioner.
@@ -997,7 +1081,11 @@ class AbstractProvisioner(ABC):

         # Customize scheduler to pack jobs into as few nodes as possible
         # See: https://kubernetes.io/docs/reference/scheduling/config/#profiles
-        config.addFile(
+        config.addFile(
+            "/home/core/scheduler-config.yml",
+            mode="0644",
+            contents=textwrap.dedent(
+                """\
             apiVersion: kubescheduler.config.k8s.io/v1beta1
             kind: KubeSchedulerConfiguration
             clientConnection:
@@ -1011,13 +1099,21 @@ class AbstractProvisioner(ABC):
             enabled:
             - name: NodeResourcesMostAllocated
               weight: 1
-
+        """.format(
+                **values
+            )
+            ),
+        )

         # Main kubeadm cluster configuration.
         # Make sure to mount the scheduler config where the scheduler can see
         # it, which is undocumented but inferred from
         # https://pkg.go.dev/k8s.io/kubernetes@v1.21.0/cmd/kubeadm/app/apis/kubeadm#ControlPlaneComponent
-        config.addFile(
+        config.addFile(
+            "/home/core/kubernetes-leader.yml",
+            mode="0644",
+            contents=textwrap.dedent(
+                """\
             apiVersion: kubeadm.k8s.io/v1beta2
             kind: InitConfiguration
             nodeRegistration:
@@ -1049,11 +1145,18 @@ class AbstractProvisioner(ABC):
             serverTLSBootstrap: true
             rotateCertificates: true
             cgroupDriver: systemd
-
+        """.format(
+                **values
+            )
+            ),
+        )

         # Make a script to apply that and the other cluster components
         # Note that we're escaping {{thing}} as {{{{thing}}}} because we need to match mustaches in a yaml we hack up.
-        config.addFile(
+        config.addFile(
+            "/home/core/create-kubernetes-cluster.sh",
+            contents=textwrap.dedent(
+                """\
             #!/usr/bin/env bash
             set -e

@@ -1086,7 +1189,11 @@ class AbstractProvisioner(ABC):
             kubectl apply -f https://raw.githubusercontent.com/kontena/kubelet-rubber-stamp/release/{RUBBER_STAMP_VERSION}/deploy/role_binding.yaml
             kubectl apply -f https://raw.githubusercontent.com/kontena/kubelet-rubber-stamp/release/{RUBBER_STAMP_VERSION}/deploy/operator.yaml

-
+        """
+            ).format(**values)
+            + self.getKubernetesAutoscalerSetupCommands(values)
+            + textwrap.dedent(
+                """\
             # Set up metrics server, which needs serverTLSBootstrap and rubber stamp, and insists on running on a worker
             curl -sSL https://github.com/kubernetes-sigs/metrics-server/releases/download/{METRICS_API_VERSION}/components.yaml | \\
             sed 's/ - --secure-port=4443/ - --secure-port=4443\\n - --kubelet-preferred-address-types=Hostname/' | \\
@@ -1100,8 +1207,13 @@ class AbstractProvisioner(ABC):

             mkdir -p "{SETUP_STATE_DIR}"
             touch "$FLAG_FILE"
-
-
+        """
+            ).format(**values),
+        )
+        config.addUnit(
+            "create-kubernetes-cluster.service",
+            contents=textwrap.dedent(
+                """\
             [Unit]
             Description=Kubernetes cluster bootstrap
             After=install-kubernetes.service
@@ -1120,10 +1232,15 @@ class AbstractProvisioner(ABC):
             [Install]
             WantedBy=multi-user.target
             RequiredBy=toil-leader.service
-
+        """
+            ).format(**values),
+        )

         # We also need a node cleaner service
-        config.addFile(
+        config.addFile(
+            "/home/core/cleanup-nodes.sh",
+            contents=textwrap.dedent(
+                """\
             #!/usr/bin/env bash
             # cleanup-nodes.sh: constantly clean up NotReady nodes that are tainted as having been deleted
             set -e
@@ -1142,8 +1259,13 @@ class AbstractProvisioner(ABC):
             done
             sleep 300
             done
-
-
+        """
+            ).format(**values),
+        )
+        config.addUnit(
+            "cleanup-nodes.service",
+            contents=textwrap.dedent(
+                """\
             [Unit]
             Description=Remove scaled-in nodes
             After=create-kubernetes-cluster.service
@@ -1155,9 +1277,16 @@ class AbstractProvisioner(ABC):
             RestartSec=10
             [Install]
             WantedBy=multi-user.target
-
+        """
+            ),
+        )

-    def addKubernetesWorker(
+    def addKubernetesWorker(
+        self,
+        config: InstanceConfiguration,
+        authVars: dict[str, str],
+        preemptible: bool = False,
+    ):
         """
         Add services to configure as a Kubernetes worker, if Kubernetes is
         already set to be installed.
@@ -1177,10 +1306,16 @@ class AbstractProvisioner(ABC):
         # TODO: We use the same label that EKS uses here, because nothing is standardized.
         # This won't be quite appropriate as we aren't on EKS and we might not
         # even be on AWS, but the batch system should understand it.
-        values[
+        values["WORKER_LABEL_SPEC"] = (
+            'node-labels: "eks.amazonaws.com/capacityType=SPOT"' if preemptible else ""
+        )

         # Kubeadm worker configuration
-        config.addFile(
+        config.addFile(
+            "/home/core/kubernetes-worker.yml",
+            mode="0644",
+            contents=textwrap.dedent(
+                """\
             apiVersion: kubeadm.k8s.io/v1beta2
             kind: JoinConfiguration
             nodeRegistration:
@@ -1198,10 +1333,17 @@ class AbstractProvisioner(ABC):
             apiVersion: kubelet.config.k8s.io/v1beta1
             kind: KubeletConfiguration
             cgroupDriver: systemd
-
+        """.format(
+                **values
+            )
+            ),
+        )

         # Make a script to join the cluster using that configuration
-        config.addFile(
+        config.addFile(
+            "/home/core/join-kubernetes-cluster.sh",
+            contents=textwrap.dedent(
+                """\
             #!/usr/bin/env bash
             set -e
             FLAG_FILE="{SETUP_STATE_DIR}/join-kubernetes-cluster.done"
@@ -1216,9 +1358,14 @@ class AbstractProvisioner(ABC):

             mkdir -p "{SETUP_STATE_DIR}"
             touch "$FLAG_FILE"
-
+        """
+            ).format(**values),
+        )

-        config.addUnit(
+        config.addUnit(
+            "join-kubernetes-cluster.service",
+            contents=textwrap.dedent(
+                """\
             [Unit]
             Description=Kubernetes cluster membership
             After=install-kubernetes.service
@@ -1236,9 +1383,17 @@ class AbstractProvisioner(ABC):

             [Install]
             WantedBy=multi-user.target
-
+        """
+            ).format(**values),
+        )

-    def _getIgnitionUserData(
+    def _getIgnitionUserData(
+        self,
+        role: str,
+        keyPath: Optional[str] = None,
+        preemptible: bool = False,
+        architecture: str = "amd64",
+    ) -> str:
         """
         Return the text (not bytes) user data to pass to a provisioned node.

@@ -1252,33 +1407,35 @@ class AbstractProvisioner(ABC):
         # Start with a base config
         config = self.getBaseInstanceConfiguration()

-        if self.clusterType ==
+        if self.clusterType == "kubernetes":
             # Install Kubernetes
             self.addKubernetesServices(config, architecture)

-            if role ==
+            if role == "leader":
                 # Set up the cluster
                 self.addKubernetesLeader(config)

             # We can't actually set up a Kubernetes worker without credentials
             # to connect back to the leader.

-        if self.clusterType ==
+        if self.clusterType == "mesos" or role == "leader":
             # Leaders, and all nodes in a Mesos cluster, need a Toil service
             self.add_toil_service(config, role, keyPath, preemptible)

-        if role ==
+        if role == "worker" and self._leaderWorkerAuthentication is not None:
             # We need to connect the worker to the leader.
-            if self.clusterType ==
+            if self.clusterType == "mesos":
                 # This involves an SSH public key form the leader
                 config.addSSHRSAKey(self._leaderWorkerAuthentication)
-            elif self.clusterType ==
+            elif self.clusterType == "kubernetes":
                 # We can install the Kubernetes worker and make it phone home
                 # to the leader.
                 # TODO: this puts sufficient info to fake a malicious worker
                 # into the worker config, which probably is accessible by
                 # anyone in the cloud account.
-                self.addKubernetesWorker(
+                self.addKubernetesWorker(
+                    config, self._leaderWorkerAuthentication, preemptible=preemptible
+                )

         # Make it into a string for Ignition
         user_data = config.toIgnitionConfig()
@@ -1289,21 +1446,29 @@ class AbstractProvisioner(ABC):
         user_data_limit: int = self._get_user_data_limit()

         if len(user_data) > user_data_limit:
-            logger.warning(
-
-
-
-
-
-
-
-
-
-
-
-
+            logger.warning(
+                f"Ignition config size exceeds the user data limit ({len(user_data)} > {user_data_limit}). "
+                "Writing to cloud storage..."
+            )
+
+            src = self._write_file_to_cloud(
+                f"configs/{role}/config-{uuid4()}.ign",
+                contents=user_data.encode("utf-8"),
+            )
+
+            return json.dumps(
+                {
+                    "ignition": {
+                        "version": "2.2.0",
+                        # See: https://github.com/coreos/ignition/blob/spec2x/doc/configuration-v2_2.md
+                        "config": {
+                            "replace": {
+                                "source": src,
+                            }
+                        },
+                    }
                     }
-                }
-
+                },
+                separators=(",", ":"),
+            )

         return user_data