toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +122 -315
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +173 -89
- toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
- toil/batchSystems/awsBatch.py +244 -135
- toil/batchSystems/cleanup_support.py +26 -16
- toil/batchSystems/contained_executor.py +31 -28
- toil/batchSystems/gridengine.py +86 -50
- toil/batchSystems/htcondor.py +166 -89
- toil/batchSystems/kubernetes.py +632 -382
- toil/batchSystems/local_support.py +20 -15
- toil/batchSystems/lsf.py +134 -81
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +290 -151
- toil/batchSystems/mesos/executor.py +79 -50
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +46 -28
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +296 -125
- toil/batchSystems/slurm.py +603 -138
- toil/batchSystems/torque.py +47 -33
- toil/bus.py +186 -76
- toil/common.py +664 -368
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1136 -483
- toil/cwl/utils.py +17 -22
- toil/deferred.py +63 -42
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +140 -60
- toil/fileStores/cachingFileStore.py +717 -269
- toil/fileStores/nonCachingFileStore.py +116 -87
- toil/job.py +1225 -368
- toil/jobStores/abstractJobStore.py +416 -266
- toil/jobStores/aws/jobStore.py +863 -477
- toil/jobStores/aws/utils.py +201 -120
- toil/jobStores/conftest.py +3 -2
- toil/jobStores/fileJobStore.py +292 -154
- toil/jobStores/googleJobStore.py +140 -74
- toil/jobStores/utils.py +36 -15
- toil/leader.py +668 -272
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +74 -31
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +214 -39
- toil/lib/aws/utils.py +287 -231
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +104 -47
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +361 -199
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +5 -3
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +141 -15
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +66 -21
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +68 -15
- toil/lib/retry.py +126 -81
- toil/lib/threading.py +299 -82
- toil/lib/throttle.py +16 -15
- toil/options/common.py +843 -409
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +73 -17
- toil/provisioners/__init__.py +117 -46
- toil/provisioners/abstractProvisioner.py +332 -157
- toil/provisioners/aws/__init__.py +70 -33
- toil/provisioners/aws/awsProvisioner.py +1145 -715
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +155 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +128 -62
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +224 -70
- toil/test/__init__.py +282 -183
- toil/test/batchSystems/batchSystemTest.py +460 -210
- toil/test/batchSystems/batch_system_plugin_test.py +90 -0
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +110 -49
- toil/test/cactus/__init__.py +0 -0
- toil/test/cactus/test_cactus_integration.py +56 -0
- toil/test/cwl/cwlTest.py +496 -287
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/cwl/seqtk_seq.cwl +1 -1
- toil/test/docs/scriptsTest.py +69 -46
- toil/test/jobStores/jobStoreTest.py +427 -264
- toil/test/lib/aws/test_iam.py +118 -50
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +58 -50
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/__init__.py +13 -0
- toil/test/options/options.py +42 -0
- toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +166 -44
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +141 -101
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +32 -24
- toil/test/src/environmentTest.py +135 -0
- toil/test/src/fileStoreTest.py +539 -272
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +46 -21
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +121 -71
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +10 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +73 -23
- toil/test/utils/toilDebugTest.py +103 -33
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +245 -106
- toil/test/wdl/wdltoil_test.py +818 -149
- toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
- toil/toilState.py +120 -35
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +214 -27
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +256 -140
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +32 -14
- toil/utils/toilSshCluster.py +49 -22
- toil/utils/toilStats.py +356 -273
- toil/utils/toilStatus.py +292 -139
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +12 -12
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3913 -1033
- toil/worker.py +367 -184
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-6.1.0a1.dist-info/METADATA +0 -125
- toil-6.1.0a1.dist-info/RECORD +0 -237
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
|
@@ -20,7 +20,7 @@ import tempfile
|
|
|
20
20
|
import textwrap
|
|
21
21
|
from abc import ABC, abstractmethod
|
|
22
22
|
from functools import total_ordering
|
|
23
|
-
from typing import Any,
|
|
23
|
+
from typing import Any, Optional, Union
|
|
24
24
|
from urllib.parse import quote
|
|
25
25
|
from uuid import uuid4
|
|
26
26
|
|
|
@@ -55,6 +55,7 @@ class Shape:
|
|
|
55
55
|
The memory and disk attributes store the number of bytes required by a job (or provided by a
|
|
56
56
|
node) in RAM or on disk (SSD or HDD), respectively.
|
|
57
57
|
"""
|
|
58
|
+
|
|
58
59
|
def __init__(
|
|
59
60
|
self,
|
|
60
61
|
wallTime: Union[int, float],
|
|
@@ -70,11 +71,13 @@ class Shape:
|
|
|
70
71
|
self.preemptible = preemptible
|
|
71
72
|
|
|
72
73
|
def __eq__(self, other: Any) -> bool:
|
|
73
|
-
return (
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
74
|
+
return (
|
|
75
|
+
self.wallTime == other.wallTime
|
|
76
|
+
and self.memory == other.memory
|
|
77
|
+
and self.cores == other.cores
|
|
78
|
+
and self.disk == other.disk
|
|
79
|
+
and self.preemptible == other.preemptible
|
|
80
|
+
)
|
|
78
81
|
|
|
79
82
|
def greater_than(self, other: Any) -> bool:
|
|
80
83
|
if self.preemptible < other.preemptible:
|
|
@@ -104,12 +107,13 @@ class Shape:
|
|
|
104
107
|
return self.greater_than(other)
|
|
105
108
|
|
|
106
109
|
def __repr__(self) -> str:
|
|
107
|
-
return "Shape(wallTime=%s, memory=%s, cores=%s, disk=%s, preemptible=%s)" %
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
110
|
+
return "Shape(wallTime=%s, memory=%s, cores=%s, disk=%s, preemptible=%s)" % (
|
|
111
|
+
self.wallTime,
|
|
112
|
+
self.memory,
|
|
113
|
+
self.cores,
|
|
114
|
+
self.disk,
|
|
115
|
+
self.preemptible,
|
|
116
|
+
)
|
|
113
117
|
|
|
114
118
|
def __str__(self) -> str:
|
|
115
119
|
return self.__repr__()
|
|
@@ -117,17 +121,14 @@ class Shape:
|
|
|
117
121
|
def __hash__(self) -> int:
|
|
118
122
|
# Since we replaced __eq__ we need to replace __hash__ as well.
|
|
119
123
|
return hash(
|
|
120
|
-
(self.wallTime,
|
|
121
|
-
|
|
122
|
-
self.cores,
|
|
123
|
-
self.disk,
|
|
124
|
-
self.preemptible))
|
|
124
|
+
(self.wallTime, self.memory, self.cores, self.disk, self.preemptible)
|
|
125
|
+
)
|
|
125
126
|
|
|
126
127
|
|
|
127
128
|
class AbstractProvisioner(ABC):
|
|
128
129
|
"""Interface for provisioning worker nodes to use in a Toil cluster."""
|
|
129
130
|
|
|
130
|
-
LEADER_HOME_DIR =
|
|
131
|
+
LEADER_HOME_DIR = "/root/" # home directory in the Toil appliance on an instance
|
|
131
132
|
cloud: str = None
|
|
132
133
|
|
|
133
134
|
def __init__(
|
|
@@ -136,7 +137,8 @@ class AbstractProvisioner(ABC):
|
|
|
136
137
|
clusterType: Optional[str] = "mesos",
|
|
137
138
|
zone: Optional[str] = None,
|
|
138
139
|
nodeStorage: int = 50,
|
|
139
|
-
nodeStorageOverrides: Optional[
|
|
140
|
+
nodeStorageOverrides: Optional[list[str]] = None,
|
|
141
|
+
enable_fuse: bool = False,
|
|
140
142
|
) -> None:
|
|
141
143
|
"""
|
|
142
144
|
Initialize provisioner.
|
|
@@ -160,13 +162,16 @@ class AbstractProvisioner(ABC):
|
|
|
160
162
|
self._nodeStorage = nodeStorage
|
|
161
163
|
self._nodeStorageOverrides = {}
|
|
162
164
|
for override in nodeStorageOverrides or []:
|
|
163
|
-
nodeShape, storageOverride = override.split(
|
|
165
|
+
nodeShape, storageOverride = override.split(":")
|
|
164
166
|
self._nodeStorageOverrides[nodeShape] = int(storageOverride)
|
|
165
|
-
self._leaderPrivateIP = None
|
|
167
|
+
self._leaderPrivateIP: Optional[str] = None
|
|
166
168
|
# This will hold an SSH public key for Mesos clusters, or the
|
|
167
169
|
# Kubernetes joining information as a dict for Kubernetes clusters.
|
|
168
170
|
self._leaderWorkerAuthentication = None
|
|
169
171
|
|
|
172
|
+
# Whether or not to use FUSE on the cluster. If true, the cluster's Toil containers will be launched in privileged mode
|
|
173
|
+
self.enable_fuse = enable_fuse
|
|
174
|
+
|
|
170
175
|
if clusterName:
|
|
171
176
|
# Making a new cluster
|
|
172
177
|
self.createClusterSettings()
|
|
@@ -175,7 +180,7 @@ class AbstractProvisioner(ABC):
|
|
|
175
180
|
self.readClusterSettings()
|
|
176
181
|
|
|
177
182
|
@abstractmethod
|
|
178
|
-
def supportedClusterTypes(self) ->
|
|
183
|
+
def supportedClusterTypes(self) -> set[str]:
|
|
179
184
|
"""
|
|
180
185
|
Get all the cluster types that this provisioner implementation
|
|
181
186
|
supports.
|
|
@@ -241,12 +246,14 @@ class AbstractProvisioner(ABC):
|
|
|
241
246
|
:param leader: Node to pull credentials from, if not the current machine.
|
|
242
247
|
"""
|
|
243
248
|
|
|
244
|
-
if self.clusterType ==
|
|
249
|
+
if self.clusterType == "mesos":
|
|
245
250
|
# We're using a Mesos cluster, so set up SSH from leader to workers.
|
|
246
251
|
self._leaderWorkerAuthentication = self._setSSH(leader=leader)
|
|
247
|
-
elif self.clusterType ==
|
|
252
|
+
elif self.clusterType == "kubernetes":
|
|
248
253
|
# We're using a Kubernetes cluster.
|
|
249
|
-
self._leaderWorkerAuthentication = self._getKubernetesJoiningInfo(
|
|
254
|
+
self._leaderWorkerAuthentication = self._getKubernetesJoiningInfo(
|
|
255
|
+
leader=leader
|
|
256
|
+
)
|
|
250
257
|
|
|
251
258
|
def _clearLeaderWorkerAuthentication(self):
|
|
252
259
|
"""
|
|
@@ -273,16 +280,22 @@ class AbstractProvisioner(ABC):
|
|
|
273
280
|
|
|
274
281
|
# To work locally or remotely we need to do all our setup work as one
|
|
275
282
|
# big bash -c
|
|
276
|
-
command = [
|
|
277
|
-
|
|
278
|
-
|
|
283
|
+
command = [
|
|
284
|
+
"bash",
|
|
285
|
+
"-c",
|
|
286
|
+
(
|
|
287
|
+
"set -e; if [ ! -e /root/.sshSuccess ] ; "
|
|
288
|
+
'then ssh-keygen -f /root/.ssh/id_rsa -t rsa -N ""; '
|
|
289
|
+
"touch /root/.sshSuccess; fi; chmod 700 /root/.ssh;"
|
|
290
|
+
),
|
|
291
|
+
]
|
|
279
292
|
|
|
280
293
|
if leader is None:
|
|
281
294
|
# Run locally
|
|
282
295
|
subprocess.check_call(command)
|
|
283
296
|
|
|
284
297
|
# Grab from local file
|
|
285
|
-
with open(
|
|
298
|
+
with open("/root/.ssh/id_rsa.pub") as f:
|
|
286
299
|
leaderPublicKey = f.read()
|
|
287
300
|
else:
|
|
288
301
|
# Run remotely
|
|
@@ -290,20 +303,20 @@ class AbstractProvisioner(ABC):
|
|
|
290
303
|
|
|
291
304
|
# Grab from remote file
|
|
292
305
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
293
|
-
localFile = os.path.join(tmpdir,
|
|
294
|
-
leader.extractFile(
|
|
306
|
+
localFile = os.path.join(tmpdir, "id_rsa.pub")
|
|
307
|
+
leader.extractFile("/root/.ssh/id_rsa.pub", localFile, "toil_leader")
|
|
295
308
|
|
|
296
309
|
with open(localFile) as f:
|
|
297
310
|
leaderPublicKey = f.read()
|
|
298
311
|
|
|
299
312
|
# Drop the key type and keep just the key data
|
|
300
|
-
leaderPublicKey = leaderPublicKey.split(
|
|
313
|
+
leaderPublicKey = leaderPublicKey.split(" ")[1]
|
|
301
314
|
|
|
302
315
|
# confirm it really is an RSA public key
|
|
303
|
-
assert leaderPublicKey.startswith(
|
|
316
|
+
assert leaderPublicKey.startswith("AAAAB3NzaC1yc2E"), leaderPublicKey
|
|
304
317
|
return leaderPublicKey
|
|
305
318
|
|
|
306
|
-
def _getKubernetesJoiningInfo(self, leader: Node = None) ->
|
|
319
|
+
def _getKubernetesJoiningInfo(self, leader: Node = None) -> dict[str, str]:
|
|
307
320
|
"""
|
|
308
321
|
Get the Kubernetes joining info created when Kubernetes was set up on
|
|
309
322
|
this node, which is the leader, or on a different specified Node.
|
|
@@ -323,22 +336,24 @@ class AbstractProvisioner(ABC):
|
|
|
323
336
|
# This info is always supposed to be set up before the Toil appliance
|
|
324
337
|
# starts, and mounted in at the same path as on the host. So we just go
|
|
325
338
|
# read it.
|
|
326
|
-
with open(
|
|
339
|
+
with open("/etc/kubernetes/worker.ini") as f:
|
|
327
340
|
config.read_file(f)
|
|
328
341
|
else:
|
|
329
342
|
# Grab from remote file
|
|
330
343
|
with tempfile.TemporaryDirectory() as tmpdir:
|
|
331
|
-
localFile = os.path.join(tmpdir,
|
|
332
|
-
leader.extractFile(
|
|
344
|
+
localFile = os.path.join(tmpdir, "worker.ini")
|
|
345
|
+
leader.extractFile(
|
|
346
|
+
"/etc/kubernetes/worker.ini", localFile, "toil_leader"
|
|
347
|
+
)
|
|
333
348
|
|
|
334
349
|
with open(localFile) as f:
|
|
335
350
|
config.read_file(f)
|
|
336
351
|
|
|
337
352
|
# Grab everything out of the default section where our setup script put
|
|
338
353
|
# it.
|
|
339
|
-
return dict(config[
|
|
354
|
+
return dict(config["DEFAULT"])
|
|
340
355
|
|
|
341
|
-
def setAutoscaledNodeTypes(self, nodeTypes:
|
|
356
|
+
def setAutoscaledNodeTypes(self, nodeTypes: list[tuple[set[str], Optional[float]]]):
|
|
342
357
|
"""
|
|
343
358
|
Set node types, shapes and spot bids for Toil-managed autoscaling.
|
|
344
359
|
:param nodeTypes: A list of node types, as parsed with parse_node_types.
|
|
@@ -371,13 +386,13 @@ class AbstractProvisioner(ABC):
|
|
|
371
386
|
"""
|
|
372
387
|
return len(self.getAutoscaledInstanceShapes()) > 0
|
|
373
388
|
|
|
374
|
-
def getAutoscaledInstanceShapes(self) ->
|
|
389
|
+
def getAutoscaledInstanceShapes(self) -> dict[Shape, str]:
|
|
375
390
|
"""
|
|
376
391
|
Get all the node shapes and their named instance types that the Toil
|
|
377
392
|
autoscaler should manage.
|
|
378
393
|
"""
|
|
379
394
|
|
|
380
|
-
if hasattr(self,
|
|
395
|
+
if hasattr(self, "_shape_to_instance_type"):
|
|
381
396
|
# We have had Toil-managed autoscaling set up
|
|
382
397
|
return dict(self._shape_to_instance_type)
|
|
383
398
|
else:
|
|
@@ -414,7 +429,7 @@ class AbstractProvisioner(ABC):
|
|
|
414
429
|
@abstractmethod
|
|
415
430
|
def addNodes(
|
|
416
431
|
self,
|
|
417
|
-
nodeTypes:
|
|
432
|
+
nodeTypes: set[str],
|
|
418
433
|
numNodes: int,
|
|
419
434
|
preemptible: bool,
|
|
420
435
|
spotBid: Optional[float] = None,
|
|
@@ -429,7 +444,9 @@ class AbstractProvisioner(ABC):
|
|
|
429
444
|
"""
|
|
430
445
|
raise NotImplementedError
|
|
431
446
|
|
|
432
|
-
def addManagedNodes(
|
|
447
|
+
def addManagedNodes(
|
|
448
|
+
self, nodeTypes: set[str], minNodes, maxNodes, preemptible, spotBid=None
|
|
449
|
+
) -> None:
|
|
433
450
|
"""
|
|
434
451
|
Add a group of managed nodes of the given type, up to the given maximum.
|
|
435
452
|
The nodes will automatically be launched and terminated depending on cluster load.
|
|
@@ -444,10 +461,12 @@ class AbstractProvisioner(ABC):
|
|
|
444
461
|
"""
|
|
445
462
|
|
|
446
463
|
# Not available by default
|
|
447
|
-
raise ManagedNodesNotSupportedException(
|
|
464
|
+
raise ManagedNodesNotSupportedException(
|
|
465
|
+
"Managed nodes not supported by this provisioner"
|
|
466
|
+
)
|
|
448
467
|
|
|
449
468
|
@abstractmethod
|
|
450
|
-
def terminateNodes(self, nodes:
|
|
469
|
+
def terminateNodes(self, nodes: list[Node]) -> None:
|
|
451
470
|
"""
|
|
452
471
|
Terminate the nodes represented by given Node objects
|
|
453
472
|
|
|
@@ -463,7 +482,9 @@ class AbstractProvisioner(ABC):
|
|
|
463
482
|
raise NotImplementedError
|
|
464
483
|
|
|
465
484
|
@abstractmethod
|
|
466
|
-
def getProvisionedWorkers(
|
|
485
|
+
def getProvisionedWorkers(
|
|
486
|
+
self, instance_type: Optional[str] = None, preemptible: Optional[bool] = None
|
|
487
|
+
) -> list[Node]:
|
|
467
488
|
"""
|
|
468
489
|
Gets all nodes, optionally of the given instance type or
|
|
469
490
|
preemptability, from the provisioner. Includes both static and
|
|
@@ -510,7 +531,14 @@ class AbstractProvisioner(ABC):
|
|
|
510
531
|
# Holds strings like "ssh-rsa actualKeyData" for keys to authorize (independently of cloud provider's system)
|
|
511
532
|
self.sshPublicKeys = []
|
|
512
533
|
|
|
513
|
-
def addFile(
|
|
534
|
+
def addFile(
|
|
535
|
+
self,
|
|
536
|
+
path: str,
|
|
537
|
+
filesystem: str = "root",
|
|
538
|
+
mode: Union[str, int] = "0755",
|
|
539
|
+
contents: str = "",
|
|
540
|
+
append: bool = False,
|
|
541
|
+
):
|
|
514
542
|
"""
|
|
515
543
|
Make a file on the instance with the given filesystem, mode, and contents.
|
|
516
544
|
|
|
@@ -522,16 +550,21 @@ class AbstractProvisioner(ABC):
|
|
|
522
550
|
mode = int(mode, 8)
|
|
523
551
|
assert isinstance(mode, int)
|
|
524
552
|
|
|
525
|
-
contents =
|
|
553
|
+
contents = "data:," + quote(contents.encode("utf-8"))
|
|
526
554
|
|
|
527
|
-
ignition_file = {
|
|
555
|
+
ignition_file = {
|
|
556
|
+
"path": path,
|
|
557
|
+
"filesystem": filesystem,
|
|
558
|
+
"mode": mode,
|
|
559
|
+
"contents": {"source": contents},
|
|
560
|
+
}
|
|
528
561
|
|
|
529
562
|
if append:
|
|
530
563
|
ignition_file["append"] = append
|
|
531
564
|
|
|
532
565
|
self.files.append(ignition_file)
|
|
533
566
|
|
|
534
|
-
def addUnit(self, name: str, enabled: bool = True, contents: str =
|
|
567
|
+
def addUnit(self, name: str, enabled: bool = True, contents: str = ""):
|
|
535
568
|
"""
|
|
536
569
|
Make a systemd unit on the instance with the given name (including
|
|
537
570
|
.service), and content. Units will be enabled by default.
|
|
@@ -542,7 +575,7 @@ class AbstractProvisioner(ABC):
|
|
|
542
575
|
journalctl -xe
|
|
543
576
|
"""
|
|
544
577
|
|
|
545
|
-
self.units.append({
|
|
578
|
+
self.units.append({"name": name, "enabled": enabled, "contents": contents})
|
|
546
579
|
|
|
547
580
|
def addSSHRSAKey(self, keyData: str):
|
|
548
581
|
"""
|
|
@@ -559,30 +592,19 @@ class AbstractProvisioner(ABC):
|
|
|
559
592
|
# Define the base config. We're using Flatcar's v2.2.0 fork
|
|
560
593
|
# See: https://github.com/kinvolk/ignition/blob/flatcar-master/doc/configuration-v2_2.md
|
|
561
594
|
config = {
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
},
|
|
565
|
-
'storage': {
|
|
566
|
-
'files': self.files
|
|
567
|
-
},
|
|
568
|
-
'systemd': {
|
|
569
|
-
'units': self.units
|
|
570
|
-
}
|
|
595
|
+
"ignition": {"version": "2.2.0"},
|
|
596
|
+
"storage": {"files": self.files},
|
|
597
|
+
"systemd": {"units": self.units},
|
|
571
598
|
}
|
|
572
599
|
|
|
573
600
|
if len(self.sshPublicKeys) > 0:
|
|
574
601
|
# Add SSH keys if needed
|
|
575
|
-
config[
|
|
576
|
-
|
|
577
|
-
{
|
|
578
|
-
'name': 'core',
|
|
579
|
-
'sshAuthorizedKeys': self.sshPublicKeys
|
|
580
|
-
}
|
|
581
|
-
]
|
|
602
|
+
config["passwd"] = {
|
|
603
|
+
"users": [{"name": "core", "sshAuthorizedKeys": self.sshPublicKeys}]
|
|
582
604
|
}
|
|
583
605
|
|
|
584
606
|
# Serialize as JSON
|
|
585
|
-
return json.dumps(config, separators=(
|
|
607
|
+
return json.dumps(config, separators=(",", ":"))
|
|
586
608
|
|
|
587
609
|
def getBaseInstanceConfiguration(self) -> InstanceConfiguration:
|
|
588
610
|
"""
|
|
@@ -592,10 +614,16 @@ class AbstractProvisioner(ABC):
|
|
|
592
614
|
config = self.InstanceConfiguration()
|
|
593
615
|
|
|
594
616
|
# We set Flatcar's update reboot strategy to off
|
|
595
|
-
config.addFile(
|
|
617
|
+
config.addFile(
|
|
618
|
+
"/etc/coreos/update.conf",
|
|
619
|
+
mode="0644",
|
|
620
|
+
contents=textwrap.dedent(
|
|
621
|
+
"""\
|
|
596
622
|
GROUP=stable
|
|
597
623
|
REBOOT_STRATEGY=off
|
|
598
|
-
"""
|
|
624
|
+
"""
|
|
625
|
+
),
|
|
626
|
+
)
|
|
599
627
|
|
|
600
628
|
# Then we have volume mounting. That always happens.
|
|
601
629
|
self.addVolumesService(config)
|
|
@@ -617,7 +645,10 @@ class AbstractProvisioner(ABC):
|
|
|
617
645
|
#
|
|
618
646
|
# TODO: check what kind of instance this is, and what ephemeral volumes
|
|
619
647
|
# *should* be there, and declaratively RAID and mount them.
|
|
620
|
-
config.addFile(
|
|
648
|
+
config.addFile(
|
|
649
|
+
"/home/core/volumes.sh",
|
|
650
|
+
contents=textwrap.dedent(
|
|
651
|
+
"""\
|
|
621
652
|
#!/bin/bash
|
|
622
653
|
set -x
|
|
623
654
|
ephemeral_count=0
|
|
@@ -680,9 +711,14 @@ class AbstractProvisioner(ABC):
|
|
|
680
711
|
sudo mkdir -p /var/$directory
|
|
681
712
|
sudo mount --bind /mnt/ephemeral/var/$directory /var/$directory
|
|
682
713
|
done
|
|
683
|
-
"""
|
|
714
|
+
"""
|
|
715
|
+
),
|
|
716
|
+
)
|
|
684
717
|
# TODO: Make this retry?
|
|
685
|
-
config.addUnit(
|
|
718
|
+
config.addUnit(
|
|
719
|
+
"volume-mounting.service",
|
|
720
|
+
contents=textwrap.dedent(
|
|
721
|
+
"""\
|
|
686
722
|
[Unit]
|
|
687
723
|
Description=mounts ephemeral volumes & bind mounts toil directories
|
|
688
724
|
Before=docker.service
|
|
@@ -694,14 +730,19 @@ class AbstractProvisioner(ABC):
|
|
|
694
730
|
|
|
695
731
|
[Install]
|
|
696
732
|
WantedBy=multi-user.target
|
|
697
|
-
"""
|
|
733
|
+
"""
|
|
734
|
+
),
|
|
735
|
+
)
|
|
698
736
|
|
|
699
737
|
def addNodeExporterService(self, config: InstanceConfiguration):
|
|
700
738
|
"""
|
|
701
739
|
Add the node exporter service for Prometheus to an instance configuration.
|
|
702
740
|
"""
|
|
703
741
|
|
|
704
|
-
config.addUnit(
|
|
742
|
+
config.addUnit(
|
|
743
|
+
"node-exporter.service",
|
|
744
|
+
contents=textwrap.dedent(
|
|
745
|
+
"""\
|
|
705
746
|
[Unit]
|
|
706
747
|
Description=node-exporter container
|
|
707
748
|
After=docker.service
|
|
@@ -724,12 +765,20 @@ class AbstractProvisioner(ABC):
|
|
|
724
765
|
|
|
725
766
|
[Install]
|
|
726
767
|
WantedBy=multi-user.target
|
|
727
|
-
|
|
768
|
+
"""
|
|
769
|
+
),
|
|
770
|
+
)
|
|
728
771
|
|
|
729
772
|
def toil_service_env_options(self) -> str:
|
|
730
773
|
return "-e TMPDIR=/var/tmp"
|
|
731
774
|
|
|
732
|
-
def add_toil_service(
|
|
775
|
+
def add_toil_service(
|
|
776
|
+
self,
|
|
777
|
+
config: InstanceConfiguration,
|
|
778
|
+
role: str,
|
|
779
|
+
keyPath: str = None,
|
|
780
|
+
preemptible: bool = False,
|
|
781
|
+
):
|
|
733
782
|
"""
|
|
734
783
|
Add the Toil leader or worker service to an instance configuration.
|
|
735
784
|
|
|
@@ -746,46 +795,59 @@ class AbstractProvisioner(ABC):
|
|
|
746
795
|
# transferred. The waitForKey.sh script loops on the new VM until it finds the keyPath file, then it starts the
|
|
747
796
|
# mesos-agent. If there are multiple keys to be transferred, then the last one to be transferred must be
|
|
748
797
|
# set to keyPath.
|
|
749
|
-
MESOS_LOG_DIR =
|
|
750
|
-
LEADER_DOCKER_ARGS =
|
|
798
|
+
MESOS_LOG_DIR = "--log_dir=/var/lib/mesos "
|
|
799
|
+
LEADER_DOCKER_ARGS = "--registry=in_memory --cluster={name}"
|
|
751
800
|
# --no-systemd_enable_support is necessary in Ubuntu 16.04 (otherwise,
|
|
752
801
|
# Mesos attempts to contact systemd but can't find its run file)
|
|
753
|
-
WORKER_DOCKER_ARGS =
|
|
754
|
-
|
|
755
|
-
if self.clusterType ==
|
|
756
|
-
if role ==
|
|
757
|
-
entryPoint =
|
|
758
|
-
entryPointArgs = MESOS_LOG_DIR + LEADER_DOCKER_ARGS.format(
|
|
759
|
-
|
|
760
|
-
|
|
761
|
-
|
|
762
|
-
|
|
802
|
+
WORKER_DOCKER_ARGS = "--work_dir=/var/lib/mesos --master={ip}:5050 --attributes=preemptible:{preemptible} --no-hostname_lookup --no-systemd_enable_support"
|
|
803
|
+
|
|
804
|
+
if self.clusterType == "mesos":
|
|
805
|
+
if role == "leader":
|
|
806
|
+
entryPoint = "mesos-master"
|
|
807
|
+
entryPointArgs = MESOS_LOG_DIR + LEADER_DOCKER_ARGS.format(
|
|
808
|
+
name=self.clusterName
|
|
809
|
+
)
|
|
810
|
+
elif role == "worker":
|
|
811
|
+
entryPoint = "mesos-agent"
|
|
812
|
+
entryPointArgs = MESOS_LOG_DIR + WORKER_DOCKER_ARGS.format(
|
|
813
|
+
ip=self._leaderPrivateIP, preemptible=preemptible
|
|
814
|
+
)
|
|
763
815
|
else:
|
|
764
816
|
raise RuntimeError("Unknown role %s" % role)
|
|
765
|
-
elif self.clusterType ==
|
|
766
|
-
if role ==
|
|
817
|
+
elif self.clusterType == "kubernetes":
|
|
818
|
+
if role == "leader":
|
|
767
819
|
# We need *an* entry point or the leader container will finish
|
|
768
820
|
# and go away, and thus not be available to take user logins.
|
|
769
|
-
entryPoint =
|
|
770
|
-
entryPointArgs =
|
|
821
|
+
entryPoint = "sleep"
|
|
822
|
+
entryPointArgs = "infinity"
|
|
771
823
|
else:
|
|
772
|
-
raise RuntimeError(
|
|
773
|
-
|
|
824
|
+
raise RuntimeError(
|
|
825
|
+
"Toil service not needed for %s nodes in a %s cluster",
|
|
826
|
+
role,
|
|
827
|
+
self.clusterType,
|
|
828
|
+
)
|
|
774
829
|
else:
|
|
775
|
-
raise RuntimeError(
|
|
830
|
+
raise RuntimeError(
|
|
831
|
+
"Toil service not needed in a %s cluster", self.clusterType
|
|
832
|
+
)
|
|
776
833
|
|
|
777
834
|
if keyPath:
|
|
778
|
-
entryPointArgs = keyPath +
|
|
835
|
+
entryPointArgs = keyPath + " " + entryPointArgs
|
|
779
836
|
entryPoint = "waitForKey.sh"
|
|
780
837
|
customDockerInitCommand = customDockerInitCmd()
|
|
781
838
|
if customDockerInitCommand:
|
|
782
|
-
entryPointArgs = " ".join(
|
|
839
|
+
entryPointArgs = " ".join(
|
|
840
|
+
["'" + customDockerInitCommand + "'", entryPoint, entryPointArgs]
|
|
841
|
+
)
|
|
783
842
|
entryPoint = "customDockerInit.sh"
|
|
784
843
|
|
|
785
844
|
# Set up the service. Make sure to make it default to using the
|
|
786
845
|
# actually-big temp directory of /var/tmp (see
|
|
787
846
|
# https://systemd.io/TEMPORARY_DIRECTORIES/).
|
|
788
|
-
config.addUnit(
|
|
847
|
+
config.addUnit(
|
|
848
|
+
f"toil-{role}.service",
|
|
849
|
+
contents=textwrap.dedent(
|
|
850
|
+
f"""\
|
|
789
851
|
[Unit]
|
|
790
852
|
Description=toil-{role} container
|
|
791
853
|
After=docker.service
|
|
@@ -812,15 +874,23 @@ class AbstractProvisioner(ABC):
|
|
|
812
874
|
-v /opt:/opt \\
|
|
813
875
|
-v /etc/kubernetes:/etc/kubernetes \\
|
|
814
876
|
-v /etc/kubernetes/admin.conf:/root/.kube/config \\
|
|
877
|
+
{"-e TOIL_KUBERNETES_PRIVILEGED=True --privileged" if self.enable_fuse else
|
|
878
|
+
"--security-opt seccomp=unconfined --security-opt systempaths=unconfined"} \\
|
|
879
|
+
-e TOIL_KUBERNETES_HOST_PATH=/var/lib/toil \\
|
|
880
|
+
# Pass in a path to use for singularity image caching into the container
|
|
881
|
+
-e SINGULARITY_CACHEDIR=/var/lib/toil/singularity \\
|
|
882
|
+
-e MINIWDL__SINGULARITY__IMAGE_CACHE=/var/lib/toil/miniwdl \\
|
|
815
883
|
--name=toil_{role} \\
|
|
816
884
|
{applianceSelf()} \\
|
|
817
885
|
{entryPointArgs}
|
|
818
886
|
|
|
819
887
|
[Install]
|
|
820
888
|
WantedBy=multi-user.target
|
|
821
|
-
|
|
889
|
+
"""
|
|
890
|
+
),
|
|
891
|
+
)
|
|
822
892
|
|
|
823
|
-
def getKubernetesValues(self, architecture: str =
|
|
893
|
+
def getKubernetesValues(self, architecture: str = "amd64"):
|
|
824
894
|
"""
|
|
825
895
|
Returns a dict of Kubernetes component versions and paths for formatting into Kubernetes-related templates.
|
|
826
896
|
"""
|
|
@@ -847,10 +917,14 @@ class AbstractProvisioner(ABC):
|
|
|
847
917
|
METRICS_API_VERSION="v0.3.7",
|
|
848
918
|
CLUSTER_NAME=self.clusterName,
|
|
849
919
|
# YAML line that tells the Kubelet to use a cloud provider, if we need one.
|
|
850
|
-
CLOUD_PROVIDER_SPEC=(
|
|
920
|
+
CLOUD_PROVIDER_SPEC=(
|
|
921
|
+
("cloud-provider: " + cloud_provider) if cloud_provider else ""
|
|
922
|
+
),
|
|
851
923
|
)
|
|
852
924
|
|
|
853
|
-
def addKubernetesServices(
|
|
925
|
+
def addKubernetesServices(
|
|
926
|
+
self, config: InstanceConfiguration, architecture: str = "amd64"
|
|
927
|
+
):
|
|
854
928
|
"""
|
|
855
929
|
Add installing Kubernetes and Kubeadm and setting up the Kubelet to run when configured to an instance configuration.
|
|
856
930
|
The same process applies to leaders and workers.
|
|
@@ -859,7 +933,10 @@ class AbstractProvisioner(ABC):
|
|
|
859
933
|
values = self.getKubernetesValues(architecture)
|
|
860
934
|
|
|
861
935
|
# We're going to ship the Kubelet service from Kubernetes' release pipeline via cloud-config
|
|
862
|
-
config.addUnit(
|
|
936
|
+
config.addUnit(
|
|
937
|
+
"kubelet.service",
|
|
938
|
+
contents=textwrap.dedent(
|
|
939
|
+
"""\
|
|
863
940
|
# This came from https://raw.githubusercontent.com/kubernetes/release/v0.4.0/cmd/kubepkg/templates/latest/deb/kubelet/lib/systemd/system/kubelet.service
|
|
864
941
|
# It has been modified to replace /usr/bin with {DOWNLOAD_DIR}
|
|
865
942
|
# License: https://raw.githubusercontent.com/kubernetes/release/v0.4.0/LICENSE
|
|
@@ -878,11 +955,16 @@ class AbstractProvisioner(ABC):
|
|
|
878
955
|
|
|
879
956
|
[Install]
|
|
880
957
|
WantedBy=multi-user.target
|
|
881
|
-
|
|
958
|
+
"""
|
|
959
|
+
).format(**values),
|
|
960
|
+
)
|
|
882
961
|
|
|
883
962
|
# It needs this config file
|
|
884
|
-
config.addFile(
|
|
885
|
-
|
|
963
|
+
config.addFile(
|
|
964
|
+
"/etc/systemd/system/kubelet.service.d/10-kubeadm.conf",
|
|
965
|
+
mode="0644",
|
|
966
|
+
contents=textwrap.dedent(
|
|
967
|
+
"""\
|
|
886
968
|
# This came from https://raw.githubusercontent.com/kubernetes/release/v0.4.0/cmd/kubepkg/templates/latest/deb/kubeadm/10-kubeadm.conf
|
|
887
969
|
# It has been modified to replace /usr/bin with {DOWNLOAD_DIR}
|
|
888
970
|
# License: https://raw.githubusercontent.com/kubernetes/release/v0.4.0/LICENSE
|
|
@@ -898,7 +980,9 @@ class AbstractProvisioner(ABC):
|
|
|
898
980
|
EnvironmentFile=-/etc/default/kubelet
|
|
899
981
|
ExecStart=
|
|
900
982
|
ExecStart={DOWNLOAD_DIR}/kubelet $KUBELET_KUBECONFIG_ARGS $KUBELET_CONFIG_ARGS $KUBELET_KUBEADM_ARGS $KUBELET_EXTRA_ARGS
|
|
901
|
-
|
|
983
|
+
"""
|
|
984
|
+
).format(**values),
|
|
985
|
+
)
|
|
902
986
|
|
|
903
987
|
# Before we let the kubelet try to start, we have to actually download it (and kubeadm)
|
|
904
988
|
# We set up this service so it can restart on failure despite not
|
|
@@ -909,7 +993,10 @@ class AbstractProvisioner(ABC):
|
|
|
909
993
|
# restarts work if the script fails. We also use a condition which
|
|
910
994
|
# treats the service as successful and skips it if it made a file to
|
|
911
995
|
# say it already ran.
|
|
912
|
-
config.addFile(
|
|
996
|
+
config.addFile(
|
|
997
|
+
"/home/core/install-kubernetes.sh",
|
|
998
|
+
contents=textwrap.dedent(
|
|
999
|
+
"""\
|
|
913
1000
|
#!/usr/bin/env bash
|
|
914
1001
|
set -e
|
|
915
1002
|
FLAG_FILE="{SETUP_STATE_DIR}/install-kubernetes.done"
|
|
@@ -928,8 +1015,13 @@ class AbstractProvisioner(ABC):
|
|
|
928
1015
|
|
|
929
1016
|
mkdir -p "{SETUP_STATE_DIR}"
|
|
930
1017
|
touch "$FLAG_FILE"
|
|
931
|
-
|
|
932
|
-
|
|
1018
|
+
"""
|
|
1019
|
+
).format(**values),
|
|
1020
|
+
)
|
|
1021
|
+
config.addUnit(
|
|
1022
|
+
"install-kubernetes.service",
|
|
1023
|
+
contents=textwrap.dedent(
|
|
1024
|
+
"""\
|
|
933
1025
|
[Unit]
|
|
934
1026
|
Description=base Kubernetes installation
|
|
935
1027
|
Wants=network-online.target
|
|
@@ -947,12 +1039,14 @@ class AbstractProvisioner(ABC):
|
|
|
947
1039
|
[Install]
|
|
948
1040
|
WantedBy=multi-user.target
|
|
949
1041
|
RequiredBy=kubelet.service
|
|
950
|
-
|
|
1042
|
+
"""
|
|
1043
|
+
).format(**values),
|
|
1044
|
+
)
|
|
951
1045
|
|
|
952
1046
|
# Now we should have the kubeadm command, and the bootlooping kubelet
|
|
953
1047
|
# waiting for kubeadm to configure it.
|
|
954
1048
|
|
|
955
|
-
def getKubernetesAutoscalerSetupCommands(self, values:
|
|
1049
|
+
def getKubernetesAutoscalerSetupCommands(self, values: dict[str, str]) -> str:
|
|
956
1050
|
"""
|
|
957
1051
|
Return Bash commands that set up the Kubernetes cluster autoscaler for
|
|
958
1052
|
provisioning from the environment supported by this provisioner.
|
|
@@ -987,7 +1081,11 @@ class AbstractProvisioner(ABC):
|
|
|
987
1081
|
|
|
988
1082
|
# Customize scheduler to pack jobs into as few nodes as possible
|
|
989
1083
|
# See: https://kubernetes.io/docs/reference/scheduling/config/#profiles
|
|
990
|
-
config.addFile(
|
|
1084
|
+
config.addFile(
|
|
1085
|
+
"/home/core/scheduler-config.yml",
|
|
1086
|
+
mode="0644",
|
|
1087
|
+
contents=textwrap.dedent(
|
|
1088
|
+
"""\
|
|
991
1089
|
apiVersion: kubescheduler.config.k8s.io/v1beta1
|
|
992
1090
|
kind: KubeSchedulerConfiguration
|
|
993
1091
|
clientConnection:
|
|
@@ -1001,13 +1099,21 @@ class AbstractProvisioner(ABC):
|
|
|
1001
1099
|
enabled:
|
|
1002
1100
|
- name: NodeResourcesMostAllocated
|
|
1003
1101
|
weight: 1
|
|
1004
|
-
|
|
1102
|
+
""".format(
|
|
1103
|
+
**values
|
|
1104
|
+
)
|
|
1105
|
+
),
|
|
1106
|
+
)
|
|
1005
1107
|
|
|
1006
1108
|
# Main kubeadm cluster configuration.
|
|
1007
1109
|
# Make sure to mount the scheduler config where the scheduler can see
|
|
1008
1110
|
# it, which is undocumented but inferred from
|
|
1009
1111
|
# https://pkg.go.dev/k8s.io/kubernetes@v1.21.0/cmd/kubeadm/app/apis/kubeadm#ControlPlaneComponent
|
|
1010
|
-
config.addFile(
|
|
1112
|
+
config.addFile(
|
|
1113
|
+
"/home/core/kubernetes-leader.yml",
|
|
1114
|
+
mode="0644",
|
|
1115
|
+
contents=textwrap.dedent(
|
|
1116
|
+
"""\
|
|
1011
1117
|
apiVersion: kubeadm.k8s.io/v1beta2
|
|
1012
1118
|
kind: InitConfiguration
|
|
1013
1119
|
nodeRegistration:
|
|
@@ -1039,11 +1145,18 @@ class AbstractProvisioner(ABC):
|
|
|
1039
1145
|
serverTLSBootstrap: true
|
|
1040
1146
|
rotateCertificates: true
|
|
1041
1147
|
cgroupDriver: systemd
|
|
1042
|
-
|
|
1148
|
+
""".format(
|
|
1149
|
+
**values
|
|
1150
|
+
)
|
|
1151
|
+
),
|
|
1152
|
+
)
|
|
1043
1153
|
|
|
1044
1154
|
# Make a script to apply that and the other cluster components
|
|
1045
1155
|
# Note that we're escaping {{thing}} as {{{{thing}}}} because we need to match mustaches in a yaml we hack up.
|
|
1046
|
-
config.addFile(
|
|
1156
|
+
config.addFile(
|
|
1157
|
+
"/home/core/create-kubernetes-cluster.sh",
|
|
1158
|
+
contents=textwrap.dedent(
|
|
1159
|
+
"""\
|
|
1047
1160
|
#!/usr/bin/env bash
|
|
1048
1161
|
set -e
|
|
1049
1162
|
|
|
@@ -1076,7 +1189,11 @@ class AbstractProvisioner(ABC):
|
|
|
1076
1189
|
kubectl apply -f https://raw.githubusercontent.com/kontena/kubelet-rubber-stamp/release/{RUBBER_STAMP_VERSION}/deploy/role_binding.yaml
|
|
1077
1190
|
kubectl apply -f https://raw.githubusercontent.com/kontena/kubelet-rubber-stamp/release/{RUBBER_STAMP_VERSION}/deploy/operator.yaml
|
|
1078
1191
|
|
|
1079
|
-
|
|
1192
|
+
"""
|
|
1193
|
+
).format(**values)
|
|
1194
|
+
+ self.getKubernetesAutoscalerSetupCommands(values)
|
|
1195
|
+
+ textwrap.dedent(
|
|
1196
|
+
"""\
|
|
1080
1197
|
# Set up metrics server, which needs serverTLSBootstrap and rubber stamp, and insists on running on a worker
|
|
1081
1198
|
curl -sSL https://github.com/kubernetes-sigs/metrics-server/releases/download/{METRICS_API_VERSION}/components.yaml | \\
|
|
1082
1199
|
sed 's/ - --secure-port=4443/ - --secure-port=4443\\n - --kubelet-preferred-address-types=Hostname/' | \\
|
|
@@ -1090,8 +1207,13 @@ class AbstractProvisioner(ABC):
|
|
|
1090
1207
|
|
|
1091
1208
|
mkdir -p "{SETUP_STATE_DIR}"
|
|
1092
1209
|
touch "$FLAG_FILE"
|
|
1093
|
-
|
|
1094
|
-
|
|
1210
|
+
"""
|
|
1211
|
+
).format(**values),
|
|
1212
|
+
)
|
|
1213
|
+
config.addUnit(
|
|
1214
|
+
"create-kubernetes-cluster.service",
|
|
1215
|
+
contents=textwrap.dedent(
|
|
1216
|
+
"""\
|
|
1095
1217
|
[Unit]
|
|
1096
1218
|
Description=Kubernetes cluster bootstrap
|
|
1097
1219
|
After=install-kubernetes.service
|
|
@@ -1110,10 +1232,15 @@ class AbstractProvisioner(ABC):
|
|
|
1110
1232
|
[Install]
|
|
1111
1233
|
WantedBy=multi-user.target
|
|
1112
1234
|
RequiredBy=toil-leader.service
|
|
1113
|
-
|
|
1235
|
+
"""
|
|
1236
|
+
).format(**values),
|
|
1237
|
+
)
|
|
1114
1238
|
|
|
1115
1239
|
# We also need a node cleaner service
|
|
1116
|
-
config.addFile(
|
|
1240
|
+
config.addFile(
|
|
1241
|
+
"/home/core/cleanup-nodes.sh",
|
|
1242
|
+
contents=textwrap.dedent(
|
|
1243
|
+
"""\
|
|
1117
1244
|
#!/usr/bin/env bash
|
|
1118
1245
|
# cleanup-nodes.sh: constantly clean up NotReady nodes that are tainted as having been deleted
|
|
1119
1246
|
set -e
|
|
@@ -1132,8 +1259,13 @@ class AbstractProvisioner(ABC):
|
|
|
1132
1259
|
done
|
|
1133
1260
|
sleep 300
|
|
1134
1261
|
done
|
|
1135
|
-
|
|
1136
|
-
|
|
1262
|
+
"""
|
|
1263
|
+
).format(**values),
|
|
1264
|
+
)
|
|
1265
|
+
config.addUnit(
|
|
1266
|
+
"cleanup-nodes.service",
|
|
1267
|
+
contents=textwrap.dedent(
|
|
1268
|
+
"""\
|
|
1137
1269
|
[Unit]
|
|
1138
1270
|
Description=Remove scaled-in nodes
|
|
1139
1271
|
After=create-kubernetes-cluster.service
|
|
@@ -1145,9 +1277,16 @@ class AbstractProvisioner(ABC):
|
|
|
1145
1277
|
RestartSec=10
|
|
1146
1278
|
[Install]
|
|
1147
1279
|
WantedBy=multi-user.target
|
|
1148
|
-
|
|
1280
|
+
"""
|
|
1281
|
+
),
|
|
1282
|
+
)
|
|
1149
1283
|
|
|
1150
|
-
def addKubernetesWorker(
|
|
1284
|
+
def addKubernetesWorker(
|
|
1285
|
+
self,
|
|
1286
|
+
config: InstanceConfiguration,
|
|
1287
|
+
authVars: dict[str, str],
|
|
1288
|
+
preemptible: bool = False,
|
|
1289
|
+
):
|
|
1151
1290
|
"""
|
|
1152
1291
|
Add services to configure as a Kubernetes worker, if Kubernetes is
|
|
1153
1292
|
already set to be installed.
|
|
@@ -1167,10 +1306,16 @@ class AbstractProvisioner(ABC):
|
|
|
1167
1306
|
# TODO: We use the same label that EKS uses here, because nothing is standardized.
|
|
1168
1307
|
# This won't be quite appropriate as we aren't on EKS and we might not
|
|
1169
1308
|
# even be on AWS, but the batch system should understand it.
|
|
1170
|
-
values[
|
|
1309
|
+
values["WORKER_LABEL_SPEC"] = (
|
|
1310
|
+
'node-labels: "eks.amazonaws.com/capacityType=SPOT"' if preemptible else ""
|
|
1311
|
+
)
|
|
1171
1312
|
|
|
1172
1313
|
# Kubeadm worker configuration
|
|
1173
|
-
config.addFile(
|
|
1314
|
+
config.addFile(
|
|
1315
|
+
"/home/core/kubernetes-worker.yml",
|
|
1316
|
+
mode="0644",
|
|
1317
|
+
contents=textwrap.dedent(
|
|
1318
|
+
"""\
|
|
1174
1319
|
apiVersion: kubeadm.k8s.io/v1beta2
|
|
1175
1320
|
kind: JoinConfiguration
|
|
1176
1321
|
nodeRegistration:
|
|
@@ -1188,10 +1333,17 @@ class AbstractProvisioner(ABC):
|
|
|
1188
1333
|
apiVersion: kubelet.config.k8s.io/v1beta1
|
|
1189
1334
|
kind: KubeletConfiguration
|
|
1190
1335
|
cgroupDriver: systemd
|
|
1191
|
-
|
|
1336
|
+
""".format(
|
|
1337
|
+
**values
|
|
1338
|
+
)
|
|
1339
|
+
),
|
|
1340
|
+
)
|
|
1192
1341
|
|
|
1193
1342
|
# Make a script to join the cluster using that configuration
|
|
1194
|
-
config.addFile(
|
|
1343
|
+
config.addFile(
|
|
1344
|
+
"/home/core/join-kubernetes-cluster.sh",
|
|
1345
|
+
contents=textwrap.dedent(
|
|
1346
|
+
"""\
|
|
1195
1347
|
#!/usr/bin/env bash
|
|
1196
1348
|
set -e
|
|
1197
1349
|
FLAG_FILE="{SETUP_STATE_DIR}/join-kubernetes-cluster.done"
|
|
@@ -1206,9 +1358,14 @@ class AbstractProvisioner(ABC):
|
|
|
1206
1358
|
|
|
1207
1359
|
mkdir -p "{SETUP_STATE_DIR}"
|
|
1208
1360
|
touch "$FLAG_FILE"
|
|
1209
|
-
|
|
1361
|
+
"""
|
|
1362
|
+
).format(**values),
|
|
1363
|
+
)
|
|
1210
1364
|
|
|
1211
|
-
config.addUnit(
|
|
1365
|
+
config.addUnit(
|
|
1366
|
+
"join-kubernetes-cluster.service",
|
|
1367
|
+
contents=textwrap.dedent(
|
|
1368
|
+
"""\
|
|
1212
1369
|
[Unit]
|
|
1213
1370
|
Description=Kubernetes cluster membership
|
|
1214
1371
|
After=install-kubernetes.service
|
|
@@ -1226,9 +1383,17 @@ class AbstractProvisioner(ABC):
|
|
|
1226
1383
|
|
|
1227
1384
|
[Install]
|
|
1228
1385
|
WantedBy=multi-user.target
|
|
1229
|
-
|
|
1386
|
+
"""
|
|
1387
|
+
).format(**values),
|
|
1388
|
+
)
|
|
1230
1389
|
|
|
1231
|
-
def _getIgnitionUserData(
|
|
1390
|
+
def _getIgnitionUserData(
|
|
1391
|
+
self,
|
|
1392
|
+
role: str,
|
|
1393
|
+
keyPath: Optional[str] = None,
|
|
1394
|
+
preemptible: bool = False,
|
|
1395
|
+
architecture: str = "amd64",
|
|
1396
|
+
) -> str:
|
|
1232
1397
|
"""
|
|
1233
1398
|
Return the text (not bytes) user data to pass to a provisioned node.
|
|
1234
1399
|
|
|
@@ -1242,33 +1407,35 @@ class AbstractProvisioner(ABC):
|
|
|
1242
1407
|
# Start with a base config
|
|
1243
1408
|
config = self.getBaseInstanceConfiguration()
|
|
1244
1409
|
|
|
1245
|
-
if self.clusterType ==
|
|
1410
|
+
if self.clusterType == "kubernetes":
|
|
1246
1411
|
# Install Kubernetes
|
|
1247
1412
|
self.addKubernetesServices(config, architecture)
|
|
1248
1413
|
|
|
1249
|
-
if role ==
|
|
1414
|
+
if role == "leader":
|
|
1250
1415
|
# Set up the cluster
|
|
1251
1416
|
self.addKubernetesLeader(config)
|
|
1252
1417
|
|
|
1253
1418
|
# We can't actually set up a Kubernetes worker without credentials
|
|
1254
1419
|
# to connect back to the leader.
|
|
1255
1420
|
|
|
1256
|
-
if self.clusterType ==
|
|
1421
|
+
if self.clusterType == "mesos" or role == "leader":
|
|
1257
1422
|
# Leaders, and all nodes in a Mesos cluster, need a Toil service
|
|
1258
1423
|
self.add_toil_service(config, role, keyPath, preemptible)
|
|
1259
1424
|
|
|
1260
|
-
if role ==
|
|
1425
|
+
if role == "worker" and self._leaderWorkerAuthentication is not None:
|
|
1261
1426
|
# We need to connect the worker to the leader.
|
|
1262
|
-
if self.clusterType ==
|
|
1427
|
+
if self.clusterType == "mesos":
|
|
1263
1428
|
# This involves an SSH public key form the leader
|
|
1264
1429
|
config.addSSHRSAKey(self._leaderWorkerAuthentication)
|
|
1265
|
-
elif self.clusterType ==
|
|
1430
|
+
elif self.clusterType == "kubernetes":
|
|
1266
1431
|
# We can install the Kubernetes worker and make it phone home
|
|
1267
1432
|
# to the leader.
|
|
1268
1433
|
# TODO: this puts sufficient info to fake a malicious worker
|
|
1269
1434
|
# into the worker config, which probably is accessible by
|
|
1270
1435
|
# anyone in the cloud account.
|
|
1271
|
-
self.addKubernetesWorker(
|
|
1436
|
+
self.addKubernetesWorker(
|
|
1437
|
+
config, self._leaderWorkerAuthentication, preemptible=preemptible
|
|
1438
|
+
)
|
|
1272
1439
|
|
|
1273
1440
|
# Make it into a string for Ignition
|
|
1274
1441
|
user_data = config.toIgnitionConfig()
|
|
@@ -1279,21 +1446,29 @@ class AbstractProvisioner(ABC):
|
|
|
1279
1446
|
user_data_limit: int = self._get_user_data_limit()
|
|
1280
1447
|
|
|
1281
1448
|
if len(user_data) > user_data_limit:
|
|
1282
|
-
logger.warning(
|
|
1283
|
-
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
|
|
1287
|
-
|
|
1288
|
-
|
|
1289
|
-
|
|
1290
|
-
|
|
1291
|
-
|
|
1292
|
-
|
|
1293
|
-
|
|
1294
|
-
|
|
1449
|
+
logger.warning(
|
|
1450
|
+
f"Ignition config size exceeds the user data limit ({len(user_data)} > {user_data_limit}). "
|
|
1451
|
+
"Writing to cloud storage..."
|
|
1452
|
+
)
|
|
1453
|
+
|
|
1454
|
+
src = self._write_file_to_cloud(
|
|
1455
|
+
f"configs/{role}/config-{uuid4()}.ign",
|
|
1456
|
+
contents=user_data.encode("utf-8"),
|
|
1457
|
+
)
|
|
1458
|
+
|
|
1459
|
+
return json.dumps(
|
|
1460
|
+
{
|
|
1461
|
+
"ignition": {
|
|
1462
|
+
"version": "2.2.0",
|
|
1463
|
+
# See: https://github.com/coreos/ignition/blob/spec2x/doc/configuration-v2_2.md
|
|
1464
|
+
"config": {
|
|
1465
|
+
"replace": {
|
|
1466
|
+
"source": src,
|
|
1467
|
+
}
|
|
1468
|
+
},
|
|
1295
1469
|
}
|
|
1296
|
-
}
|
|
1297
|
-
|
|
1470
|
+
},
|
|
1471
|
+
separators=(",", ":"),
|
|
1472
|
+
)
|
|
1298
1473
|
|
|
1299
1474
|
return user_data
|