skypilot-nightly 1.0.0.dev20250521__py3-none-any.whl → 1.0.0.dev20250523__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +46 -16
- sky/backends/cloud_vm_ray_backend.py +16 -4
- sky/check.py +109 -44
- sky/cli.py +261 -90
- sky/client/cli.py +261 -90
- sky/client/sdk.py +122 -3
- sky/clouds/__init__.py +5 -0
- sky/clouds/aws.py +4 -2
- sky/clouds/azure.py +4 -2
- sky/clouds/cloud.py +30 -6
- sky/clouds/cudo.py +2 -1
- sky/clouds/do.py +2 -1
- sky/clouds/fluidstack.py +2 -1
- sky/clouds/gcp.py +160 -23
- sky/clouds/ibm.py +4 -2
- sky/clouds/kubernetes.py +66 -22
- sky/clouds/lambda_cloud.py +2 -1
- sky/clouds/nebius.py +18 -2
- sky/clouds/oci.py +4 -2
- sky/clouds/paperspace.py +2 -1
- sky/clouds/runpod.py +2 -1
- sky/clouds/scp.py +2 -1
- sky/clouds/service_catalog/__init__.py +3 -0
- sky/clouds/service_catalog/common.py +9 -2
- sky/clouds/service_catalog/constants.py +2 -1
- sky/clouds/service_catalog/ssh_catalog.py +167 -0
- sky/clouds/ssh.py +203 -0
- sky/clouds/vast.py +2 -1
- sky/clouds/vsphere.py +2 -1
- sky/core.py +59 -17
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/data/storage.py +1 -0
- sky/execution.py +56 -7
- sky/jobs/server/core.py +4 -2
- sky/optimizer.py +29 -15
- sky/provision/__init__.py +1 -0
- sky/provision/aws/instance.py +17 -1
- sky/provision/gcp/constants.py +147 -4
- sky/provision/gcp/instance_utils.py +10 -0
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/kubernetes/instance.py +16 -5
- sky/provision/kubernetes/utils.py +37 -19
- sky/provision/nebius/instance.py +3 -1
- sky/provision/nebius/utils.py +14 -2
- sky/provision/ssh/__init__.py +18 -0
- sky/resources.py +177 -4
- sky/serve/server/core.py +2 -4
- sky/server/common.py +46 -9
- sky/server/constants.py +2 -0
- sky/server/html/token_page.html +154 -0
- sky/server/requests/executor.py +3 -6
- sky/server/requests/payloads.py +7 -0
- sky/server/server.py +80 -8
- sky/setup_files/dependencies.py +1 -0
- sky/skypilot_config.py +117 -31
- sky/task.py +24 -1
- sky/templates/gcp-ray.yml.j2 +44 -1
- sky/templates/nebius-ray.yml.j2 +12 -2
- sky/utils/admin_policy_utils.py +26 -22
- sky/utils/context.py +36 -6
- sky/utils/context_utils.py +15 -0
- sky/utils/infra_utils.py +21 -1
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/create_cluster.sh +1 -0
- sky/utils/kubernetes/deploy_remote_cluster.py +1437 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
- sky/utils/kubernetes/ssh-tunnel.sh +387 -0
- sky/utils/log_utils.py +214 -1
- sky/utils/resources_utils.py +14 -0
- sky/utils/schemas.py +67 -0
- sky/utils/ux_utils.py +2 -1
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/METADATA +6 -1
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/RECORD +88 -81
- sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- /sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/top_level.txt +0 -0
sky/utils/kubernetes/deploy_remote_cluster.py (new file)
@@ -0,0 +1,1437 @@
|
|
1
|
+
"""SSH-based Kubernetes Cluster Deployment Script"""
|
2
|
+
# Refer to https://docs.skypilot.co/en/latest/reservations/existing-machines.html for details on how to use this script. # pylint: disable=line-too-long
|
3
|
+
import argparse
|
4
|
+
import base64
|
5
|
+
import concurrent.futures as cf
|
6
|
+
import os
|
7
|
+
import random
|
8
|
+
import re
|
9
|
+
import shlex
|
10
|
+
import subprocess
|
11
|
+
import sys
|
12
|
+
import tempfile
|
13
|
+
from typing import Any, Dict, List, Optional, Set
|
14
|
+
|
15
|
+
import yaml
|
16
|
+
|
17
|
+
# Colors for nicer UX
|
18
|
+
RED = '\033[0;31m'
|
19
|
+
GREEN = '\033[0;32m'
|
20
|
+
YELLOW = '\033[1;33m'
|
21
|
+
WARNING_YELLOW = '\x1b[33m'
|
22
|
+
NC = '\033[0m' # No color
|
23
|
+
|
24
|
+
DEFAULT_SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
|
25
|
+
DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
|
26
|
+
SSH_CONFIG_PATH = os.path.expanduser('~/.ssh/config')
|
27
|
+
NODE_POOLS_INFO_DIR = os.path.expanduser('~/.sky/ssh_node_pools_info')
|
28
|
+
|
29
|
+
# Get the directory of this script
|
30
|
+
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
|
31
|
+
|
32
|
+
|
33
|
+
class UniqueKeySafeLoader(yaml.SafeLoader):
|
34
|
+
"""Custom YAML loader that raises an error if there are duplicate keys."""
|
35
|
+
|
36
|
+
def construct_mapping(self, node, deep=False):
|
37
|
+
mapping = {}
|
38
|
+
for key_node, value_node in node.value:
|
39
|
+
key = self.construct_object(key_node, deep=deep)
|
40
|
+
if key in mapping:
|
41
|
+
raise yaml.constructor.ConstructorError(
|
42
|
+
note=(f'Duplicate cluster config for cluster {key!r}.\n'
|
43
|
+
'Please remove one of them from: '
|
44
|
+
f'{DEFAULT_SSH_NODE_POOLS_PATH}'))
|
45
|
+
value = self.construct_object(value_node, deep=deep)
|
46
|
+
mapping[key] = value
|
47
|
+
return mapping
|
48
|
+
|
49
|
+
|
50
|
+
# Register the custom constructor inside the class
|
51
|
+
UniqueKeySafeLoader.add_constructor(
|
52
|
+
yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
|
53
|
+
UniqueKeySafeLoader.construct_mapping)
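As a minimal sketch (the YAML text here is made up), the loader above rejects duplicate cluster names like so:

import yaml  # UniqueKeySafeLoader defined above is assumed to be in scope

_DUPLICATED_YAML = """
my-cluster:
  hosts:
    - 10.0.0.1
my-cluster:
  hosts:
    - 10.0.0.2
"""

try:
    yaml.load(_DUPLICATED_YAML, Loader=UniqueKeySafeLoader)
except yaml.constructor.ConstructorError as e:
    # e.note carries the 'Duplicate cluster config ...' message built above.
    print(e.note)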
|
54
|
+
|
55
|
+
|
56
|
+
def parse_args():
|
57
|
+
parser = argparse.ArgumentParser(
|
58
|
+
description='Deploy a Kubernetes cluster on remote machines.')
|
59
|
+
parser.add_argument(
|
60
|
+
'--infra', help='Name of the cluster in ssh_node_pools.yaml to use')
|
61
|
+
parser.add_argument(
|
62
|
+
'--ssh-node-pools-file',
|
63
|
+
dest='ssh_node_pools_file',
|
64
|
+
default=DEFAULT_SSH_NODE_POOLS_PATH,
|
65
|
+
help=
|
66
|
+
f'Path to SSH node pools YAML file (default: {DEFAULT_SSH_NODE_POOLS_PATH})'
|
67
|
+
)
|
68
|
+
parser.add_argument(
|
69
|
+
'--kubeconfig-path',
|
70
|
+
dest='kubeconfig_path',
|
71
|
+
default=DEFAULT_KUBECONFIG_PATH,
|
72
|
+
help=
|
73
|
+
f'Path to save the kubeconfig file (default: {DEFAULT_KUBECONFIG_PATH})'
|
74
|
+
)
|
75
|
+
parser.add_argument(
|
76
|
+
'--use-ssh-config',
|
77
|
+
dest='use_ssh_config',
|
78
|
+
action='store_true',
|
79
|
+
help='Use SSH config for host settings instead of explicit parameters')
|
80
|
+
# TODO(romilb): The `sky local up --ips` command is deprecated and these args are now captured in the ssh_node_pools.yaml file.
|
81
|
+
# Remove these args after 0.11.0 release.
|
82
|
+
parser.add_argument(
|
83
|
+
'--ips-file',
|
84
|
+
dest='ips_file',
|
85
|
+
help=
|
86
|
+
'[Deprecated, use --ssh-node-pools-file instead] File containing IP addresses or SSH host entries (one per line)'
|
87
|
+
)
|
88
|
+
parser.add_argument(
|
89
|
+
'--user',
|
90
|
+
help=
|
91
|
+
'[Deprecated, use --ssh-node-pools-file instead] Username to use for SSH (overridden by SSH config if host exists there)'
|
92
|
+
)
|
93
|
+
parser.add_argument(
|
94
|
+
'--ssh-key',
|
95
|
+
dest='ssh_key',
|
96
|
+
help=
|
97
|
+
'[Deprecated, use --ssh-node-pools-file instead] Path to SSH private key (overridden by SSH config if host exists there)'
|
98
|
+
)
|
99
|
+
parser.add_argument(
|
100
|
+
'--context-name',
|
101
|
+
dest='context_name',
|
102
|
+
default='default',
|
103
|
+
help=
|
104
|
+
'[Deprecated, use --ssh-node-pools-file instead] Kubernetes context name'
|
105
|
+
)
|
106
|
+
parser.add_argument('--cleanup',
|
107
|
+
action='store_true',
|
108
|
+
help='Clean up the cluster')
|
109
|
+
parser.add_argument(
|
110
|
+
'--password',
|
111
|
+
help='[Deprecated, use --ssh-node-pools-file instead] Password for sudo'
|
112
|
+
)
|
113
|
+
|
114
|
+
return parser.parse_args()
|
115
|
+
|
116
|
+
|
117
|
+
def load_ssh_targets(file_path: str) -> Dict[str, Any]:
|
118
|
+
"""Load SSH targets from YAML file."""
|
119
|
+
if not os.path.exists(file_path):
|
120
|
+
print(f'{RED}Error: SSH Node Pools file not found: {file_path}{NC}',
|
121
|
+
file=sys.stderr)
|
122
|
+
sys.exit(1)
|
123
|
+
|
124
|
+
try:
|
125
|
+
with open(file_path, 'r', encoding='utf-8') as f:
|
126
|
+
targets = yaml.load(f, Loader=UniqueKeySafeLoader)
|
127
|
+
return targets
|
128
|
+
except yaml.constructor.ConstructorError as e:
|
129
|
+
print(f'{RED}{e.note}{NC}', file=sys.stderr)
|
130
|
+
sys.exit(1)
|
131
|
+
except (yaml.YAMLError, IOError, OSError) as e:
|
132
|
+
print(f'{RED}Error loading SSH Node Pools file: {e}{NC}',
|
133
|
+
file=sys.stderr)
|
134
|
+
sys.exit(1)
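The shape this loader expects in ~/.sky/ssh_node_pools.yaml can be inferred from get_cluster_config() and prepare_hosts_info() below; a hypothetical example (all values illustrative):

EXAMPLE_NODE_POOLS_YAML = """
gpu-pool:                        # Node Pool name (top-level key)
  user: ubuntu                   # cluster-level default SSH user
  identity_file: ~/.ssh/id_rsa   # cluster-level default SSH key
  password: examplepass          # optional, used for sudo via an askpass script
  hosts:
    - 192.168.1.10               # plain string: IP or SSH config alias
    - ip: 192.168.1.11           # dict form with per-host overrides
      user: gpuadmin
"""
targets = yaml.load(EXAMPLE_NODE_POOLS_YAML, Loader=UniqueKeySafeLoader)
assert 'gpu-pool' in targets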
|
135
|
+
|
136
|
+
|
137
|
+
def check_host_in_ssh_config(hostname: str) -> bool:
|
138
|
+
"""Return True iff *hostname* matches at least one `Host`/`Match` stanza
|
139
|
+
in the user's OpenSSH client configuration (including anything pulled in
|
140
|
+
via Include).
|
141
|
+
|
142
|
+
It calls: ssh -vvG <hostname> -o ConnectTimeout=0
|
143
|
+
which:
|
144
|
+
• -G expands the effective config without connecting
|
145
|
+
• -vv prints debug lines that show which stanzas are applied
|
146
|
+
• ConnectTimeout=0 avoids a DNS lookup if <hostname> is a FQDN/IP
|
147
|
+
|
148
|
+
No config files are opened or parsed manually.
|
149
|
+
|
150
|
+
Parameters
|
151
|
+
----------
|
152
|
+
hostname : str
|
153
|
+
The alias/IP/FQDN you want to test.
|
154
|
+
|
155
|
+
Returns
|
156
|
+
-------
|
157
|
+
bool
|
158
|
+
True – a specific stanza matched the host
|
159
|
+
False – nothing but the global defaults (`Host *`) applied
|
160
|
+
"""
|
161
|
+
# We direct stderr→stdout because debug output goes to stderr.
|
162
|
+
proc = subprocess.run(
|
163
|
+
['ssh', '-vvG', hostname, '-o', 'ConnectTimeout=0'],
|
164
|
+
text=True,
|
165
|
+
stdout=subprocess.PIPE,
|
166
|
+
stderr=subprocess.STDOUT,
|
167
|
+
check=False, # we only want the text, not to raise
|
168
|
+
)
|
169
|
+
|
170
|
+
# Look for lines like:
|
171
|
+
# debug1: ~/.ssh/config line 42: Applying options for <hostname>
|
172
|
+
# Anything other than "*"
|
173
|
+
pattern = re.compile(r'^debug\d+: .*Applying options for ([^*].*)$',
|
174
|
+
re.MULTILINE)
|
175
|
+
|
176
|
+
return bool(pattern.search(proc.stdout))
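A quick usage sketch (the host alias is hypothetical):

# 'my-dev-box' only returns True if a specific Host/Match stanza in
# ~/.ssh/config applies to it; a bare IP with no stanza returns False.
if check_host_in_ssh_config('my-dev-box'):
    print('SSH config supplies connection settings; explicit user/key skipped')
else:
    print('No specific stanza matched; explicit --user/--ssh-key values apply')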
|
177
|
+
|
178
|
+
|
179
|
+
def get_cluster_config(targets: Dict[str, Any],
|
180
|
+
cluster_name: Optional[str] = None,
|
181
|
+
file_path: Optional[str] = None) -> Dict[str, Any]:
|
182
|
+
"""Get configuration for specific clusters or all clusters."""
|
183
|
+
if not targets:
|
184
|
+
print(
|
185
|
+
f'{RED}Error: No clusters defined in SSH Node Pools '
|
186
|
+
f'file {file_path}{NC}',
|
187
|
+
file=sys.stderr)
|
188
|
+
sys.exit(1)
|
189
|
+
|
190
|
+
if cluster_name:
|
191
|
+
if cluster_name not in targets:
|
192
|
+
print(
|
193
|
+
f'{RED}Error: Cluster {cluster_name!r} not found in '
|
194
|
+
f'SSH Node Pools file {file_path}{NC}',
|
195
|
+
file=sys.stderr)
|
196
|
+
sys.exit(1)
|
197
|
+
return {cluster_name: targets[cluster_name]}
|
198
|
+
|
199
|
+
# Return all clusters if no specific cluster is specified
|
200
|
+
return targets
|
201
|
+
|
202
|
+
|
203
|
+
def prepare_hosts_info(cluster_config: Dict[str, Any]) -> List[Dict[str, str]]:
|
204
|
+
"""Prepare list of hosts with resolved user, identity_file, and password."""
|
205
|
+
if 'hosts' not in cluster_config or not cluster_config['hosts']:
|
206
|
+
print(f'{RED}Error: No hosts defined in cluster configuration{NC}',
|
207
|
+
file=sys.stderr)
|
208
|
+
sys.exit(1)
|
209
|
+
|
210
|
+
# Get cluster-level defaults
|
211
|
+
cluster_user = cluster_config.get('user', '')
|
212
|
+
cluster_identity_file = cluster_config.get('identity_file', '')
|
213
|
+
cluster_password = cluster_config.get('password', '')
|
214
|
+
|
215
|
+
hosts_info = []
|
216
|
+
for host in cluster_config['hosts']:
|
217
|
+
# Host can be a string (IP or SSH config hostname) or a dict
|
218
|
+
if isinstance(host, str):
|
219
|
+
# Check if this is an SSH config hostname
|
220
|
+
is_ssh_config_host = check_host_in_ssh_config(host)
|
221
|
+
|
222
|
+
hosts_info.append({
|
223
|
+
'ip': host,
|
224
|
+
'user': '' if is_ssh_config_host else cluster_user,
|
225
|
+
'identity_file': '' if is_ssh_config_host else
|
226
|
+
cluster_identity_file,
|
227
|
+
'password': cluster_password,
|
228
|
+
'use_ssh_config': is_ssh_config_host
|
229
|
+
})
|
230
|
+
else:
|
231
|
+
# It's a dict with potential overrides
|
232
|
+
if 'ip' not in host:
|
233
|
+
print(
|
234
|
+
f'{RED}Warning: Host missing \'ip\' field, skipping: {host}{NC}'
|
235
|
+
)
|
236
|
+
continue
|
237
|
+
|
238
|
+
# Check if this is an SSH config hostname
|
239
|
+
is_ssh_config_host = check_host_in_ssh_config(host['ip'])
|
240
|
+
|
241
|
+
# Use host-specific values or fall back to cluster defaults
|
242
|
+
host_user = '' if is_ssh_config_host else host.get(
|
243
|
+
'user', cluster_user)
|
244
|
+
host_identity_file = '' if is_ssh_config_host else host.get(
|
245
|
+
'identity_file', cluster_identity_file)
|
246
|
+
host_password = host.get('password', cluster_password)
|
247
|
+
|
248
|
+
hosts_info.append({
|
249
|
+
'ip': host['ip'],
|
250
|
+
'user': host_user,
|
251
|
+
'identity_file': host_identity_file,
|
252
|
+
'password': host_password,
|
253
|
+
'use_ssh_config': is_ssh_config_host
|
254
|
+
})
|
255
|
+
|
256
|
+
return hosts_info
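A minimal sketch of the resolution above, with a hypothetical cluster entry:

example_cluster = {
    'user': 'ubuntu',
    'identity_file': '~/.ssh/id_rsa',
    'hosts': [
        '10.0.0.1',                           # inherits cluster-level defaults
        {'ip': '10.0.0.2', 'user': 'admin'},  # overrides the user only
    ],
}
for info in prepare_hosts_info(example_cluster):
    # Each entry has ip, user, identity_file, password and use_ssh_config.
    print(info['ip'], info['user'], info['use_ssh_config'])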
|
257
|
+
|
258
|
+
|
259
|
+
def run_command(cmd, shell=False):
|
260
|
+
"""Run a local command and return the output."""
|
261
|
+
process = subprocess.run(cmd,
|
262
|
+
shell=shell,
|
263
|
+
capture_output=True,
|
264
|
+
text=True,
|
265
|
+
check=False)
|
266
|
+
if process.returncode != 0:
|
267
|
+
print(f'{RED}Error executing command: {cmd}{NC}')
|
268
|
+
print(f'STDOUT: {process.stdout}')
|
269
|
+
print(f'STDERR: {process.stderr}')
|
270
|
+
return None
|
271
|
+
return process.stdout.strip()
|
272
|
+
|
273
|
+
|
274
|
+
def get_effective_host_ip(hostname: str) -> str:
|
275
|
+
"""Get the effective IP for a hostname from SSH config."""
|
276
|
+
try:
|
277
|
+
result = subprocess.run(['ssh', '-G', hostname],
|
278
|
+
capture_output=True,
|
279
|
+
text=True,
|
280
|
+
check=False)
|
281
|
+
if result.returncode == 0:
|
282
|
+
for line in result.stdout.splitlines():
|
283
|
+
if line.startswith('hostname '):
|
284
|
+
return line.split(' ', 1)[1].strip()
|
285
|
+
except Exception: # pylint: disable=broad-except
|
286
|
+
pass
|
287
|
+
return hostname # Return the original hostname if lookup fails
|
288
|
+
|
289
|
+
|
290
|
+
def run_remote(node,
|
291
|
+
cmd,
|
292
|
+
user='',
|
293
|
+
ssh_key='',
|
294
|
+
connect_timeout=30,
|
295
|
+
use_ssh_config=False,
|
296
|
+
print_output=False,
|
297
|
+
use_shell=False):
|
298
|
+
"""Run a command on a remote machine via SSH."""
|
299
|
+
if use_ssh_config:
|
300
|
+
# Use SSH config for connection parameters
|
301
|
+
ssh_cmd = ['ssh', node, cmd]
|
302
|
+
else:
|
303
|
+
# Use explicit parameters
|
304
|
+
ssh_cmd = [
|
305
|
+
'ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'IdentitiesOnly=yes',
|
306
|
+
'-o', f'ConnectTimeout={connect_timeout}', '-o',
|
307
|
+
'ServerAliveInterval=10', '-o', 'ServerAliveCountMax=3'
|
308
|
+
]
|
309
|
+
|
310
|
+
if ssh_key:
|
311
|
+
ssh_cmd.extend(['-i', ssh_key])
|
312
|
+
|
313
|
+
ssh_cmd.append(f'{user}@{node}' if user else node)
|
314
|
+
ssh_cmd.append(cmd)
|
315
|
+
|
316
|
+
if use_shell:
|
317
|
+
ssh_cmd = ' '.join(ssh_cmd)
|
318
|
+
|
319
|
+
process = subprocess.run(ssh_cmd,
|
320
|
+
capture_output=True,
|
321
|
+
text=True,
|
322
|
+
check=False,
|
323
|
+
shell=use_shell)
|
324
|
+
if process.returncode != 0:
|
325
|
+
print(f'{RED}Error executing command {cmd} on {node}:{NC}')
|
326
|
+
print(f'STDERR: {process.stderr}')
|
327
|
+
return None
|
328
|
+
if print_output:
|
329
|
+
print(process.stdout)
|
330
|
+
return process.stdout.strip()
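For illustration, two hedged invocations of the helper above (host, user, and key path are placeholders):

# Probe a host that has an ~/.ssh/config stanza, then a raw IP with explicit
# credentials; both calls return None on failure and stripped stdout otherwise.
out = run_remote('my-dev-box', 'uptime', use_ssh_config=True)
out = run_remote('10.0.0.1', 'uptime', user='ubuntu',
                 ssh_key=os.path.expanduser('~/.ssh/id_rsa'))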
|
331
|
+
|
332
|
+
|
333
|
+
def create_askpass_script(password):
|
334
|
+
"""Create an askpass script block for sudo with password."""
|
335
|
+
if not password:
|
336
|
+
return ''
|
337
|
+
|
338
|
+
return f"""
|
339
|
+
# Create temporary askpass script
|
340
|
+
ASKPASS_SCRIPT=$(mktemp)
|
341
|
+
trap 'rm -f $ASKPASS_SCRIPT' EXIT INT TERM ERR QUIT
|
342
|
+
cat > $ASKPASS_SCRIPT << EOF
|
343
|
+
#!/bin/bash
|
344
|
+
echo {password}
|
345
|
+
EOF
|
346
|
+
chmod 700 $ASKPASS_SCRIPT
|
347
|
+
# Use askpass
|
348
|
+
export SUDO_ASKPASS=$ASKPASS_SCRIPT
|
349
|
+
"""
|
350
|
+
|
351
|
+
|
352
|
+
def progress_message(message):
|
353
|
+
"""Show a progress message."""
|
354
|
+
print(f'{YELLOW}➜ {message}{NC}')
|
355
|
+
|
356
|
+
|
357
|
+
def success_message(message):
|
358
|
+
"""Show a success message."""
|
359
|
+
print(f'{GREEN}✔ {message}{NC}')
|
360
|
+
|
361
|
+
|
362
|
+
def cleanup_server_node(node,
|
363
|
+
user,
|
364
|
+
ssh_key,
|
365
|
+
askpass_block,
|
366
|
+
use_ssh_config=False):
|
367
|
+
"""Uninstall k3s and clean up the state on a server node."""
|
368
|
+
print(f'{YELLOW}Cleaning up head node {node}...{NC}')
|
369
|
+
cmd = f"""
|
370
|
+
{askpass_block}
|
371
|
+
echo 'Uninstalling k3s...' &&
|
372
|
+
sudo -A /usr/local/bin/k3s-uninstall.sh || true &&
|
373
|
+
sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
|
374
|
+
"""
|
375
|
+
result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
|
376
|
+
if result is None:
|
377
|
+
print(f'{RED}Failed to clean up head node ({node}).{NC}')
|
378
|
+
else:
|
379
|
+
success_message(f'Node {node} cleaned up successfully.')
|
380
|
+
|
381
|
+
|
382
|
+
def cleanup_agent_node(node,
|
383
|
+
user,
|
384
|
+
ssh_key,
|
385
|
+
askpass_block,
|
386
|
+
use_ssh_config=False):
|
387
|
+
"""Uninstall k3s and clean up the state on an agent node."""
|
388
|
+
print(f'{YELLOW}Cleaning up worker node {node}...{NC}')
|
389
|
+
cmd = f"""
|
390
|
+
{askpass_block}
|
391
|
+
echo 'Uninstalling k3s...' &&
|
392
|
+
sudo -A /usr/local/bin/k3s-agent-uninstall.sh || true &&
|
393
|
+
sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
|
394
|
+
"""
|
395
|
+
result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
|
396
|
+
if result is None:
|
397
|
+
print(f'{RED}Failed to clean up worker node ({node}).{NC}')
|
398
|
+
else:
|
399
|
+
success_message(f'Node {node} cleaned up successfully.')
|
400
|
+
|
401
|
+
|
402
|
+
def start_agent_node(node,
|
403
|
+
master_addr,
|
404
|
+
k3s_token,
|
405
|
+
user,
|
406
|
+
ssh_key,
|
407
|
+
askpass_block,
|
408
|
+
use_ssh_config=False):
|
409
|
+
"""Start a k3s agent node.
|
410
|
+
Returns: the node, whether the start succeeded, and whether it has a GPU."""
|
411
|
+
cmd = f"""
|
412
|
+
{askpass_block}
|
413
|
+
curl -sfL https://get.k3s.io | K3S_NODE_NAME={node} INSTALL_K3S_EXEC='agent --node-label skypilot-ip={node}' \
|
414
|
+
K3S_URL=https://{master_addr}:6443 K3S_TOKEN={k3s_token} sudo -E -A sh -
|
415
|
+
"""
|
416
|
+
result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
|
417
|
+
if result is None:
|
418
|
+
print(f'{RED}Failed to deploy K3s on worker node ({node}).{NC}')
|
419
|
+
return node, False, False
|
420
|
+
success_message(f'Kubernetes deployed on worker node ({node}).')
|
421
|
+
# Check if worker node has a GPU
|
422
|
+
if check_gpu(node, user, ssh_key, use_ssh_config=use_ssh_config):
|
423
|
+
print(f'{YELLOW}GPU detected on worker node ({node}).{NC}')
|
424
|
+
return node, True, True
|
425
|
+
return node, True, False
|
426
|
+
|
427
|
+
|
428
|
+
def check_gpu(node, user, ssh_key, use_ssh_config=False):
|
429
|
+
"""Check if a node has a GPU."""
|
430
|
+
cmd = 'command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null'
|
431
|
+
result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
|
432
|
+
return result is not None
|
433
|
+
|
434
|
+
|
435
|
+
def ensure_directory_exists(path):
|
436
|
+
"""Ensure the directory for the specified file path exists."""
|
437
|
+
directory = os.path.dirname(path)
|
438
|
+
if directory and not os.path.exists(directory):
|
439
|
+
os.makedirs(directory, exist_ok=True)
|
440
|
+
|
441
|
+
|
442
|
+
def get_used_localhost_ports() -> Set[int]:
|
443
|
+
"""Get SSH port forwardings already in use on localhost"""
|
444
|
+
used_ports = set()
|
445
|
+
|
446
|
+
# Get ports from netstat (works on macOS and Linux)
|
447
|
+
try:
|
448
|
+
if sys.platform == 'darwin':
|
449
|
+
# macOS
|
450
|
+
result = subprocess.run(['netstat', '-an', '-p', 'tcp'],
|
451
|
+
capture_output=True,
|
452
|
+
text=True,
|
453
|
+
check=False)
|
454
|
+
else:
|
455
|
+
# Linux and other Unix-like systems
|
456
|
+
result = subprocess.run(['netstat', '-tln'],
|
457
|
+
capture_output=True,
|
458
|
+
text=True,
|
459
|
+
check=False)
|
460
|
+
|
461
|
+
if result.returncode == 0:
|
462
|
+
# Look for lines with 'localhost:<port>' or '127.0.0.1:<port>'
|
463
|
+
for line in result.stdout.splitlines():
|
464
|
+
if '127.0.0.1:' in line or 'localhost:' in line:
|
465
|
+
match = re.search(r':(64\d\d)\s', line)
|
466
|
+
if match:
|
467
|
+
port = int(match.group(1))
|
468
|
+
if 6400 <= port <= 6500: # Only consider our range
|
469
|
+
used_ports.add(port)
|
470
|
+
except (subprocess.SubprocessError, FileNotFoundError):
|
471
|
+
# If netstat fails, try another approach
|
472
|
+
pass
|
473
|
+
|
474
|
+
# Also check ports from existing kubeconfig entries
|
475
|
+
try:
|
476
|
+
result = subprocess.run([
|
477
|
+
'kubectl', 'config', 'view', '-o',
|
478
|
+
'jsonpath=\'{.clusters[*].cluster.server}\''
|
479
|
+
],
|
480
|
+
capture_output=True,
|
481
|
+
text=True,
|
482
|
+
check=False)
|
483
|
+
|
484
|
+
if result.returncode == 0:
|
485
|
+
# Look for localhost URLs with ports
|
486
|
+
for url in result.stdout.split():
|
487
|
+
if 'localhost:' in url or '127.0.0.1:' in url:
|
488
|
+
match = re.search(r':(\d+)', url)
|
489
|
+
if match:
|
490
|
+
port = int(match.group(1))
|
491
|
+
if 6400 <= port <= 6500: # Only consider our range
|
492
|
+
used_ports.add(port)
|
493
|
+
except subprocess.SubprocessError:
|
494
|
+
pass
|
495
|
+
|
496
|
+
return used_ports
|
497
|
+
|
498
|
+
|
499
|
+
def get_available_port(start: int = 6443, end: int = 6499) -> int:
|
500
|
+
"""Get an available port in the given range that's not used by other tunnels"""
|
501
|
+
used_ports = get_used_localhost_ports()
|
502
|
+
|
503
|
+
# Try to use port 6443 first if available for the first cluster
|
504
|
+
if start == 6443 and start not in used_ports:
|
505
|
+
return start
|
506
|
+
|
507
|
+
# Otherwise find any available port in the range
|
508
|
+
available_ports = list(set(range(start, end + 1)) - used_ports)
|
509
|
+
|
510
|
+
if not available_ports:
|
511
|
+
# If all ports are used, pick a random one from our range
|
512
|
+
# (we'll terminate any existing connection in the setup)
|
513
|
+
return random.randint(start, end)
|
514
|
+
|
515
|
+
# Sort to get deterministic allocation
|
516
|
+
available_ports.sort()
|
517
|
+
return available_ports[0]
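A small usage sketch of the port selection above:

# The first cluster typically gets 6443; later clusters get the next free port
# in 6443-6499, falling back to a random port in that range if all are taken.
port = get_available_port()
print(f'Next Kubernetes API tunnel would listen on 127.0.0.1:{port}')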
|
518
|
+
|
519
|
+
|
520
|
+
def setup_kubectl_ssh_tunnel(head_node,
|
521
|
+
ssh_user,
|
522
|
+
ssh_key,
|
523
|
+
context_name,
|
524
|
+
use_ssh_config=False):
|
525
|
+
"""Set up kubeconfig exec credential plugin for SSH tunnel"""
|
526
|
+
progress_message('Setting up SSH tunnel for Kubernetes API access...')
|
527
|
+
|
528
|
+
# Get an available port for this cluster
|
529
|
+
port = get_available_port()
|
530
|
+
|
531
|
+
# Paths to scripts
|
532
|
+
tunnel_script = os.path.join(SCRIPT_DIR, 'ssh-tunnel.sh')
|
533
|
+
|
534
|
+
# Make sure scripts are executable
|
535
|
+
os.chmod(tunnel_script, 0o755)
|
536
|
+
|
537
|
+
# Certificate files
|
538
|
+
client_cert_file = os.path.join(NODE_POOLS_INFO_DIR,
|
539
|
+
f'{context_name}-cert.pem')
|
540
|
+
client_key_file = os.path.join(NODE_POOLS_INFO_DIR,
|
541
|
+
f'{context_name}-key.pem')
|
542
|
+
|
543
|
+
# Update kubeconfig to use localhost with the selected port
|
544
|
+
run_command([
|
545
|
+
'kubectl', 'config', 'set-cluster', context_name,
|
546
|
+
f'--server=https://127.0.0.1:{port}', '--insecure-skip-tls-verify=true'
|
547
|
+
])
|
548
|
+
|
549
|
+
# Build the exec args list based on auth method
|
550
|
+
exec_args = [
|
551
|
+
'--exec-command', tunnel_script, '--exec-api-version',
|
552
|
+
'client.authentication.k8s.io/v1beta1'
|
553
|
+
]
|
554
|
+
|
555
|
+
# Set credential TTL to force frequent tunnel checks
|
556
|
+
ttl_seconds = 30
|
557
|
+
|
558
|
+
# Verify if we have extracted certificate data files
|
559
|
+
has_cert_files = os.path.isfile(client_cert_file) and os.path.isfile(
|
560
|
+
client_key_file)
|
561
|
+
if has_cert_files:
|
562
|
+
print(
|
563
|
+
f'{GREEN}Client certificate data extracted and will be used for authentication{NC}'
|
564
|
+
)
|
565
|
+
|
566
|
+
if use_ssh_config:
|
567
|
+
run_command(
|
568
|
+
['kubectl', 'config', 'set-credentials', context_name] + exec_args +
|
569
|
+
[
|
570
|
+
'--exec-arg=--context', f'--exec-arg={context_name}',
|
571
|
+
'--exec-arg=--port', f'--exec-arg={port}', '--exec-arg=--ttl',
|
572
|
+
f'--exec-arg={ttl_seconds}', '--exec-arg=--use-ssh-config',
|
573
|
+
'--exec-arg=--host', f'--exec-arg={head_node}'
|
574
|
+
])
|
575
|
+
else:
|
576
|
+
run_command(['kubectl', 'config', 'set-credentials', context_name] +
|
577
|
+
exec_args + [
|
578
|
+
'--exec-arg=--context', f'--exec-arg={context_name}',
|
579
|
+
'--exec-arg=--port', f'--exec-arg={port}',
|
580
|
+
'--exec-arg=--ttl', f'--exec-arg={ttl_seconds}',
|
581
|
+
'--exec-arg=--host', f'--exec-arg={head_node}',
|
582
|
+
'--exec-arg=--user', f'--exec-arg={ssh_user}',
|
583
|
+
'--exec-arg=--ssh-key', f'--exec-arg={ssh_key}'
|
584
|
+
])
|
585
|
+
|
586
|
+
success_message(
|
587
|
+
f'SSH tunnel configured through kubectl credential plugin on port {port}'
|
588
|
+
)
|
589
|
+
print(
|
590
|
+
f'{GREEN}Your kubectl connection is now tunneled through SSH (port {port}).{NC}'
|
591
|
+
)
|
592
|
+
print(
|
593
|
+
f'{GREEN}This tunnel will be automatically established when needed.{NC}'
|
594
|
+
)
|
595
|
+
print(
|
596
|
+
f'{GREEN}Credential TTL set to {ttl_seconds}s to ensure tunnel health is checked frequently.{NC}'
|
597
|
+
)
|
598
|
+
|
599
|
+
return port
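Once configured, kubectl invokes ssh-tunnel.sh as an exec credential plugin (roughly every 30 seconds, per the TTL above), re-establishing the tunnel on demand; a hedged way to verify the wiring (the context name is illustrative; subprocess is already imported at the top of this script):

# kubectl resolves the context's server to https://127.0.0.1:<port> and runs
# the exec plugin configured above before sending the request.
subprocess.run(['kubectl', '--context', 'ssh-my-cluster', 'get', 'nodes'],
               check=False)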
|
600
|
+
|
601
|
+
|
602
|
+
def cleanup_kubectl_ssh_tunnel(context_name):
|
603
|
+
"""Clean up the SSH tunnel for a specific context"""
|
604
|
+
progress_message(f'Cleaning up SSH tunnel for context {context_name}...')
|
605
|
+
|
606
|
+
# Path to cleanup script
|
607
|
+
cleanup_script = os.path.join(SCRIPT_DIR, 'cleanup-tunnel.sh')
|
608
|
+
|
609
|
+
# Make sure script is executable
|
610
|
+
if os.path.exists(cleanup_script):
|
611
|
+
os.chmod(cleanup_script, 0o755)
|
612
|
+
|
613
|
+
# Run the cleanup script
|
614
|
+
subprocess.run([cleanup_script, context_name],
|
615
|
+
stdout=subprocess.DEVNULL,
|
616
|
+
stderr=subprocess.DEVNULL,
|
617
|
+
check=False)
|
618
|
+
|
619
|
+
success_message(f'SSH tunnel for context {context_name} cleaned up')
|
620
|
+
else:
|
621
|
+
print(f'{YELLOW}Cleanup script not found: {cleanup_script}{NC}')
|
622
|
+
|
623
|
+
|
624
|
+
def main():
|
625
|
+
args = parse_args()
|
626
|
+
|
627
|
+
kubeconfig_path = os.path.expanduser(args.kubeconfig_path)
|
628
|
+
global_use_ssh_config = args.use_ssh_config
|
629
|
+
|
630
|
+
# Print cleanup mode marker if applicable
|
631
|
+
if args.cleanup:
|
632
|
+
print('SKYPILOT_CLEANUP_MODE: Cleanup mode activated')
|
633
|
+
|
634
|
+
# Check if using YAML configuration or command line arguments
|
635
|
+
if args.ips_file:
|
636
|
+
# Using command line arguments - legacy mode
|
637
|
+
if args.ssh_key and not os.path.isfile(
|
638
|
+
args.ssh_key) and not global_use_ssh_config:
|
639
|
+
print(f'{RED}Error: SSH key not found: {args.ssh_key}{NC}',
|
640
|
+
file=sys.stderr)
|
641
|
+
sys.exit(1)
|
642
|
+
|
643
|
+
if not os.path.isfile(args.ips_file):
|
644
|
+
print(f'{RED}Error: IPs file not found: {args.ips_file}{NC}',
|
645
|
+
file=sys.stderr)
|
646
|
+
sys.exit(1)
|
647
|
+
|
648
|
+
with open(args.ips_file, 'r', encoding='utf-8') as f:
|
649
|
+
hosts = [line.strip() for line in f if line.strip()]
|
650
|
+
|
651
|
+
if not hosts:
|
652
|
+
print(
|
653
|
+
f'{RED}Error: Hosts file is empty or not formatted correctly.{NC}',
|
654
|
+
file=sys.stderr)
|
655
|
+
sys.exit(1)
|
656
|
+
|
657
|
+
head_node = hosts[0]
|
658
|
+
worker_nodes = hosts[1:]
|
659
|
+
ssh_user = args.user if not global_use_ssh_config else ''
|
660
|
+
ssh_key = args.ssh_key if not global_use_ssh_config else ''
|
661
|
+
context_name = args.context_name
|
662
|
+
password = args.password
|
663
|
+
|
664
|
+
# Check if hosts are in SSH config
|
665
|
+
head_use_ssh_config = global_use_ssh_config or check_host_in_ssh_config(
|
666
|
+
head_node)
|
667
|
+
worker_use_ssh_config = [
|
668
|
+
global_use_ssh_config or check_host_in_ssh_config(node)
|
669
|
+
for node in worker_nodes
|
670
|
+
]
|
671
|
+
|
672
|
+
# Single cluster deployment for legacy mode
|
673
|
+
deploy_cluster(head_node, worker_nodes, ssh_user, ssh_key, context_name,
|
674
|
+
password, head_use_ssh_config, worker_use_ssh_config,
|
675
|
+
kubeconfig_path, args.cleanup)
|
676
|
+
else:
|
677
|
+
# Using YAML configuration
|
678
|
+
targets = load_ssh_targets(args.ssh_node_pools_file)
|
679
|
+
clusters_config = get_cluster_config(targets,
|
680
|
+
args.infra,
|
681
|
+
file_path=args.ssh_node_pools_file)
|
682
|
+
|
683
|
+
# Print information about clusters being processed
|
684
|
+
num_clusters = len(clusters_config)
|
685
|
+
cluster_names = list(clusters_config.keys())
|
686
|
+
cluster_info = f'Found {num_clusters} Node Pool{"s" if num_clusters > 1 else ""}: {", ".join(cluster_names)}'
|
687
|
+
print(f'SKYPILOT_CLUSTER_INFO: {cluster_info}')
|
688
|
+
|
689
|
+
# Process each cluster
|
690
|
+
for cluster_name, cluster_config in clusters_config.items():
|
691
|
+
print(f'SKYPILOT_CURRENT_CLUSTER: {cluster_name}')
|
692
|
+
print(f'{YELLOW}==== Deploying cluster: {cluster_name} ===={NC}')
|
693
|
+
hosts_info = prepare_hosts_info(cluster_config)
|
694
|
+
|
695
|
+
if not hosts_info:
|
696
|
+
print(
|
697
|
+
f'{RED}Error: No valid hosts found for cluster {cluster_name!r}. Skipping.{NC}'
|
698
|
+
)
|
699
|
+
continue
|
700
|
+
|
701
|
+
# Generate a unique context name for each cluster
|
702
|
+
context_name = args.context_name
|
703
|
+
if context_name == 'default':
|
704
|
+
context_name = 'ssh-' + cluster_name
|
705
|
+
|
706
|
+
# Check cluster history
|
707
|
+
os.makedirs(NODE_POOLS_INFO_DIR, exist_ok=True)
|
708
|
+
history_yaml_file = os.path.join(NODE_POOLS_INFO_DIR,
|
709
|
+
f'{context_name}-history.yaml')
|
710
|
+
|
711
|
+
history = None
|
712
|
+
if os.path.exists(history_yaml_file):
|
713
|
+
print(f'{YELLOW}Loading history from {history_yaml_file}{NC}')
|
714
|
+
with open(history_yaml_file, 'r', encoding='utf-8') as f:
|
715
|
+
history = yaml.safe_load(f)
|
716
|
+
else:
|
717
|
+
print(f'{YELLOW}No history found for {context_name}.{NC}')
|
718
|
+
|
719
|
+
history_workers_info = None
|
720
|
+
history_worker_nodes = None
|
721
|
+
history_use_ssh_config = None
|
722
|
+
# Changing anything besides hosts is not supported for now.
|
723
|
+
if history is not None:
|
724
|
+
for key in ['user', 'identity_file', 'password']:
|
725
|
+
if history.get(key) != cluster_config.get(key):
|
726
|
+
raise ValueError(
|
727
|
+
f'Cluster configuration has changed for field {key!r}. '
|
728
|
+
f'Previous value: {history.get(key)}, '
|
729
|
+
f'Current value: {cluster_config.get(key)}')
|
730
|
+
history_hosts_info = prepare_hosts_info(history)
|
731
|
+
if history_hosts_info[0] != hosts_info[0]:
|
732
|
+
raise ValueError(
|
733
|
+
f'Cluster configuration has changed for master node. '
|
734
|
+
f'Previous value: {history_hosts_info[0]}, '
|
735
|
+
f'Current value: {hosts_info[0]}')
|
736
|
+
history_workers_info = history_hosts_info[1:] if len(
|
737
|
+
history_hosts_info) > 1 else []
|
738
|
+
history_worker_nodes = [h['ip'] for h in history_workers_info]
|
739
|
+
history_use_ssh_config = [
|
740
|
+
h.get('use_ssh_config', False) for h in history_workers_info
|
741
|
+
]
|
742
|
+
|
743
|
+
# Use the first host as the head node and the rest as worker nodes
|
744
|
+
head_host = hosts_info[0]
|
745
|
+
worker_hosts = hosts_info[1:] if len(hosts_info) > 1 else []
|
746
|
+
|
747
|
+
head_node = head_host['ip']
|
748
|
+
worker_nodes = [h['ip'] for h in worker_hosts]
|
749
|
+
ssh_user = head_host['user']
|
750
|
+
ssh_key = head_host['identity_file']
|
751
|
+
head_use_ssh_config = global_use_ssh_config or head_host.get(
|
752
|
+
'use_ssh_config', False)
|
753
|
+
worker_use_ssh_config = [
|
754
|
+
global_use_ssh_config or h.get('use_ssh_config', False)
|
755
|
+
for h in worker_hosts
|
756
|
+
]
|
757
|
+
password = head_host['password']
|
758
|
+
|
759
|
+
# Deploy this cluster
|
760
|
+
unsuccessful_workers = deploy_cluster(
|
761
|
+
head_node,
|
762
|
+
worker_nodes,
|
763
|
+
ssh_user,
|
764
|
+
ssh_key,
|
765
|
+
context_name,
|
766
|
+
password,
|
767
|
+
head_use_ssh_config,
|
768
|
+
worker_use_ssh_config,
|
769
|
+
kubeconfig_path,
|
770
|
+
args.cleanup,
|
771
|
+
worker_hosts=worker_hosts,
|
772
|
+
history_worker_nodes=history_worker_nodes,
|
773
|
+
history_workers_info=history_workers_info,
|
774
|
+
history_use_ssh_config=history_use_ssh_config)
|
775
|
+
|
776
|
+
if not args.cleanup:
|
777
|
+
successful_hosts = []
|
778
|
+
for host in cluster_config['hosts']:
|
779
|
+
if isinstance(host, str):
|
780
|
+
host_node = host
|
781
|
+
else:
|
782
|
+
host_node = host['ip']
|
783
|
+
if host_node not in unsuccessful_workers:
|
784
|
+
successful_hosts.append(host)
|
785
|
+
cluster_config['hosts'] = successful_hosts
|
786
|
+
with open(history_yaml_file, 'w', encoding='utf-8') as f:
|
787
|
+
print(f'{YELLOW}Writing history to {history_yaml_file}{NC}')
|
788
|
+
yaml.dump(cluster_config, f)
|
789
|
+
|
790
|
+
print(
|
791
|
+
f'{GREEN}==== Completed deployment for cluster: {cluster_name} ===={NC}'
|
792
|
+
)
|
793
|
+
|
794
|
+
|
795
|
+
def deploy_cluster(head_node,
|
796
|
+
worker_nodes,
|
797
|
+
ssh_user,
|
798
|
+
ssh_key,
|
799
|
+
context_name,
|
800
|
+
password,
|
801
|
+
head_use_ssh_config,
|
802
|
+
worker_use_ssh_config,
|
803
|
+
kubeconfig_path,
|
804
|
+
cleanup,
|
805
|
+
worker_hosts=None,
|
806
|
+
history_worker_nodes=None,
|
807
|
+
history_workers_info=None,
|
808
|
+
history_use_ssh_config=None) -> List[str]:
|
809
|
+
"""Deploy or clean up a single Kubernetes cluster.
|
810
|
+
|
811
|
+
Returns: List of unsuccessful worker nodes.
|
812
|
+
"""
|
813
|
+
# Ensure SSH key is expanded for paths with ~ (home directory)
|
814
|
+
if ssh_key:
|
815
|
+
ssh_key = os.path.expanduser(ssh_key)
|
816
|
+
|
817
|
+
history_yaml_file = os.path.join(NODE_POOLS_INFO_DIR,
|
818
|
+
f'{context_name}-history.yaml')
|
819
|
+
cert_file_path = os.path.join(NODE_POOLS_INFO_DIR,
|
820
|
+
f'{context_name}-cert.pem')
|
821
|
+
key_file_path = os.path.join(NODE_POOLS_INFO_DIR, f'{context_name}-key.pem')
|
822
|
+
tunnel_log_file_path = os.path.join(NODE_POOLS_INFO_DIR,
|
823
|
+
f'{context_name}-tunnel.log')
|
824
|
+
|
825
|
+
# Generate the askpass block if password is provided
|
826
|
+
askpass_block = create_askpass_script(password)
|
827
|
+
|
828
|
+
# Token for k3s
|
829
|
+
k3s_token = 'mytoken' # Any string can be used as the token
|
830
|
+
|
831
|
+
# Pre-flight checks
|
832
|
+
print(f'{YELLOW}Checking SSH connection to head node...{NC}')
|
833
|
+
result = run_remote(
|
834
|
+
head_node,
|
835
|
+
f'echo \'SSH connection successful ({head_node})\'',
|
836
|
+
ssh_user,
|
837
|
+
ssh_key,
|
838
|
+
use_ssh_config=head_use_ssh_config,
|
839
|
+
# For SkySSHUpLineProcessor
|
840
|
+
print_output=True)
|
841
|
+
if result is None:
|
842
|
+
print(
|
843
|
+
f'{RED}Failed to SSH to head node ({head_node}). '
|
844
|
+
f'Please check the SSH configuration.{NC}',
|
845
|
+
file=sys.stderr)
|
846
|
+
sys.exit(1)
|
847
|
+
|
848
|
+
# Checking history
|
849
|
+
history_exists = (history_worker_nodes is not None and
|
850
|
+
history_workers_info is not None and
|
851
|
+
history_use_ssh_config is not None)
|
852
|
+
|
853
|
+
# Cleanup history worker nodes
|
854
|
+
worker_nodes_to_cleanup = []
|
855
|
+
remove_worker_cmds = []
|
856
|
+
if history_exists:
|
857
|
+
for history_node, history_info, use_ssh_config in zip(
|
858
|
+
history_worker_nodes, history_workers_info,
|
859
|
+
history_use_ssh_config):
|
860
|
+
if worker_hosts is not None and history_info not in worker_hosts:
|
861
|
+
print(
|
862
|
+
f'{YELLOW}Worker node {history_node} not found in YAML config. '
|
863
|
+
f'Removing from history...{NC}')
|
864
|
+
worker_nodes_to_cleanup.append(
|
865
|
+
dict(
|
866
|
+
node=history_node,
|
867
|
+
user=ssh_user
|
868
|
+
if history_info is None else history_info['user'],
|
869
|
+
ssh_key=ssh_key if history_info is None else
|
870
|
+
history_info['identity_file'],
|
871
|
+
askpass_block=(askpass_block if history_info is None
|
872
|
+
else create_askpass_script(
|
873
|
+
history_info['password'])),
|
874
|
+
use_ssh_config=use_ssh_config,
|
875
|
+
))
|
876
|
+
remove_worker_cmds.append(
|
877
|
+
f'kubectl delete node -l skypilot-ip={history_node}')
|
878
|
+
# If this is a create operation and a stale log exists,
|
879
|
+
# remove it so a fresh file stores the new logs.
|
880
|
+
if not cleanup and os.path.exists(tunnel_log_file_path):
|
881
|
+
os.remove(tunnel_log_file_path)
|
882
|
+
|
883
|
+
# If --cleanup flag is set, uninstall k3s and exit
|
884
|
+
if cleanup:
|
885
|
+
# Pick up all nodes
|
886
|
+
worker_nodes_to_cleanup.clear()
|
887
|
+
for node, info, use_ssh_config in zip(worker_nodes, worker_hosts or [None] * len(worker_nodes),
|
888
|
+
worker_use_ssh_config):
|
889
|
+
worker_nodes_to_cleanup.append(
|
890
|
+
dict(
|
891
|
+
node=node,
|
892
|
+
user=ssh_user if info is None else info['user'],
|
893
|
+
ssh_key=ssh_key if info is None else info['identity_file'],
|
894
|
+
askpass_block=(askpass_block if info is None else
|
895
|
+
create_askpass_script(info['password'])),
|
896
|
+
use_ssh_config=use_ssh_config,
|
897
|
+
))
|
898
|
+
|
899
|
+
print(f'{YELLOW}Starting cleanup...{NC}')
|
900
|
+
|
901
|
+
# Clean up head node
|
902
|
+
cleanup_server_node(head_node,
|
903
|
+
ssh_user,
|
904
|
+
ssh_key,
|
905
|
+
askpass_block,
|
906
|
+
use_ssh_config=head_use_ssh_config)
|
907
|
+
# Clean up worker nodes
|
908
|
+
with cf.ThreadPoolExecutor() as executor:
|
909
|
+
executor.map(lambda kwargs: cleanup_agent_node(**kwargs),
|
910
|
+
worker_nodes_to_cleanup)
|
911
|
+
|
912
|
+
with cf.ThreadPoolExecutor() as executor:
|
913
|
+
|
914
|
+
def run_cleanup_cmd(cmd):
|
915
|
+
print('Cleaning up worker nodes:', cmd)
|
916
|
+
run_command(cmd, shell=True)
|
917
|
+
|
918
|
+
executor.map(run_cleanup_cmd, remove_worker_cmds)
|
919
|
+
|
920
|
+
if cleanup:
|
921
|
+
|
922
|
+
# Remove the context from local kubeconfig if it exists
|
923
|
+
if os.path.isfile(kubeconfig_path):
|
924
|
+
progress_message(
|
925
|
+
f'Removing context {context_name!r} from local kubeconfig...')
|
926
|
+
run_command(['kubectl', 'config', 'delete-context', context_name],
|
927
|
+
shell=False)
|
928
|
+
run_command(['kubectl', 'config', 'delete-cluster', context_name],
|
929
|
+
shell=False)
|
930
|
+
run_command(['kubectl', 'config', 'delete-user', context_name],
|
931
|
+
shell=False)
|
932
|
+
|
933
|
+
# Update the current context to the first available context
|
934
|
+
contexts = run_command([
|
935
|
+
'kubectl', 'config', 'view', '-o',
|
936
|
+
'jsonpath=\'{.contexts[0].name}\''
|
937
|
+
],
|
938
|
+
shell=False)
|
939
|
+
if contexts:
|
940
|
+
run_command(['kubectl', 'config', 'use-context', contexts],
|
941
|
+
shell=False)
|
942
|
+
else:
|
943
|
+
# If no context is available, simply unset the current context
|
944
|
+
run_command(['kubectl', 'config', 'unset', 'current-context'],
|
945
|
+
shell=False)
|
946
|
+
|
947
|
+
success_message(
|
948
|
+
f'Context {context_name!r} removed from local kubeconfig.')
|
949
|
+
|
950
|
+
for file in [history_yaml_file, cert_file_path, key_file_path]:
|
951
|
+
if os.path.exists(file):
|
952
|
+
os.remove(file)
|
953
|
+
|
954
|
+
# Clean up the SSH tunnel after cleaning up the kubeconfig, because kubectl
|
955
|
+
# would restart the SSH tunnel if it is not running.
|
956
|
+
cleanup_kubectl_ssh_tunnel(context_name)
|
957
|
+
|
958
|
+
print(f'{GREEN}Cleanup completed successfully.{NC}')
|
959
|
+
|
960
|
+
# Print completion marker for current cluster
|
961
|
+
print(f'{GREEN}SKYPILOT_CLUSTER_COMPLETED: {NC}')
|
962
|
+
|
963
|
+
return []
|
964
|
+
|
965
|
+
print(f'{YELLOW}Checking TCP Forwarding Options...{NC}')
|
966
|
+
cmd = (
|
967
|
+
'if [ "$(sudo sshd -T | grep allowtcpforwarding)" = "allowtcpforwarding yes" ]; then '
|
968
|
+
f'echo "TCP Forwarding already enabled on head node ({head_node})."; '
|
969
|
+
'else '
|
970
|
+
'sudo sed -i \'s/^#\?\s*AllowTcpForwarding.*/AllowTcpForwarding yes/\' ' # pylint: disable=anomalous-backslash-in-string
|
971
|
+
'/etc/ssh/sshd_config && sudo systemctl restart sshd && '
|
972
|
+
f'echo "Successfully enabled TCP Forwarding on head node ({head_node})."; '
|
973
|
+
'fi')
|
974
|
+
result = run_remote(
|
975
|
+
head_node,
|
976
|
+
shlex.quote(cmd),
|
977
|
+
ssh_user,
|
978
|
+
ssh_key,
|
979
|
+
use_ssh_config=head_use_ssh_config,
|
980
|
+
# For SkySSHUpLineProcessor
|
981
|
+
print_output=True,
|
982
|
+
use_shell=True)
|
983
|
+
if result is None:
|
984
|
+
print(
|
985
|
+
f'{RED}Failed to setup TCP forwarding on head node ({head_node}). '
|
986
|
+
f'Please check the SSH configuration.{NC}',
|
987
|
+
file=sys.stderr)
|
988
|
+
|
989
|
+
# Get effective IP for master node if using SSH config - needed for workers to connect
|
990
|
+
if head_use_ssh_config:
|
991
|
+
effective_master_ip = get_effective_host_ip(head_node)
|
992
|
+
print(
|
993
|
+
f'{GREEN}Resolved head node {head_node} to {effective_master_ip} from SSH config{NC}'
|
994
|
+
)
|
995
|
+
else:
|
996
|
+
effective_master_ip = head_node
|
997
|
+
|
998
|
+
# Step 1: Install k3s on the head node
|
999
|
+
# Check if head node has a GPU
|
1000
|
+
install_gpu = False
|
1001
|
+
progress_message(f'Deploying Kubernetes on head node ({head_node})...')
|
1002
|
+
cmd = f"""
|
1003
|
+
{askpass_block}
|
1004
|
+
curl -sfL https://get.k3s.io | K3S_TOKEN={k3s_token} K3S_NODE_NAME={head_node} sudo -E -A sh - &&
|
1005
|
+
mkdir -p ~/.kube &&
|
1006
|
+
sudo -A cp /etc/rancher/k3s/k3s.yaml ~/.kube/config &&
|
1007
|
+
sudo -A chown $(id -u):$(id -g) ~/.kube/config &&
|
1008
|
+
for i in {{1..3}}; do
|
1009
|
+
if kubectl wait --for=condition=ready node --all --timeout=2m --kubeconfig ~/.kube/config; then
|
1010
|
+
break
|
1011
|
+
else
|
1012
|
+
echo 'Waiting for nodes to be ready...'
|
1013
|
+
sleep 5
|
1014
|
+
fi
|
1015
|
+
done
|
1016
|
+
if [ $i -eq 3 ]; then
|
1017
|
+
echo 'Failed to wait for nodes to be ready after 3 attempts'
|
1018
|
+
exit 1
|
1019
|
+
fi
|
1020
|
+
"""
|
1021
|
+
result = run_remote(head_node,
|
1022
|
+
cmd,
|
1023
|
+
ssh_user,
|
1024
|
+
ssh_key,
|
1025
|
+
use_ssh_config=head_use_ssh_config)
|
1026
|
+
if result is None:
|
1027
|
+
print(f'{RED}Failed to deploy K3s on head node ({head_node}). {NC}',
|
1028
|
+
file=sys.stderr)
|
1029
|
+
sys.exit(1)
|
1030
|
+
success_message(f'K3s deployed on head node ({head_node}).')
|
1031
|
+
|
1032
|
+
# Check if head node has a GPU
|
1033
|
+
install_gpu = False
|
1034
|
+
if check_gpu(head_node,
|
1035
|
+
ssh_user,
|
1036
|
+
ssh_key,
|
1037
|
+
use_ssh_config=head_use_ssh_config):
|
1038
|
+
print(f'{YELLOW}GPU detected on head node ({head_node}).{NC}')
|
1039
|
+
install_gpu = True
|
1040
|
+
|
1041
|
+
# Fetch the head node's internal IP (this will be passed to worker nodes)
|
1042
|
+
master_addr = run_remote(head_node,
|
1043
|
+
'hostname -I | awk \'{print $1}\'',
|
1044
|
+
ssh_user,
|
1045
|
+
ssh_key,
|
1046
|
+
use_ssh_config=head_use_ssh_config)
|
1047
|
+
if master_addr is None:
|
1048
|
+
print(
|
1049
|
+
f'{RED}Failed to SSH to head node ({head_node}). '
|
1050
|
+
f'Please check the SSH configuration.{NC}',
|
1051
|
+
file=sys.stderr)
|
1052
|
+
sys.exit(1)
|
1053
|
+
print(f'{GREEN}Master node internal IP: {master_addr}{NC}')
|
1054
|
+
|
1055
|
+
# Step 2: Install k3s on worker nodes and join them to the master node
|
1056
|
+
def deploy_worker(args):
|
1057
|
+
(i, node, worker_hosts, history_workers_info, ssh_user, ssh_key,
|
1058
|
+
askpass_block, worker_use_ssh_config, master_addr, k3s_token) = args
|
1059
|
+
progress_message(f'Deploying Kubernetes on worker node ({node})...')
|
1060
|
+
|
1061
|
+
# If using YAML config with specific worker info
|
1062
|
+
if worker_hosts and i < len(worker_hosts):
|
1063
|
+
if history_workers_info is not None and worker_hosts[
|
1064
|
+
i] in history_workers_info:
|
1065
|
+
print(
|
1066
|
+
f'{YELLOW}Worker node ({node}) already exists in history. '
|
1067
|
+
f'Skipping...{NC}')
|
1068
|
+
return node, True, False
|
1069
|
+
worker_user = worker_hosts[i]['user']
|
1070
|
+
worker_key = worker_hosts[i]['identity_file']
|
1071
|
+
worker_password = worker_hosts[i]['password']
|
1072
|
+
worker_askpass = create_askpass_script(worker_password)
|
1073
|
+
worker_config = worker_use_ssh_config[i]
|
1074
|
+
else:
|
1075
|
+
worker_user = ssh_user
|
1076
|
+
worker_key = ssh_key
|
1077
|
+
worker_askpass = askpass_block
|
1078
|
+
worker_config = worker_use_ssh_config[i]
|
1079
|
+
|
1080
|
+
return start_agent_node(node,
|
1081
|
+
master_addr,
|
1082
|
+
k3s_token,
|
1083
|
+
worker_user,
|
1084
|
+
worker_key,
|
1085
|
+
worker_askpass,
|
1086
|
+
use_ssh_config=worker_config)
|
1087
|
+
|
1088
|
+
unsuccessful_workers = []
|
1089
|
+
|
1090
|
+
# Deploy workers in parallel using thread pool
|
1091
|
+
with cf.ThreadPoolExecutor() as executor:
|
1092
|
+
futures = []
|
1093
|
+
for i, node in enumerate(worker_nodes):
|
1094
|
+
args = (i, node, worker_hosts, history_workers_info, ssh_user,
|
1095
|
+
ssh_key, askpass_block, worker_use_ssh_config, master_addr,
|
1096
|
+
k3s_token)
|
1097
|
+
futures.append(executor.submit(deploy_worker, args))
|
1098
|
+
|
1099
|
+
# Check if worker node has a GPU
|
1100
|
+
for future in cf.as_completed(futures):
|
1101
|
+
node, suc, has_gpu = future.result()
|
1102
|
+
install_gpu = install_gpu or has_gpu
|
1103
|
+
if not suc:
|
1104
|
+
unsuccessful_workers.append(node)
|
1105
|
+
|
1106
|
+
# Step 3: Configure local kubectl to connect to the cluster
|
1107
|
+
progress_message('Configuring local kubectl to connect to the cluster...')
|
1108
|
+
|
1109
|
+
# Create temporary directory for kubeconfig operations
|
1110
|
+
with tempfile.TemporaryDirectory() as temp_dir:
|
1111
|
+
temp_kubeconfig = os.path.join(temp_dir, 'kubeconfig')
|
1112
|
+
|
1113
|
+
# Get the kubeconfig from remote server
|
1114
|
+
if head_use_ssh_config:
|
1115
|
+
scp_cmd = ['scp', head_node + ':~/.kube/config', temp_kubeconfig]
|
1116
|
+
else:
|
1117
|
+
scp_cmd = [
|
1118
|
+
'scp', '-o', 'StrictHostKeyChecking=no', '-o',
|
1119
|
+
'IdentitiesOnly=yes', '-i', ssh_key,
|
1120
|
+
f'{ssh_user}@{head_node}:~/.kube/config', temp_kubeconfig
|
1121
|
+
]
|
1122
|
+
run_command(scp_cmd, shell=False)
|
1123
|
+
|
1124
|
+
# Create the directory for the kubeconfig file if it doesn't exist
|
1125
|
+
ensure_directory_exists(kubeconfig_path)
|
1126
|
+
|
1127
|
+
# Create empty kubeconfig if it doesn't exist
|
1128
|
+
if not os.path.isfile(kubeconfig_path):
|
1129
|
+
open(kubeconfig_path, 'a', encoding='utf-8').close()
|
1130
|
+
|
1131
|
+
# Modify the temporary kubeconfig to update server address and context name
|
1132
|
+
modified_config = os.path.join(temp_dir, 'modified_config')
|
1133
|
+
with open(temp_kubeconfig, 'r', encoding='utf-8') as f_in:
|
1134
|
+
with open(modified_config, 'w', encoding='utf-8') as f_out:
|
1135
|
+
in_cluster = False
|
1136
|
+
in_user = False
|
1137
|
+
client_cert_data = None
|
1138
|
+
client_key_data = None
|
1139
|
+
|
1140
|
+
for line in f_in:
|
1141
|
+
if 'clusters:' in line:
|
1142
|
+
in_cluster = True
|
1143
|
+
in_user = False
|
1144
|
+
elif 'users:' in line:
|
1145
|
+
in_cluster = False
|
1146
|
+
in_user = True
|
1147
|
+
elif 'contexts:' in line:
|
1148
|
+
in_cluster = False
|
1149
|
+
in_user = False
|
1150
|
+
|
1151
|
+
# Skip certificate authority data in cluster section
|
1152
|
+
if in_cluster and 'certificate-authority-data:' in line:
|
1153
|
+
continue
|
1154
|
+
# Skip client certificate data in user section but extract it
|
1155
|
+
elif in_user and 'client-certificate-data:' in line:
|
1156
|
+
client_cert_data = line.split(':', 1)[1].strip()
|
1157
|
+
continue
|
1158
|
+
# Skip client key data in user section but extract it
|
1159
|
+
elif in_user and 'client-key-data:' in line:
|
1160
|
+
client_key_data = line.split(':', 1)[1].strip()
|
1161
|
+
continue
|
1162
|
+
elif in_cluster and 'server:' in line:
|
1163
|
+
# Initially just set to the effective master IP
|
1164
|
+
# (will be changed to localhost by setup_kubectl_ssh_tunnel later)
|
1165
|
+
f_out.write(
|
1166
|
+
f' server: https://{effective_master_ip}:6443\n')
|
1167
|
+
f_out.write(' insecure-skip-tls-verify: true\n')
|
1168
|
+
continue
|
1169
|
+
|
1170
|
+
# Replace default context names with user-provided context name
|
1171
|
+
line = line.replace('name: default',
|
1172
|
+
f'name: {context_name}')
|
1173
|
+
line = line.replace('cluster: default',
|
1174
|
+
f'cluster: {context_name}')
|
1175
|
+
line = line.replace('user: default',
|
1176
|
+
f'user: {context_name}')
|
1177
|
+
line = line.replace('current-context: default',
|
1178
|
+
f'current-context: {context_name}')
|
1179
|
+
|
1180
|
+
f_out.write(line)
|
1181
|
+
|
1182
|
+
# Save certificate data if available
|
1183
|
+
|
1184
|
+
if client_cert_data:
|
1185
|
+
# Decode base64 data and save as PEM
|
1186
|
+
try:
|
1187
|
+
# Clean up the certificate data by removing whitespace
|
1188
|
+
clean_cert_data = ''.join(client_cert_data.split())
|
1189
|
+
cert_pem = base64.b64decode(clean_cert_data).decode(
|
1190
|
+
'utf-8')
|
1191
|
+
|
1192
|
+
# Check if the data already looks like a PEM file
|
1193
|
+
has_begin = '-----BEGIN CERTIFICATE-----' in cert_pem
|
1194
|
+
has_end = '-----END CERTIFICATE-----' in cert_pem
|
1195
|
+
|
1196
|
+
if not has_begin or not has_end:
|
1197
|
+
print(
|
1198
|
+
f'{YELLOW}Warning: Certificate data missing PEM markers, attempting to fix...{NC}'
|
1199
|
+
)
|
1200
|
+
# Add PEM markers if missing
|
1201
|
+
if not has_begin:
|
1202
|
+
cert_pem = f'-----BEGIN CERTIFICATE-----\n{cert_pem}'
|
1203
|
+
if not has_end:
|
1204
|
+
cert_pem = f'{cert_pem}\n-----END CERTIFICATE-----'
|
1205
|
+
|
1206
|
+
# Write the certificate
|
1207
|
+
with open(cert_file_path, 'w',
|
1208
|
+
encoding='utf-8') as cert_file:
|
1209
|
+
cert_file.write(cert_pem)
|
1210
|
+
|
1211
|
+
# Verify the file was written correctly
|
1212
|
+
if os.path.getsize(cert_file_path) > 0:
|
1213
|
+
print(
|
1214
|
+
f'{GREEN}Successfully saved certificate data ({len(cert_pem)} bytes){NC}'
|
1215
|
+
)
|
1216
|
+
|
1217
|
+
# Quick validation of PEM format
|
1218
|
+
with open(cert_file_path, 'r',
|
1219
|
+
encoding='utf-8') as f:
|
1220
|
+
content = f.readlines()
|
1221
|
+
first_line = content[0].strip(
|
1222
|
+
) if content else ''
|
1223
|
+
last_line = content[-1].strip(
|
1224
|
+
) if content else ''
|
1225
|
+
|
1226
|
+
if not first_line.startswith(
|
1227
|
+
'-----BEGIN') or not last_line.startswith(
|
1228
|
+
'-----END'):
|
1229
|
+
print(
|
1230
|
+
f'{YELLOW}Warning: Certificate may not be in proper PEM format{NC}'
|
1231
|
+
)
|
1232
|
+
else:
|
1233
|
+
print(f'{RED}Error: Certificate file is empty{NC}')
|
1234
|
+
except Exception as e: # pylint: disable=broad-except
|
1235
|
+
print(
|
1236
|
+
f'{RED}Error processing certificate data: {e}{NC}')
|
1237
|
+
|
1238
|
+
if client_key_data:
|
1239
|
+
# Decode base64 data and save as PEM
|
1240
|
+
try:
|
1241
|
+
# Clean up the key data by removing whitespace
|
1242
|
+
clean_key_data = ''.join(client_key_data.split())
|
1243
|
+
key_pem = base64.b64decode(clean_key_data).decode(
|
1244
|
+
'utf-8')
|
1245
|
+
|
1246
|
+
# Check if the data already looks like a PEM file
|
1247
|
+
|
1248
|
+
# Check for EC key format
|
1249
|
+
if 'EC PRIVATE KEY' in key_pem:
|
1250
|
+
# Handle EC KEY format directly
|
1251
|
+
match_ec = re.search(
|
1252
|
+
r'-----BEGIN EC PRIVATE KEY-----(.*?)-----END EC PRIVATE KEY-----',
|
1253
|
+
key_pem, re.DOTALL)
|
1254
|
+
if match_ec:
|
1255
|
+
# Extract and properly format EC key
|
1256
|
+
key_content = match_ec.group(1).strip()
|
1257
|
+
key_pem = f'-----BEGIN EC PRIVATE KEY-----\n{key_content}\n-----END EC PRIVATE KEY-----'
|
1258
|
+
else:
|
1259
|
+
# Extract content and assume EC format
|
1260
|
+
key_content = re.sub(r'-----BEGIN.*?-----', '',
|
1261
|
+
key_pem)
|
1262
|
+
key_content = re.sub(r'-----END.*?-----.*', '',
|
1263
|
+
key_content).strip()
|
1264
|
+
key_pem = f'-----BEGIN EC PRIVATE KEY-----\n{key_content}\n-----END EC PRIVATE KEY-----'
|
1265
|
+
else:
|
1266
|
+
# Handle regular private key format
|
1267
|
+
has_begin = any(marker in key_pem for marker in [
|
1268
|
+
'-----BEGIN PRIVATE KEY-----',
|
1269
|
+
'-----BEGIN RSA PRIVATE KEY-----'
|
1270
|
+
])
|
1271
|
+
has_end = any(marker in key_pem for marker in [
|
1272
|
+
'-----END PRIVATE KEY-----',
|
1273
|
+
'-----END RSA PRIVATE KEY-----'
|
1274
|
+
])
|
1275
|
+
|
1276
|
+
if not has_begin or not has_end:
|
1277
|
+
print(
|
1278
|
+
f'{YELLOW}Warning: Key data missing PEM markers, attempting to fix...{NC}'
|
1279
|
+
)
|
1280
|
+
# Add PEM markers if missing
|
1281
|
+
if not has_begin:
|
1282
|
+
key_pem = f'-----BEGIN PRIVATE KEY-----\n{key_pem}'
|
1283
|
+
if not has_end:
|
1284
|
+
key_pem = f'{key_pem}\n-----END PRIVATE KEY-----'
|
1285
|
+
# Remove any trailing characters after END marker
|
1286
|
+
key_pem = re.sub(
|
1287
|
+
r'(-----END PRIVATE KEY-----).*', r'\1',
|
1288
|
+
key_pem)
|
1289
|
+
|
1290
|
+
# Write the key
|
1291
|
+
with open(key_file_path, 'w',
|
1292
|
+
encoding='utf-8') as key_file:
|
1293
|
+
key_file.write(key_pem)
|
1294
|
+
|
1295
|
+
# Verify the file was written correctly
|
1296
|
+
if os.path.getsize(key_file_path) > 0:
|
1297
|
+
print(
|
1298
|
+
f'{GREEN}Successfully saved key data ({len(key_pem)} bytes){NC}'
|
1299
|
+
)
|
1300
|
+
|
1301
|
+
# Quick validation of PEM format
|
1302
|
+
with open(key_file_path, 'r',
|
1303
|
+
encoding='utf-8') as f:
|
1304
|
+
content = f.readlines()
|
1305
|
+
first_line = content[0].strip(
|
1306
|
+
) if content else ''
|
1307
|
+
last_line = content[-1].strip(
|
1308
|
+
) if content else ''
|
1309
|
+
|
1310
|
+
if not first_line.startswith(
|
1311
|
+
'-----BEGIN') or not last_line.startswith(
|
1312
|
+
'-----END'):
|
1313
|
+
print(
|
1314
|
+
f'{YELLOW}Warning: Key may not be in proper PEM format{NC}'
|
1315
|
+
)
|
1316
|
+
else:
|
1317
|
+
print(f'{RED}Error: Key file is empty{NC}')
|
1318
|
+
except Exception as e: # pylint: disable=broad-except
|
1319
|
+
print(f'{RED}Error processing key data: {e}{NC}')
|
1320
|
+
|
1321
|
+
# First check if context name exists and delete it if it does
|
1322
|
+
# TODO(romilb): Should we throw an error here instead?
|
1323
|
+
run_command(['kubectl', 'config', 'delete-context', context_name],
|
1324
|
+
shell=False)
|
1325
|
+
run_command(['kubectl', 'config', 'delete-cluster', context_name],
|
1326
|
+
shell=False)
|
1327
|
+
run_command(['kubectl', 'config', 'delete-user', context_name],
|
1328
|
+
shell=False)
|
1329
|
+
|
1330
|
+
# Merge the configurations using kubectl
|
1331
|
+
merged_config = os.path.join(temp_dir, 'merged_config')
|
1332
|
+
os.environ['KUBECONFIG'] = f'{kubeconfig_path}:{modified_config}'
|
1333
|
+
with open(merged_config, 'w', encoding='utf-8') as merged_file:
|
1334
|
+
kubectl_cmd = ['kubectl', 'config', 'view', '--flatten']
|
1335
|
+
result = run_command(kubectl_cmd, shell=False)
|
1336
|
+
if result:
|
1337
|
+
merged_file.write(result)
|
1338
|
+
|
1339
|
+
# Replace the kubeconfig with the merged config
|
1340
|
+
os.replace(merged_config, kubeconfig_path)
|
1341
|
+
|
1342
|
+
# Set the new context as the current context
|
1343
|
+
run_command(['kubectl', 'config', 'use-context', context_name],
|
1344
|
+
shell=False)
|
1345
|
+
|
1346
|
+
# Always set up SSH tunnel since we assume only port 22 is accessible
|
1347
|
+
setup_kubectl_ssh_tunnel(head_node,
|
1348
|
+
ssh_user,
|
1349
|
+
ssh_key,
|
1350
|
+
context_name,
|
1351
|
+
use_ssh_config=head_use_ssh_config)
|
1352
|
+
|
1353
|
+
success_message(f'kubectl configured with new context \'{context_name}\'.')
|
1354
|
+
|
1355
|
+
print(
|
1356
|
+
f'Cluster deployment completed. Kubeconfig saved to {kubeconfig_path}')
|
1357
|
+
print('You can now run \'kubectl get nodes\' to verify the setup.')
|
1358
|
+
|
1359
|
+
# Install GPU operator if a GPU was detected on any node
|
1360
|
+
if install_gpu:
|
1361
|
+
print(
|
1362
|
+
f'{YELLOW}GPU detected in the cluster. Installing Nvidia GPU Operator...{NC}'
|
1363
|
+
)
|
1364
|
+
cmd = f"""
|
1365
|
+
{askpass_block}
|
1366
|
+
curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
|
1367
|
+
chmod 700 get_helm.sh &&
|
1368
|
+
./get_helm.sh &&
|
1369
|
+
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update &&
|
1370
|
+
kubectl create namespace gpu-operator --kubeconfig ~/.kube/config || true &&
|
1371
|
+
sudo -A ln -s /sbin/ldconfig /sbin/ldconfig.real || true &&
|
1372
|
+
helm install gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator \\
|
1373
|
+
--set 'toolkit.env[0].name=CONTAINERD_CONFIG' \\
|
1374
|
+
--set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \\
|
1375
|
+
--set 'toolkit.env[1].name=CONTAINERD_SOCKET' \\
|
1376
|
+
--set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \\
|
1377
|
+
--set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \\
|
1378
|
+
--set 'toolkit.env[2].value=nvidia' &&
|
1379
|
+
echo 'Waiting for GPU operator installation...' &&
|
1380
|
+
while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:' || ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu.product'; do
|
1381
|
+
echo 'Waiting for GPU operator...'
|
1382
|
+
sleep 5
|
1383
|
+
done
|
1384
|
+
echo 'GPU operator installed successfully.'
|
1385
|
+
"""
|
1386
|
+
result = run_remote(head_node,
|
1387
|
+
cmd,
|
1388
|
+
ssh_user,
|
1389
|
+
ssh_key,
|
1390
|
+
use_ssh_config=head_use_ssh_config)
|
1391
|
+
if result is None:
|
1392
|
+
print(f'{RED}Failed to install GPU Operator.{NC}')
|
1393
|
+
else:
|
1394
|
+
success_message('GPU Operator installed.')
|
1395
|
+
else:
|
1396
|
+
print(
|
1397
|
+
f'{YELLOW}No GPUs detected. Skipping GPU Operator installation.{NC}'
|
1398
|
+
)
|
1399
|
+
|
1400
|
+
# Configure SkyPilot
|
1401
|
+
progress_message('Configuring SkyPilot...')
|
1402
|
+
|
1403
|
+
# The env var KUBECONFIG ensures sky check uses the right kubeconfig
|
1404
|
+
os.environ['KUBECONFIG'] = kubeconfig_path
|
1405
|
+
run_command(['sky', 'check', 'kubernetes'], shell=False)
|
1406
|
+
|
1407
|
+
success_message('SkyPilot configured successfully.')
|
1408
|
+
|
1409
|
+
# Display final success message
|
1410
|
+
print(
|
1411
|
+
f'{GREEN}==== 🎉 Kubernetes cluster deployment completed successfully 🎉 ===={NC}'
|
1412
|
+
)
|
1413
|
+
print(
|
1414
|
+
'You can now interact with your Kubernetes cluster through SkyPilot: ')
|
1415
|
+
print(' • List available GPUs: sky show-gpus --cloud kubernetes')
|
1416
|
+
print(
|
1417
|
+
' • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes'
|
1418
|
+
)
|
1419
|
+
print(' • Connect to pod with VSCode: code --remote ssh-remote+devbox ')
|
1420
|
+
# Print completion marker for current cluster
|
1421
|
+
print(f'{GREEN}SKYPILOT_CLUSTER_COMPLETED: {NC}')
|
1422
|
+
|
1423
|
+
if unsuccessful_workers:
|
1424
|
+
quoted_unsuccessful_workers = [
|
1425
|
+
f'"{worker}"' for worker in unsuccessful_workers
|
1426
|
+
]
|
1427
|
+
|
1428
|
+
print(
|
1429
|
+
f'{WARNING_YELLOW}Failed to deploy Kubernetes on the following nodes: '
|
1430
|
+
f'{", ".join(quoted_unsuccessful_workers)}. Please check '
|
1431
|
+
f'the logs for more details.{NC}')
|
1432
|
+
|
1433
|
+
return unsuccessful_workers
|
1434
|
+
|
1435
|
+
|
1436
|
+
if __name__ == '__main__':
|
1437
|
+
main()
|