skypilot-nightly 1.0.0.dev20250522__py3-none-any.whl → 1.0.0.dev20250524__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +46 -16
  3. sky/backends/backend_utils.py +62 -45
  4. sky/backends/cloud_vm_ray_backend.py +19 -5
  5. sky/check.py +398 -171
  6. sky/cli.py +302 -98
  7. sky/client/cli.py +302 -98
  8. sky/client/sdk.py +104 -12
  9. sky/clouds/__init__.py +3 -0
  10. sky/clouds/aws.py +4 -2
  11. sky/clouds/azure.py +4 -2
  12. sky/clouds/cloud.py +24 -6
  13. sky/clouds/cudo.py +2 -1
  14. sky/clouds/do.py +2 -1
  15. sky/clouds/fluidstack.py +2 -1
  16. sky/clouds/gcp.py +23 -5
  17. sky/clouds/ibm.py +4 -2
  18. sky/clouds/kubernetes.py +66 -22
  19. sky/clouds/lambda_cloud.py +2 -1
  20. sky/clouds/nebius.py +18 -2
  21. sky/clouds/oci.py +4 -2
  22. sky/clouds/paperspace.py +2 -1
  23. sky/clouds/runpod.py +2 -1
  24. sky/clouds/scp.py +2 -1
  25. sky/clouds/service_catalog/constants.py +1 -1
  26. sky/clouds/service_catalog/ssh_catalog.py +167 -0
  27. sky/clouds/ssh.py +203 -0
  28. sky/clouds/vast.py +2 -1
  29. sky/clouds/vsphere.py +2 -1
  30. sky/core.py +58 -11
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/aHej19bZyl4hoHgrzPCn7/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/480-ee58038f1a4afd5c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/488-50d843fdb5396d32.js +15 -0
  35. sky/dashboard/out/_next/static/chunks/498-d7722313e5e5b4e6.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/573-f17bd89d9f9118b3.js +66 -0
  37. sky/dashboard/out/_next/static/chunks/578-7a4795009a56430c.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/734-5f5ce8f347b7f417.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/937.f97f83652028e944.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/938-f347f6144075b0c8.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/pages/_app-dec800f9ef1b10f4.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-37c042a356f8e608.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/pages/clusters-9e6d1ec6e1ac5b29.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/pages/infra-e690d864aa00e2ea.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-db6558a5ec687011.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/jobs-73d5e0c369d00346.js +16 -0
  49. sky/dashboard/out/_next/static/chunks/pages/users-2d319455c3f1c3e2.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/workspaces-02a7b60f2ead275f.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +1 -0
  52. sky/dashboard/out/_next/static/css/d2cdba64c9202dd7.css +3 -0
  53. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  54. sky/dashboard/out/clusters/[cluster].html +1 -1
  55. sky/dashboard/out/clusters.html +1 -1
  56. sky/dashboard/out/index.html +1 -1
  57. sky/dashboard/out/infra.html +1 -1
  58. sky/dashboard/out/jobs/[job].html +1 -1
  59. sky/dashboard/out/jobs.html +1 -1
  60. sky/dashboard/out/users.html +1 -0
  61. sky/dashboard/out/workspaces.html +1 -0
  62. sky/data/storage.py +1 -1
  63. sky/global_user_state.py +42 -19
  64. sky/jobs/constants.py +1 -1
  65. sky/jobs/server/core.py +72 -56
  66. sky/jobs/state.py +26 -5
  67. sky/jobs/utils.py +65 -13
  68. sky/optimizer.py +29 -7
  69. sky/provision/__init__.py +1 -0
  70. sky/provision/aws/instance.py +17 -1
  71. sky/provision/fluidstack/instance.py +1 -0
  72. sky/provision/kubernetes/instance.py +16 -5
  73. sky/provision/kubernetes/utils.py +37 -19
  74. sky/provision/nebius/instance.py +3 -1
  75. sky/provision/nebius/utils.py +14 -2
  76. sky/provision/ssh/__init__.py +18 -0
  77. sky/resources.py +4 -1
  78. sky/serve/server/core.py +9 -6
  79. sky/server/html/token_page.html +6 -1
  80. sky/server/requests/executor.py +1 -0
  81. sky/server/requests/payloads.py +18 -0
  82. sky/server/server.py +108 -5
  83. sky/setup_files/dependencies.py +1 -0
  84. sky/skylet/constants.py +4 -1
  85. sky/skypilot_config.py +83 -9
  86. sky/templates/nebius-ray.yml.j2 +12 -0
  87. sky/utils/cli_utils/status_utils.py +18 -8
  88. sky/utils/infra_utils.py +21 -1
  89. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  90. sky/utils/kubernetes/create_cluster.sh +1 -0
  91. sky/utils/kubernetes/deploy_remote_cluster.py +1440 -0
  92. sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
  93. sky/utils/kubernetes/ssh-tunnel.sh +387 -0
  94. sky/utils/log_utils.py +218 -1
  95. sky/utils/schemas.py +75 -0
  96. sky/utils/ux_utils.py +2 -1
  97. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/METADATA +6 -1
  98. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/RECORD +103 -91
  99. sky/dashboard/out/_next/static/CzOVV6JpRQBRt5GhZuhyK/_buildManifest.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +0 -6
  101. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  102. sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +0 -6
  103. sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/582-683f4f27b81996dc.js +0 -59
  106. sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +0 -1
  109. sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +0 -1
  110. sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
  111. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +0 -1
  112. sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  114. sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +0 -3
  115. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  116. /sky/dashboard/out/_next/static/{CzOVV6JpRQBRt5GhZuhyK → aHej19bZyl4hoHgrzPCn7}/_ssgManifest.js +0 -0
  117. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/WHEEL +0 -0
  118. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/entry_points.txt +0 -0
  119. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/licenses/LICENSE +0 -0
  120. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/top_level.txt +0 -0
sky/utils/kubernetes/deploy_remote_cluster.py (new file)
@@ -0,0 +1,1440 @@
1
+ """SSH-based Kubernetes Cluster Deployment Script"""
2
+ # Refer to https://docs.skypilot.co/en/latest/reservations/existing-machines.html for details on how to use this script. # pylint: disable=line-too-long
3
+ import argparse
4
+ import base64
5
+ import concurrent.futures as cf
6
+ import os
7
+ import random
8
+ import re
9
+ import shlex
10
+ import subprocess
11
+ import sys
12
+ import tempfile
13
+ from typing import Any, Dict, List, Optional, Set
14
+
15
+ import yaml
16
+
17
+ from sky.utils import ux_utils
18
+
19
+ # Colors for nicer UX
20
+ RED = '\033[0;31m'
21
+ GREEN = '\033[0;32m'
22
+ YELLOW = '\033[1;33m'
23
+ WARNING_YELLOW = '\x1b[33m'
24
+ NC = '\033[0m' # No color
25
+
26
+ DEFAULT_SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
27
+ DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
28
+ SSH_CONFIG_PATH = os.path.expanduser('~/.ssh/config')
29
+ NODE_POOLS_INFO_DIR = os.path.expanduser('~/.sky/ssh_node_pools_info')
30
+
31
+ # Get the directory of this script
32
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
33
+
34
+
35
+ class UniqueKeySafeLoader(yaml.SafeLoader):
36
+ """Custom YAML loader that raises an error if there are duplicate keys."""
37
+
38
+ def construct_mapping(self, node, deep=False):
39
+ mapping = {}
40
+ for key_node, value_node in node.value:
41
+ key = self.construct_object(key_node, deep=deep)
42
+ if key in mapping:
43
+ raise yaml.constructor.ConstructorError(
44
+ note=(f'Duplicate cluster config for cluster {key!r}.\n'
45
+ 'Please remove one of them from: '
46
+ f'{DEFAULT_SSH_NODE_POOLS_PATH}'))
47
+ value = self.construct_object(value_node, deep=deep)
48
+ mapping[key] = value
49
+ return mapping
50
+
51
+
52
+ # Register the custom constructor on the loader class.
53
+ UniqueKeySafeLoader.add_constructor(
54
+ yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
55
+ UniqueKeySafeLoader.construct_mapping)
56
+
57
+
58
+ def parse_args():
59
+ parser = argparse.ArgumentParser(
60
+ description='Deploy a Kubernetes cluster on remote machines.')
61
+ parser.add_argument(
62
+ '--infra', help='Name of the cluster in ssh_node_pools.yaml to use')
63
+ parser.add_argument(
64
+ '--ssh-node-pools-file',
65
+ dest='ssh_node_pools_file',
66
+ default=DEFAULT_SSH_NODE_POOLS_PATH,
67
+ help=
68
+ f'Path to SSH node pools YAML file (default: {DEFAULT_SSH_NODE_POOLS_PATH})'
69
+ )
70
+ parser.add_argument(
71
+ '--kubeconfig-path',
72
+ dest='kubeconfig_path',
73
+ default=DEFAULT_KUBECONFIG_PATH,
74
+ help=
75
+ f'Path to save the kubeconfig file (default: {DEFAULT_KUBECONFIG_PATH})'
76
+ )
77
+ parser.add_argument(
78
+ '--use-ssh-config',
79
+ dest='use_ssh_config',
80
+ action='store_true',
81
+ help='Use SSH config for host settings instead of explicit parameters')
82
+ #TODO(romilb): The `sky local up --ips` command is deprecated and these args are now captured in the ssh_node_pools.yaml file.
83
+ # Remove these args after 0.11.0 release.
84
+ parser.add_argument(
85
+ '--ips-file',
86
+ dest='ips_file',
87
+ help=
88
+ '[Deprecated, use --ssh-node-pools-file instead] File containing IP addresses or SSH host entries (one per line)'
89
+ )
90
+ parser.add_argument(
91
+ '--user',
92
+ help=
93
+ '[Deprecated, use --ssh-node-pools-file instead] Username to use for SSH (overridden by SSH config if host exists there)'
94
+ )
95
+ parser.add_argument(
96
+ '--ssh-key',
97
+ dest='ssh_key',
98
+ help=
99
+ '[Deprecated, use --ssh-node-pools-file instead] Path to SSH private key (overridden by SSH config if host exists there)'
100
+ )
101
+ parser.add_argument(
102
+ '--context-name',
103
+ dest='context_name',
104
+ default='default',
105
+ help=
106
+ '[Deprecated, use --ssh-node-pools-file instead] Kubernetes context name'
107
+ )
108
+ parser.add_argument('--cleanup',
109
+ action='store_true',
110
+ help='Clean up the cluster')
111
+ parser.add_argument(
112
+ '--password',
113
+ help='[Deprecated, use --ssh-node-pools-file instead] Password for sudo'
114
+ )
115
+
116
+ return parser.parse_args()
117
+
118
+
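Taken together, a typical invocation of this script looks like `python deploy_remote_cluster.py --ssh-node-pools-file ~/.sky/ssh_node_pools.yaml --infra my-pool` (the pool name here is illustrative, not taken from the source); adding `--cleanup` tears the same Node Pool back down, and the remaining flags only serve the deprecated `sky local up --ips` path.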
119
+ def load_ssh_targets(file_path: str) -> Dict[str, Any]:
120
+ """Load SSH targets from YAML file."""
121
+ if not os.path.exists(file_path):
122
+ with ux_utils.print_exception_no_traceback():
123
+ raise ValueError(f'SSH Node Pools file not found: {file_path}')
124
+
125
+ try:
126
+ with open(file_path, 'r', encoding='utf-8') as f:
127
+ targets = yaml.load(f, Loader=UniqueKeySafeLoader)
128
+ return targets
129
+ except yaml.constructor.ConstructorError as e:
130
+ with ux_utils.print_exception_no_traceback():
131
+ raise ValueError(e.note) from e
132
+ except (yaml.YAMLError, IOError, OSError) as e:
133
+ with ux_utils.print_exception_no_traceback():
134
+ raise ValueError(f'Error loading SSH Node Pools file: {e}') from e
135
+
136
+
137
+ def check_host_in_ssh_config(hostname: str) -> bool:
138
+ """Return True iff *hostname* matches at least one `Host`/`Match` stanza
139
+ in the user's OpenSSH client configuration (including anything pulled in
140
+ via Include).
141
+
142
+ It calls: ssh -vvG <hostname> -o ConnectTimeout=0
143
+ which:
144
+ • -G expands the effective config without connecting
145
+ • -vv prints debug lines that show which stanzas are applied
146
+ • ConnectTimeout=0 avoids a DNS lookup if <hostname> is a FQDN/IP
147
+
148
+ No config files are opened or parsed manually.
149
+
150
+ Parameters
151
+ ----------
152
+ hostname : str
153
+ The alias/IP/FQDN you want to test.
154
+
155
+ Returns
156
+ -------
157
+ bool
158
+ True – a specific stanza matched the host
159
+ False – nothing but the global defaults (`Host *`) applied
160
+ """
161
+ # We direct stderr→stdout because debug output goes to stderr.
162
+ proc = subprocess.run(
163
+ ['ssh', '-vvG', hostname, '-o', 'ConnectTimeout=0'],
164
+ text=True,
165
+ stdout=subprocess.PIPE,
166
+ stderr=subprocess.STDOUT,
167
+ check=False, # we only want the text, not to raise
168
+ )
169
+
170
+ # Look for lines like:
171
+ # debug1: ~/.ssh/config line 42: Applying options for <hostname>
172
+ # Anything other than "*"
173
+ pattern = re.compile(r'^debug\d+: .*Applying options for ([^*].*)$',
174
+ re.MULTILINE)
175
+
176
+ return bool(pattern.search(proc.stdout))
177
+
178
+
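For reference, on a host that has its own stanza in ~/.ssh/config, the `ssh -vvG` output contains a line of the form (an illustrative transcript, not captured from a real run):

    debug1: /home/alice/.ssh/config line 12: Applying options for mynode

while a host matched only by a global `Host *` stanza yields `Applying options for *`, which the leading `[^*]` in the pattern above rejects, so the function returns False.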
179
+ def get_cluster_config(targets: Dict[str, Any],
180
+ cluster_name: Optional[str] = None,
181
+ file_path: Optional[str] = None) -> Dict[str, Any]:
182
+ """Get configuration for specific clusters or all clusters."""
183
+ if not targets:
184
+ with ux_utils.print_exception_no_traceback():
185
+ raise ValueError(
186
+ f'No clusters defined in SSH Node Pools file {file_path}')
187
+
188
+ if cluster_name:
189
+ if cluster_name not in targets:
190
+ with ux_utils.print_exception_no_traceback():
191
+ raise ValueError(f'Cluster {cluster_name!r} not found in '
192
+ f'SSH Node Pools file {file_path}')
193
+ return {cluster_name: targets[cluster_name]}
194
+
195
+ # Return all clusters if no specific cluster is specified
196
+ return targets
197
+
198
+
199
+ def prepare_hosts_info(cluster_name: str,
200
+ cluster_config: Dict[str, Any]) -> List[Dict[str, str]]:
201
+ """Prepare list of hosts with resolved user, identity_file, and password."""
202
+ if 'hosts' not in cluster_config or not cluster_config['hosts']:
203
+ with ux_utils.print_exception_no_traceback():
204
+ raise ValueError(
205
+ f'No hosts defined in cluster {cluster_name} configuration')
206
+
207
+ # Get cluster-level defaults
208
+ cluster_user = cluster_config.get('user', '')
209
+ cluster_identity_file = cluster_config.get('identity_file', '')
210
+ cluster_password = cluster_config.get('password', '')
211
+
212
+ hosts_info = []
213
+ for host in cluster_config['hosts']:
214
+ # Host can be a string (IP or SSH config hostname) or a dict
215
+ if isinstance(host, str):
216
+ # Check if this is an SSH config hostname
217
+ is_ssh_config_host = check_host_in_ssh_config(host)
218
+
219
+ hosts_info.append({
220
+ 'ip': host,
221
+ 'user': '' if is_ssh_config_host else cluster_user,
222
+ 'identity_file': '' if is_ssh_config_host else
223
+ cluster_identity_file,
224
+ 'password': cluster_password,
225
+ 'use_ssh_config': is_ssh_config_host
226
+ })
227
+ else:
228
+ # It's a dict with potential overrides
229
+ if 'ip' not in host:
230
+ print(
231
+ f'{RED}Warning: Host missing \'ip\' field, skipping: {host}{NC}'
232
+ )
233
+ continue
234
+
235
+ # Check if this is an SSH config hostname
236
+ is_ssh_config_host = check_host_in_ssh_config(host['ip'])
237
+
238
+ # Use host-specific values or fall back to cluster defaults
239
+ host_user = '' if is_ssh_config_host else host.get(
240
+ 'user', cluster_user)
241
+ host_identity_file = '' if is_ssh_config_host else host.get(
242
+ 'identity_file', cluster_identity_file)
243
+ host_password = host.get('password', cluster_password)
244
+
245
+ hosts_info.append({
246
+ 'ip': host['ip'],
247
+ 'user': host_user,
248
+ 'identity_file': host_identity_file,
249
+ 'password': host_password,
250
+ 'use_ssh_config': is_ssh_config_host
251
+ })
252
+
253
+ return hosts_info
254
+
255
+
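To make the expected schema concrete, here is a sketch (pool name, user, paths and IPs are all hypothetical) of a Node Pools entry as the Python object that load_ssh_targets() returns and prepare_hosts_info() consumes; the same shape is what lives in ~/.sky/ssh_node_pools.yaml:

    example_targets = {
        'my-pool': {
            # Cluster-level defaults, applied to hosts that neither override
            # them nor resolve via ~/.ssh/config.
            'user': 'ubuntu',
            'identity_file': '~/.ssh/id_rsa',
            'hosts': [
                '10.0.0.1',                          # plain IP or SSH alias
                {'ip': '10.0.0.2', 'user': 'root'},  # per-host override
            ],
        },
    }
    hosts_info = prepare_hosts_info('my-pool', example_targets['my-pool'])

The first entry in hosts_info becomes the head node and the rest become workers, as main() does further below.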
256
+ def run_command(cmd, shell=False):
257
+ """Run a local command and return the output."""
258
+ process = subprocess.run(cmd,
259
+ shell=shell,
260
+ capture_output=True,
261
+ text=True,
262
+ check=False)
263
+ if process.returncode != 0:
264
+ print(f'{RED}Error executing command: {cmd}{NC}')
265
+ print(f'STDOUT: {process.stdout}')
266
+ print(f'STDERR: {process.stderr}')
267
+ return None
268
+ return process.stdout.strip()
269
+
270
+
271
+ def get_effective_host_ip(hostname: str) -> str:
272
+ """Get the effective IP for a hostname from SSH config."""
273
+ try:
274
+ result = subprocess.run(['ssh', '-G', hostname],
275
+ capture_output=True,
276
+ text=True,
277
+ check=False)
278
+ if result.returncode == 0:
279
+ for line in result.stdout.splitlines():
280
+ if line.startswith('hostname '):
281
+ return line.split(' ', 1)[1].strip()
282
+ except Exception: # pylint: disable=broad-except
283
+ pass
284
+ return hostname # Return the original hostname if lookup fails
285
+
286
+
287
+ def run_remote(node,
288
+ cmd,
289
+ user='',
290
+ ssh_key='',
291
+ connect_timeout=30,
292
+ use_ssh_config=False,
293
+ print_output=False,
294
+ use_shell=False):
295
+ """Run a command on a remote machine via SSH."""
296
+ if use_ssh_config:
297
+ # Use SSH config for connection parameters
298
+ ssh_cmd = ['ssh', node, cmd]
299
+ else:
300
+ # Use explicit parameters
301
+ ssh_cmd = [
302
+ 'ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'IdentitiesOnly=yes',
303
+ '-o', f'ConnectTimeout={connect_timeout}', '-o',
304
+ 'ServerAliveInterval=10', '-o', 'ServerAliveCountMax=3'
305
+ ]
306
+
307
+ if ssh_key:
308
+ ssh_cmd.extend(['-i', ssh_key])
309
+
310
+ ssh_cmd.append(f'{user}@{node}' if user else node)
311
+ ssh_cmd.append(cmd)
312
+
313
+ if use_shell:
314
+ ssh_cmd = ' '.join(ssh_cmd)
315
+
316
+ process = subprocess.run(ssh_cmd,
317
+ capture_output=True,
318
+ text=True,
319
+ check=False,
320
+ shell=use_shell)
321
+ if process.returncode != 0:
322
+ print(f'{RED}Error executing command {cmd} on {node}:{NC}')
323
+ print(f'STDERR: {process.stderr}')
324
+ return None
325
+ if print_output:
326
+ print(process.stdout)
327
+ return process.stdout.strip()
328
+
329
+
330
+ def create_askpass_script(password):
331
+ """Create an askpass script block for sudo with password."""
332
+ if not password:
333
+ return ''
334
+
335
+ return f"""
336
+ # Create temporary askpass script
337
+ ASKPASS_SCRIPT=$(mktemp)
338
+ trap 'rm -f $ASKPASS_SCRIPT' EXIT INT TERM ERR QUIT
339
+ cat > $ASKPASS_SCRIPT << EOF
340
+ #!/bin/bash
341
+ echo {password}
342
+ EOF
343
+ chmod 700 $ASKPASS_SCRIPT
344
+ # Use askpass
345
+ export SUDO_ASKPASS=$ASKPASS_SCRIPT
346
+ """
347
+
348
+
349
+ def progress_message(message):
350
+ """Show a progress message."""
351
+ print(f'{YELLOW}➜ {message}{NC}')
352
+
353
+
354
+ def success_message(message):
355
+ """Show a success message."""
356
+ print(f'{GREEN}✔ {message}{NC}')
357
+
358
+
359
+ def cleanup_server_node(node,
360
+ user,
361
+ ssh_key,
362
+ askpass_block,
363
+ use_ssh_config=False):
364
+ """Uninstall k3s and clean up the state on a server node."""
365
+ print(f'{YELLOW}Cleaning up head node {node}...{NC}')
366
+ cmd = f"""
367
+ {askpass_block}
368
+ echo 'Uninstalling k3s...' &&
369
+ sudo -A /usr/local/bin/k3s-uninstall.sh || true &&
370
+ sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
371
+ """
372
+ result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
373
+ if result is None:
374
+ print(f'{RED}Failed to clean up head node ({node}).{NC}')
375
+ else:
376
+ success_message(f'Node {node} cleaned up successfully.')
377
+
378
+
379
+ def cleanup_agent_node(node,
380
+ user,
381
+ ssh_key,
382
+ askpass_block,
383
+ use_ssh_config=False):
384
+ """Uninstall k3s and clean up the state on an agent node."""
385
+ print(f'{YELLOW}Cleaning up worker node {node}...{NC}')
386
+ cmd = f"""
387
+ {askpass_block}
388
+ echo 'Uninstalling k3s...' &&
389
+ sudo -A /usr/local/bin/k3s-agent-uninstall.sh || true &&
390
+ sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
391
+ """
392
+ result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
393
+ if result is None:
394
+ print(f'{RED}Failed to clean up worker node ({node}).{NC}')
395
+ else:
396
+ success_message(f'Node {node} cleaned up successfully.')
397
+
398
+
399
+ def start_agent_node(node,
400
+ master_addr,
401
+ k3s_token,
402
+ user,
403
+ ssh_key,
404
+ askpass_block,
405
+ use_ssh_config=False):
406
+ """Start a k3s agent node.
407
+ Returns: the node, whether the start succeeded, and whether a GPU was detected."""
408
+ cmd = f"""
409
+ {askpass_block}
410
+ curl -sfL https://get.k3s.io | K3S_NODE_NAME={node} INSTALL_K3S_EXEC='agent --node-label skypilot-ip={node}' \
411
+ K3S_URL=https://{master_addr}:6443 K3S_TOKEN={k3s_token} sudo -E -A sh -
412
+ """
413
+ result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
414
+ if result is None:
415
+ print(f'{RED}Failed to deploy K3s on worker node ({node}).{NC}')
416
+ return node, False, False
417
+ success_message(f'Kubernetes deployed on worker node ({node}).')
418
+ # Check if worker node has a GPU
419
+ if check_gpu(node, user, ssh_key, use_ssh_config=use_ssh_config):
420
+ print(f'{YELLOW}GPU detected on worker node ({node}).{NC}')
421
+ return node, True, True
422
+ return node, True, False
423
+
424
+
425
+ def check_gpu(node, user, ssh_key, use_ssh_config=False):
426
+ """Check if a node has a GPU."""
427
+ cmd = 'command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null'
428
+ result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
429
+ return result is not None
430
+
431
+
432
+ def ensure_directory_exists(path):
433
+ """Ensure the directory for the specified file path exists."""
434
+ directory = os.path.dirname(path)
435
+ if directory and not os.path.exists(directory):
436
+ os.makedirs(directory, exist_ok=True)
437
+
438
+
439
+ def get_used_localhost_ports() -> Set[int]:
440
+ """Get SSH port forwardings already in use on localhost"""
441
+ used_ports = set()
442
+
443
+ # Get ports from netstat (works on macOS and Linux)
444
+ try:
445
+ if sys.platform == 'darwin':
446
+ # macOS
447
+ result = subprocess.run(['netstat', '-an', '-p', 'tcp'],
448
+ capture_output=True,
449
+ text=True,
450
+ check=False)
451
+ else:
452
+ # Linux and other Unix-like systems
453
+ result = subprocess.run(['netstat', '-tln'],
454
+ capture_output=True,
455
+ text=True,
456
+ check=False)
457
+
458
+ if result.returncode == 0:
459
+ # Look for lines with 'localhost:<port>' or '127.0.0.1:<port>'
460
+ for line in result.stdout.splitlines():
461
+ if '127.0.0.1:' in line or 'localhost:' in line:
462
+ match = re.search(r':(64\d\d)\s', line)
463
+ if match:
464
+ port = int(match.group(1))
465
+ if 6400 <= port <= 6500: # Only consider our range
466
+ used_ports.add(port)
467
+ except (subprocess.SubprocessError, FileNotFoundError):
468
+ # If netstat fails, try another approach
469
+ pass
470
+
471
+ # Also check ports from existing kubeconfig entries
472
+ try:
473
+ result = subprocess.run([
474
+ 'kubectl', 'config', 'view', '-o',
475
+ 'jsonpath=\'{.clusters[*].cluster.server}\''
476
+ ],
477
+ capture_output=True,
478
+ text=True,
479
+ check=False)
480
+
481
+ if result.returncode == 0:
482
+ # Look for localhost URLs with ports
483
+ for url in result.stdout.split():
484
+ if 'localhost:' in url or '127.0.0.1:' in url:
485
+ match = re.search(r':(\d+)', url)
486
+ if match:
487
+ port = int(match.group(1))
488
+ if 6400 <= port <= 6500: # Only consider our range
489
+ used_ports.add(port)
490
+ except subprocess.SubprocessError:
491
+ pass
492
+
493
+ return used_ports
494
+
495
+
496
+ def get_available_port(start: int = 6443, end: int = 6499) -> int:
497
+ """Get an available port in the given range that's not used by other tunnels"""
498
+ used_ports = get_used_localhost_ports()
499
+
500
+ # Try to use port 6443 first if available for the first cluster
501
+ if start == 6443 and start not in used_ports:
502
+ return start
503
+
504
+ # Otherwise find any available port in the range
505
+ available_ports = list(set(range(start, end + 1)) - used_ports)
506
+
507
+ if not available_ports:
508
+ # If all ports are used, pick a random one from our range
509
+ # (we'll terminate any existing connection in the setup)
510
+ return random.randint(start, end)
511
+
512
+ # Sort to get deterministic allocation
513
+ available_ports.sort()
514
+ return available_ports[0]
515
+
516
+
517
+ def setup_kubectl_ssh_tunnel(head_node,
518
+ ssh_user,
519
+ ssh_key,
520
+ context_name,
521
+ use_ssh_config=False):
522
+ """Set up kubeconfig exec credential plugin for SSH tunnel"""
523
+ progress_message('Setting up SSH tunnel for Kubernetes API access...')
524
+
525
+ # Get an available port for this cluster
526
+ port = get_available_port()
527
+
528
+ # Paths to scripts
529
+ tunnel_script = os.path.join(SCRIPT_DIR, 'ssh-tunnel.sh')
530
+
531
+ # Make sure scripts are executable
532
+ os.chmod(tunnel_script, 0o755)
533
+
534
+ # Certificate files
535
+ client_cert_file = os.path.join(NODE_POOLS_INFO_DIR,
536
+ f'{context_name}-cert.pem')
537
+ client_key_file = os.path.join(NODE_POOLS_INFO_DIR,
538
+ f'{context_name}-key.pem')
539
+
540
+ # Update kubeconfig to use localhost with the selected port
541
+ run_command([
542
+ 'kubectl', 'config', 'set-cluster', context_name,
543
+ f'--server=https://127.0.0.1:{port}', '--insecure-skip-tls-verify=true'
544
+ ])
545
+
546
+ # Build the exec args list based on auth method
547
+ exec_args = [
548
+ '--exec-command', tunnel_script, '--exec-api-version',
549
+ 'client.authentication.k8s.io/v1beta1'
550
+ ]
551
+
552
+ # Set credential TTL to force frequent tunnel checks
553
+ ttl_seconds = 30
554
+
555
+ # Verify if we have extracted certificate data files
556
+ has_cert_files = os.path.isfile(client_cert_file) and os.path.isfile(
557
+ client_key_file)
558
+ if has_cert_files:
559
+ print(
560
+ f'{GREEN}Client certificate data extracted and will be used for authentication{NC}'
561
+ )
562
+
563
+ if use_ssh_config:
564
+ run_command(
565
+ ['kubectl', 'config', 'set-credentials', context_name] + exec_args +
566
+ [
567
+ '--exec-arg=--context', f'--exec-arg={context_name}',
568
+ '--exec-arg=--port', f'--exec-arg={port}', '--exec-arg=--ttl',
569
+ f'--exec-arg={ttl_seconds}', '--exec-arg=--use-ssh-config',
570
+ '--exec-arg=--host', f'--exec-arg={head_node}'
571
+ ])
572
+ else:
573
+ run_command(['kubectl', 'config', 'set-credentials', context_name] +
574
+ exec_args + [
575
+ '--exec-arg=--context', f'--exec-arg={context_name}',
576
+ '--exec-arg=--port', f'--exec-arg={port}',
577
+ '--exec-arg=--ttl', f'--exec-arg={ttl_seconds}',
578
+ '--exec-arg=--host', f'--exec-arg={head_node}',
579
+ '--exec-arg=--user', f'--exec-arg={ssh_user}',
580
+ '--exec-arg=--ssh-key', f'--exec-arg={ssh_key}'
581
+ ])
582
+
583
+ success_message(
584
+ f'SSH tunnel configured through kubectl credential plugin on port {port}'
585
+ )
586
+ print(
587
+ f'{GREEN}Your kubectl connection is now tunneled through SSH (port {port}).{NC}'
588
+ )
589
+ print(
590
+ f'{GREEN}This tunnel will be automatically established when needed.{NC}'
591
+ )
592
+ print(
593
+ f'{GREEN}Credential TTL set to {ttl_seconds}s to ensure tunnel health is checked frequently.{NC}'
594
+ )
595
+
596
+ return port
597
+
598
+
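The net effect of the set-cluster/set-credentials calls above: the cluster's server is rewritten to https://127.0.0.1:&lt;port&gt;, and the user entry becomes a client-go exec credential plugin that re-runs ssh-tunnel.sh on every kubectl call. Roughly, when the rewritten kubeconfig is parsed with yaml.safe_load(), the user entry is shaped like this (a sketch of how kubectl stores exec plugins; values are illustrative):

    {'name': context_name,
     'user': {'exec': {'apiVersion': 'client.authentication.k8s.io/v1beta1',
                       'command': '<SCRIPT_DIR>/ssh-tunnel.sh',
                       'args': ['--context', context_name, '--port', '6443',
                                '--ttl', '30', '--host', head_node]}}}

so the tunnel is checked (and re-established if needed) whenever credentials expire, which is why the TTL is kept as short as 30 seconds.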
599
+ def cleanup_kubectl_ssh_tunnel(context_name):
600
+ """Clean up the SSH tunnel for a specific context"""
601
+ progress_message(f'Cleaning up SSH tunnel for context {context_name}...')
602
+
603
+ # Path to cleanup script
604
+ cleanup_script = os.path.join(SCRIPT_DIR, 'cleanup-tunnel.sh')
605
+
606
+ # Make sure script is executable
607
+ if os.path.exists(cleanup_script):
608
+ os.chmod(cleanup_script, 0o755)
609
+
610
+ # Run the cleanup script
611
+ subprocess.run([cleanup_script, context_name],
612
+ stdout=subprocess.DEVNULL,
613
+ stderr=subprocess.DEVNULL,
614
+ check=False)
615
+
616
+ success_message(f'SSH tunnel for context {context_name} cleaned up')
617
+ else:
618
+ print(f'{YELLOW}Cleanup script not found: {cleanup_script}{NC}')
619
+
620
+
621
+ def main():
622
+ args = parse_args()
623
+
624
+ kubeconfig_path = os.path.expanduser(args.kubeconfig_path)
625
+ global_use_ssh_config = args.use_ssh_config
626
+
627
+ # Print cleanup mode marker if applicable
628
+ if args.cleanup:
629
+ print('SKYPILOT_CLEANUP_MODE: Cleanup mode activated')
630
+
631
+ # Check if using YAML configuration or command line arguments
632
+ if args.ips_file:
633
+ # Using command line arguments - legacy mode
634
+ if args.ssh_key and not os.path.isfile(
635
+ args.ssh_key) and not global_use_ssh_config:
636
+ with ux_utils.print_exception_no_traceback():
637
+ raise ValueError(f'SSH key not found: {args.ssh_key}')
638
+
639
+ if not os.path.isfile(args.ips_file):
640
+ with ux_utils.print_exception_no_traceback():
641
+ raise ValueError(f'IPs file not found: {args.ips_file}')
642
+
643
+ with open(args.ips_file, 'r', encoding='utf-8') as f:
644
+ hosts = [line.strip() for line in f if line.strip()]
645
+
646
+ if not hosts:
647
+ with ux_utils.print_exception_no_traceback():
648
+ raise ValueError(
649
+ 'Hosts file is empty or not formatted correctly.')
650
+
651
+ head_node = hosts[0]
652
+ worker_nodes = hosts[1:]
653
+ ssh_user = args.user if not global_use_ssh_config else ''
654
+ ssh_key = args.ssh_key if not global_use_ssh_config else ''
655
+ context_name = args.context_name
656
+ password = args.password
657
+
658
+ # Check if hosts are in SSH config
659
+ head_use_ssh_config = global_use_ssh_config or check_host_in_ssh_config(
660
+ head_node)
661
+ worker_use_ssh_config = [
662
+ global_use_ssh_config or check_host_in_ssh_config(node)
663
+ for node in worker_nodes
664
+ ]
665
+
666
+ # Single cluster deployment for legacy mode
667
+ deploy_cluster(head_node, worker_nodes, ssh_user, ssh_key, context_name,
668
+ password, head_use_ssh_config, worker_use_ssh_config,
669
+ kubeconfig_path, args.cleanup)
670
+ else:
671
+ # Using YAML configuration
672
+ targets = load_ssh_targets(args.ssh_node_pools_file)
673
+ clusters_config = get_cluster_config(targets,
674
+ args.infra,
675
+ file_path=args.ssh_node_pools_file)
676
+
677
+ # Print information about clusters being processed
678
+ num_clusters = len(clusters_config)
679
+ cluster_names = list(clusters_config.keys())
680
+ cluster_info = f'Found {num_clusters} Node Pool{"s" if num_clusters > 1 else ""}: {", ".join(cluster_names)}'
681
+ print(f'SKYPILOT_CLUSTER_INFO: {cluster_info}')
682
+
683
+ # Process each cluster
684
+ for cluster_name, cluster_config in clusters_config.items():
685
+ try:
686
+ print(f'SKYPILOT_CURRENT_CLUSTER: {cluster_name}')
687
+ print(
688
+ f'{YELLOW}==== Deploying cluster: {cluster_name} ===={NC}')
689
+ hosts_info = prepare_hosts_info(cluster_name, cluster_config)
690
+
691
+ if not hosts_info:
692
+ print(
693
+ f'{RED}Error: No valid hosts found for cluster {cluster_name!r}. Skipping.{NC}'
694
+ )
695
+ continue
696
+
697
+ # Generate a unique context name for each cluster
698
+ context_name = args.context_name
699
+ if context_name == 'default':
700
+ context_name = 'ssh-' + cluster_name
701
+
702
+ # Check cluster history
703
+ os.makedirs(NODE_POOLS_INFO_DIR, exist_ok=True)
704
+ history_yaml_file = os.path.join(
705
+ NODE_POOLS_INFO_DIR, f'{context_name}-history.yaml')
706
+
707
+ history = None
708
+ if os.path.exists(history_yaml_file):
709
+ print(
710
+ f'{YELLOW}Loading history from {history_yaml_file}{NC}')
711
+ with open(history_yaml_file, 'r', encoding='utf-8') as f:
712
+ history = yaml.safe_load(f)
713
+ else:
714
+ print(f'{YELLOW}No history found for {context_name}.{NC}')
715
+
716
+ history_workers_info = None
717
+ history_worker_nodes = None
718
+ history_use_ssh_config = None
719
+ # Do not support changing anything besides hosts for now
720
+ if history is not None:
721
+ for key in ['user', 'identity_file', 'password']:
722
+ if history.get(key) != cluster_config.get(key):
723
+ raise ValueError(
724
+ f'Cluster configuration has changed for field {key!r}. '
725
+ f'Previous value: {history.get(key)}, '
726
+ f'Current value: {cluster_config.get(key)}')
727
+ history_hosts_info = prepare_hosts_info(
728
+ cluster_name, history)
729
+ if history_hosts_info[0] != hosts_info[0]:
730
+ raise ValueError(
731
+ f'Cluster configuration has changed for master node. '
732
+ f'Previous value: {history_hosts_info[0]}, '
733
+ f'Current value: {hosts_info[0]}')
734
+ history_workers_info = history_hosts_info[1:] if len(
735
+ history_hosts_info) > 1 else []
736
+ history_worker_nodes = [
737
+ h['ip'] for h in history_workers_info
738
+ ]
739
+ history_use_ssh_config = [
740
+ h.get('use_ssh_config', False)
741
+ for h in history_workers_info
742
+ ]
743
+
744
+ # Use the first host as the head node and the rest as worker nodes
745
+ head_host = hosts_info[0]
746
+ worker_hosts = hosts_info[1:] if len(hosts_info) > 1 else []
747
+
748
+ head_node = head_host['ip']
749
+ worker_nodes = [h['ip'] for h in worker_hosts]
750
+ ssh_user = head_host['user']
751
+ ssh_key = head_host['identity_file']
752
+ head_use_ssh_config = global_use_ssh_config or head_host.get(
753
+ 'use_ssh_config', False)
754
+ worker_use_ssh_config = [
755
+ global_use_ssh_config or h.get('use_ssh_config', False)
756
+ for h in worker_hosts
757
+ ]
758
+ password = head_host['password']
759
+
760
+ # Deploy this cluster
761
+ unsuccessful_workers = deploy_cluster(
762
+ head_node,
763
+ worker_nodes,
764
+ ssh_user,
765
+ ssh_key,
766
+ context_name,
767
+ password,
768
+ head_use_ssh_config,
769
+ worker_use_ssh_config,
770
+ kubeconfig_path,
771
+ args.cleanup,
772
+ worker_hosts=worker_hosts,
773
+ history_worker_nodes=history_worker_nodes,
774
+ history_workers_info=history_workers_info,
775
+ history_use_ssh_config=history_use_ssh_config)
776
+
777
+ if not args.cleanup:
778
+ successful_hosts = []
779
+ for host in cluster_config['hosts']:
780
+ if isinstance(host, str):
781
+ host_node = host
782
+ else:
783
+ host_node = host['ip']
784
+ if host_node not in unsuccessful_workers:
785
+ successful_hosts.append(host)
786
+ cluster_config['hosts'] = successful_hosts
787
+ with open(history_yaml_file, 'w', encoding='utf-8') as f:
788
+ print(
789
+ f'{YELLOW}Writing history to {history_yaml_file}{NC}'
790
+ )
791
+ yaml.dump(cluster_config, f)
792
+
793
+ print(
794
+ f'{GREEN}==== Completed deployment for cluster: {cluster_name} ===={NC}'
795
+ )
796
+ except Exception as e: # pylint: disable=broad-except
797
+ print(
798
+ f'{RED}Error deploying SSH Node Pool {cluster_name}: {e}{NC}'
799
+ )
800
+
801
+
802
+ def deploy_cluster(head_node,
803
+ worker_nodes,
804
+ ssh_user,
805
+ ssh_key,
806
+ context_name,
807
+ password,
808
+ head_use_ssh_config,
809
+ worker_use_ssh_config,
810
+ kubeconfig_path,
811
+ cleanup,
812
+ worker_hosts=None,
813
+ history_worker_nodes=None,
814
+ history_workers_info=None,
815
+ history_use_ssh_config=None) -> List[str]:
816
+ """Deploy or clean up a single Kubernetes cluster.
817
+
818
+ Returns: List of unsuccessful worker nodes.
819
+ """
820
+ # Ensure SSH key is expanded for paths with ~ (home directory)
821
+ if ssh_key:
822
+ ssh_key = os.path.expanduser(ssh_key)
823
+
824
+ history_yaml_file = os.path.join(NODE_POOLS_INFO_DIR,
825
+ f'{context_name}-history.yaml')
826
+ cert_file_path = os.path.join(NODE_POOLS_INFO_DIR,
827
+ f'{context_name}-cert.pem')
828
+ key_file_path = os.path.join(NODE_POOLS_INFO_DIR, f'{context_name}-key.pem')
829
+ tunnel_log_file_path = os.path.join(NODE_POOLS_INFO_DIR,
830
+ f'{context_name}-tunnel.log')
831
+
832
+ # Generate the askpass block if password is provided
833
+ askpass_block = create_askpass_script(password)
834
+
835
+ # Token for k3s
836
+ k3s_token = 'mytoken' # Any string can be used as the token
837
+
838
+ # Pre-flight checks
839
+ print(f'{YELLOW}Checking SSH connection to head node...{NC}')
840
+ result = run_remote(
841
+ head_node,
842
+ f'echo \'SSH connection successful ({head_node})\'',
843
+ ssh_user,
844
+ ssh_key,
845
+ use_ssh_config=head_use_ssh_config,
846
+ # For SkySSHUpLineProcessor
847
+ print_output=True)
848
+ if result is None:
849
+ with ux_utils.print_exception_no_traceback():
850
+ raise RuntimeError(f'Failed to SSH to head node ({head_node}). '
851
+ f'Please check the SSH configuration.')
852
+
853
+ # Checking history
854
+ history_exists = (history_worker_nodes is not None and
855
+ history_workers_info is not None and
856
+ history_use_ssh_config is not None)
857
+
858
+ # Cleanup history worker nodes
859
+ worker_nodes_to_cleanup = []
860
+ remove_worker_cmds = []
861
+ if history_exists:
862
+ for history_node, history_info, use_ssh_config in zip(
863
+ history_worker_nodes, history_workers_info,
864
+ history_use_ssh_config):
865
+ if worker_hosts is not None and history_info not in worker_hosts:
866
+ print(
867
+ f'{YELLOW}Worker node {history_node} not found in YAML config. '
868
+ f'Removing from history...{NC}')
869
+ worker_nodes_to_cleanup.append(
870
+ dict(
871
+ node=history_node,
872
+ user=ssh_user
873
+ if history_info is None else history_info['user'],
874
+ ssh_key=ssh_key if history_info is None else
875
+ history_info['identity_file'],
876
+ askpass_block=(askpass_block if history_info is None
877
+ else create_askpass_script(
878
+ history_info['password'])),
879
+ use_ssh_config=use_ssh_config,
880
+ ))
881
+ remove_worker_cmds.append(
882
+ f'kubectl delete node -l skypilot-ip={history_node}')
883
+ # If this is a create operation and a stale tunnel log exists,
884
+ # remove it so a fresh log file is started for this run.
885
+ if not cleanup and os.path.exists(tunnel_log_file_path):
886
+ os.remove(tunnel_log_file_path)
887
+
888
+ # If --cleanup flag is set, uninstall k3s and exit
889
+ if cleanup:
890
+ # Pick up all nodes
891
+ worker_nodes_to_cleanup.clear()
892
+ for node, info, use_ssh_config in zip(worker_nodes, worker_hosts,
893
+ worker_use_ssh_config):
894
+ worker_nodes_to_cleanup.append(
895
+ dict(
896
+ node=node,
897
+ user=ssh_user if info is None else info['user'],
898
+ ssh_key=ssh_key if info is None else info['identity_file'],
899
+ askpass_block=(askpass_block if info is None else
900
+ create_askpass_script(info['password'])),
901
+ use_ssh_config=use_ssh_config,
902
+ ))
903
+
904
+ print(f'{YELLOW}Starting cleanup...{NC}')
905
+
906
+ # Clean up head node
907
+ cleanup_server_node(head_node,
908
+ ssh_user,
909
+ ssh_key,
910
+ askpass_block,
911
+ use_ssh_config=head_use_ssh_config)
912
+ # Clean up worker nodes
913
+ with cf.ThreadPoolExecutor() as executor:
914
+ executor.map(lambda kwargs: cleanup_agent_node(**kwargs),
915
+ worker_nodes_to_cleanup)
916
+
917
+ with cf.ThreadPoolExecutor() as executor:
918
+
919
+ def run_cleanup_cmd(cmd):
920
+ print('Cleaning up worker nodes:', cmd)
921
+ run_command(cmd, shell=True)
922
+
923
+ executor.map(run_cleanup_cmd, remove_worker_cmds)
924
+
925
+ if cleanup:
926
+
927
+ # Remove the context from local kubeconfig if it exists
928
+ if os.path.isfile(kubeconfig_path):
929
+ progress_message(
930
+ f'Removing context {context_name!r} from local kubeconfig...')
931
+ run_command(['kubectl', 'config', 'delete-context', context_name],
932
+ shell=False)
933
+ run_command(['kubectl', 'config', 'delete-cluster', context_name],
934
+ shell=False)
935
+ run_command(['kubectl', 'config', 'delete-user', context_name],
936
+ shell=False)
937
+
938
+ # Update the current context to the first available context
939
+ contexts = run_command([
940
+ 'kubectl', 'config', 'view', '-o',
941
+ 'jsonpath=\'{.contexts[0].name}\''
942
+ ],
943
+ shell=False)
944
+ if contexts:
945
+ run_command(['kubectl', 'config', 'use-context', contexts],
946
+ shell=False)
947
+ else:
948
+ # If no context is available, simply unset the current context
949
+ run_command(['kubectl', 'config', 'unset', 'current-context'],
950
+ shell=False)
951
+
952
+ success_message(
953
+ f'Context {context_name!r} removed from local kubeconfig.')
954
+
955
+ for file in [history_yaml_file, cert_file_path, key_file_path]:
956
+ if os.path.exists(file):
957
+ os.remove(file)
958
+
959
+ # Clean up the SSH tunnel after the kubeconfig, because any kubectl call
960
+ # would otherwise restart the tunnel if it is not running.
961
+ cleanup_kubectl_ssh_tunnel(context_name)
962
+
963
+ print(f'{GREEN}Cleanup completed successfully.{NC}')
964
+
965
+ # Print completion marker for current cluster
966
+ print(f'{GREEN}SKYPILOT_CLUSTER_COMPLETED: {NC}')
967
+
968
+ return []
969
+
970
+ print(f'{YELLOW}Checking TCP Forwarding Options...{NC}')
971
+ cmd = (
972
+ 'if [ "$(sudo sshd -T | grep allowtcpforwarding)" = "allowtcpforwarding yes" ]; then '
973
+ f'echo "TCP Forwarding already enabled on head node ({head_node})."; '
974
+ 'else '
975
+ 'sudo sed -i \'s/^#\?\s*AllowTcpForwarding.*/AllowTcpForwarding yes/\' ' # pylint: disable=anomalous-backslash-in-string
976
+ '/etc/ssh/sshd_config && sudo systemctl restart sshd && '
977
+ f'echo "Successfully enabled TCP Forwarding on head node ({head_node})."; '
978
+ 'fi')
979
+ result = run_remote(
980
+ head_node,
981
+ shlex.quote(cmd),
982
+ ssh_user,
983
+ ssh_key,
984
+ use_ssh_config=head_use_ssh_config,
985
+ # For SkySSHUpLineProcessor
986
+ print_output=True,
987
+ use_shell=True)
988
+ if result is None:
989
+ with ux_utils.print_exception_no_traceback():
990
+ raise RuntimeError(
991
+ f'Failed to setup TCP forwarding on head node ({head_node}). '
992
+ f'Please check the SSH configuration.')
993
+
994
+ # Get effective IP for master node if using SSH config - needed for workers to connect
995
+ if head_use_ssh_config:
996
+ effective_master_ip = get_effective_host_ip(head_node)
997
+ print(
998
+ f'{GREEN}Resolved head node {head_node} to {effective_master_ip} from SSH config{NC}'
999
+ )
1000
+ else:
1001
+ effective_master_ip = head_node
1002
+
1003
+ # Step 1: Install k3s on the head node
1004
+ # Check if head node has a GPU
1005
+ install_gpu = False
1006
+ progress_message(f'Deploying Kubernetes on head node ({head_node})...')
1007
+ cmd = f"""
1008
+ {askpass_block}
1009
+ curl -sfL https://get.k3s.io | K3S_TOKEN={k3s_token} K3S_NODE_NAME={head_node} sudo -E -A sh - &&
1010
+ mkdir -p ~/.kube &&
1011
+ sudo -A cp /etc/rancher/k3s/k3s.yaml ~/.kube/config &&
1012
+ sudo -A chown $(id -u):$(id -g) ~/.kube/config &&
1013
+ for i in {{1..3}}; do
1014
+ if kubectl wait --for=condition=ready node --all --timeout=2m --kubeconfig ~/.kube/config; then
1015
+ break
1016
+ else
1017
+ echo 'Waiting for nodes to be ready...'
1018
+ sleep 5
1019
+ fi
1020
+ done
1021
+ if [ $i -eq 3 ]; then
1022
+ echo 'Failed to wait for nodes to be ready after 3 attempts'
1023
+ exit 1
1024
+ fi
1025
+ """
1026
+ result = run_remote(head_node,
1027
+ cmd,
1028
+ ssh_user,
1029
+ ssh_key,
1030
+ use_ssh_config=head_use_ssh_config)
1031
+ if result is None:
1032
+ with ux_utils.print_exception_no_traceback():
1033
+ raise RuntimeError(
1034
+ f'Failed to deploy K3s on head node ({head_node}).')
1035
+ success_message(f'K3s deployed on head node ({head_node}).')
1036
+
1037
+ # Check if head node has a GPU
1038
+ install_gpu = False
1039
+ if check_gpu(head_node,
1040
+ ssh_user,
1041
+ ssh_key,
1042
+ use_ssh_config=head_use_ssh_config):
1043
+ print(f'{YELLOW}GPU detected on head node ({head_node}).{NC}')
1044
+ install_gpu = True
1045
+
1046
+ # Fetch the head node's internal IP (this will be passed to worker nodes)
1047
+ master_addr = run_remote(head_node,
1048
+ 'hostname -I | awk \'{print $1}\'',
1049
+ ssh_user,
1050
+ ssh_key,
1051
+ use_ssh_config=head_use_ssh_config)
1052
+ if master_addr is None:
1053
+ with ux_utils.print_exception_no_traceback():
1054
+ raise RuntimeError(f'Failed to SSH to head node ({head_node}). '
1055
+ f'Please check the SSH configuration.')
1056
+ print(f'{GREEN}Master node internal IP: {master_addr}{NC}')
1057
+
1058
+ # Step 2: Install k3s on worker nodes and join them to the master node
1059
+ def deploy_worker(args):
1060
+ (i, node, worker_hosts, history_workers_info, ssh_user, ssh_key,
1061
+ askpass_block, worker_use_ssh_config, master_addr, k3s_token) = args
1062
+ progress_message(f'Deploying Kubernetes on worker node ({node})...')
1063
+
1064
+ # If using YAML config with specific worker info
1065
+ if worker_hosts and i < len(worker_hosts):
1066
+ if history_workers_info is not None and worker_hosts[
1067
+ i] in history_workers_info:
1068
+ print(
1069
+ f'{YELLOW}Worker node ({node}) already exists in history. '
1070
+ f'Skipping...{NC}')
1071
+ return node, True, False
1072
+ worker_user = worker_hosts[i]['user']
1073
+ worker_key = worker_hosts[i]['identity_file']
1074
+ worker_password = worker_hosts[i]['password']
1075
+ worker_askpass = create_askpass_script(worker_password)
1076
+ worker_config = worker_use_ssh_config[i]
1077
+ else:
1078
+ worker_user = ssh_user
1079
+ worker_key = ssh_key
1080
+ worker_askpass = askpass_block
1081
+ worker_config = worker_use_ssh_config[i]
1082
+
1083
+ return start_agent_node(node,
1084
+ master_addr,
1085
+ k3s_token,
1086
+ worker_user,
1087
+ worker_key,
1088
+ worker_askpass,
1089
+ use_ssh_config=worker_config)
1090
+
1091
+ unsuccessful_workers = []
1092
+
1093
+ # Deploy workers in parallel using thread pool
1094
+ with cf.ThreadPoolExecutor() as executor:
1095
+ futures = []
1096
+ for i, node in enumerate(worker_nodes):
1097
+ args = (i, node, worker_hosts, history_workers_info, ssh_user,
1098
+ ssh_key, askpass_block, worker_use_ssh_config, master_addr,
1099
+ k3s_token)
1100
+ futures.append(executor.submit(deploy_worker, args))
1101
+
1102
+ # Check if worker node has a GPU
1103
+ for future in cf.as_completed(futures):
1104
+ node, suc, has_gpu = future.result()
1105
+ install_gpu = install_gpu or has_gpu
1106
+ if not suc:
1107
+ unsuccessful_workers.append(node)
1108
+
1109
+ # Step 3: Configure local kubectl to connect to the cluster
1110
+ progress_message('Configuring local kubectl to connect to the cluster...')
1111
+
1112
+ # Create temporary directory for kubeconfig operations
1113
+ with tempfile.TemporaryDirectory() as temp_dir:
1114
+ temp_kubeconfig = os.path.join(temp_dir, 'kubeconfig')
1115
+
1116
+ # Get the kubeconfig from remote server
1117
+ if head_use_ssh_config:
1118
+ scp_cmd = ['scp', head_node + ':~/.kube/config', temp_kubeconfig]
1119
+ else:
1120
+ scp_cmd = [
1121
+ 'scp', '-o', 'StrictHostKeyChecking=no', '-o',
1122
+ 'IdentitiesOnly=yes', '-i', ssh_key,
1123
+ f'{ssh_user}@{head_node}:~/.kube/config', temp_kubeconfig
1124
+ ]
1125
+ run_command(scp_cmd, shell=False)
1126
+
1127
+ # Create the directory for the kubeconfig file if it doesn't exist
1128
+ ensure_directory_exists(kubeconfig_path)
1129
+
1130
+ # Create empty kubeconfig if it doesn't exist
1131
+ if not os.path.isfile(kubeconfig_path):
1132
+ open(kubeconfig_path, 'a', encoding='utf-8').close()
1133
+
1134
+ # Modify the temporary kubeconfig to update server address and context name
1135
+ modified_config = os.path.join(temp_dir, 'modified_config')
1136
+ with open(temp_kubeconfig, 'r', encoding='utf-8') as f_in:
1137
+ with open(modified_config, 'w', encoding='utf-8') as f_out:
1138
+ in_cluster = False
1139
+ in_user = False
1140
+ client_cert_data = None
1141
+ client_key_data = None
1142
+
1143
+ for line in f_in:
1144
+ if 'clusters:' in line:
1145
+ in_cluster = True
1146
+ in_user = False
1147
+ elif 'users:' in line:
1148
+ in_cluster = False
1149
+ in_user = True
1150
+ elif 'contexts:' in line:
1151
+ in_cluster = False
1152
+ in_user = False
1153
+
1154
+ # Skip certificate authority data in cluster section
1155
+ if in_cluster and 'certificate-authority-data:' in line:
1156
+ continue
1157
+ # Skip client certificate data in user section but extract it
1158
+ elif in_user and 'client-certificate-data:' in line:
1159
+ client_cert_data = line.split(':', 1)[1].strip()
1160
+ continue
1161
+ # Skip client key data in user section but extract it
1162
+ elif in_user and 'client-key-data:' in line:
1163
+ client_key_data = line.split(':', 1)[1].strip()
1164
+ continue
1165
+ elif in_cluster and 'server:' in line:
1166
+ # Initially just set to the effective master IP
1167
+ # (will be changed to localhost by setup_kubectl_ssh_tunnel later)
1168
+ f_out.write(
1169
+ f' server: https://{effective_master_ip}:6443\n')
1170
+ f_out.write(' insecure-skip-tls-verify: true\n')
1171
+ continue
1172
+
1173
+ # Replace default context names with user-provided context name
1174
+ line = line.replace('name: default',
1175
+ f'name: {context_name}')
1176
+ line = line.replace('cluster: default',
1177
+ f'cluster: {context_name}')
1178
+ line = line.replace('user: default',
1179
+ f'user: {context_name}')
1180
+ line = line.replace('current-context: default',
1181
+ f'current-context: {context_name}')
1182
+
1183
+ f_out.write(line)
1184
+
1185
+ # Save certificate data if available
1186
+
1187
+ if client_cert_data:
1188
+ # Decode base64 data and save as PEM
1189
+ try:
1190
+ # Clean up the certificate data by removing whitespace
1191
+ clean_cert_data = ''.join(client_cert_data.split())
1192
+ cert_pem = base64.b64decode(clean_cert_data).decode(
1193
+ 'utf-8')
1194
+
1195
+ # Check if the data already looks like a PEM file
1196
+ has_begin = '-----BEGIN CERTIFICATE-----' in cert_pem
1197
+ has_end = '-----END CERTIFICATE-----' in cert_pem
1198
+
1199
+ if not has_begin or not has_end:
1200
+ print(
1201
+ f'{YELLOW}Warning: Certificate data missing PEM markers, attempting to fix...{NC}'
1202
+ )
1203
+ # Add PEM markers if missing
1204
+ if not has_begin:
1205
+ cert_pem = f'-----BEGIN CERTIFICATE-----\n{cert_pem}'
1206
+ if not has_end:
1207
+ cert_pem = f'{cert_pem}\n-----END CERTIFICATE-----'
1208
+
1209
+ # Write the certificate
1210
+ with open(cert_file_path, 'w',
1211
+ encoding='utf-8') as cert_file:
1212
+ cert_file.write(cert_pem)
1213
+
1214
+ # Verify the file was written correctly
1215
+ if os.path.getsize(cert_file_path) > 0:
1216
+ print(
1217
+ f'{GREEN}Successfully saved certificate data ({len(cert_pem)} bytes){NC}'
1218
+ )
1219
+
1220
+ # Quick validation of PEM format
1221
+ with open(cert_file_path, 'r',
1222
+ encoding='utf-8') as f:
1223
+ content = f.readlines()
1224
+ first_line = content[0].strip(
1225
+ ) if content else ''
1226
+ last_line = content[-1].strip(
1227
+ ) if content else ''
1228
+
1229
+ if not first_line.startswith(
1230
+ '-----BEGIN') or not last_line.startswith(
1231
+ '-----END'):
1232
+ print(
1233
+ f'{YELLOW}Warning: Certificate may not be in proper PEM format{NC}'
1234
+ )
1235
+ else:
1236
+ print(f'{RED}Error: Certificate file is empty{NC}')
1237
+ except Exception as e: # pylint: disable=broad-except
1238
+ print(
1239
+ f'{RED}Error processing certificate data: {e}{NC}')
1240
+
1241
+ if client_key_data:
1242
+ # Decode base64 data and save as PEM
1243
+ try:
1244
+ # Clean up the key data by removing whitespace
1245
+ clean_key_data = ''.join(client_key_data.split())
1246
+ key_pem = base64.b64decode(clean_key_data).decode(
1247
+ 'utf-8')
1248
+
1249
+ # Check if the data already looks like a PEM file
1250
+
1251
+ # Check for EC key format
1252
+ if 'EC PRIVATE KEY' in key_pem:
1253
+ # Handle EC KEY format directly
1254
+ match_ec = re.search(
1255
+ r'-----BEGIN EC PRIVATE KEY-----(.*?)-----END EC PRIVATE KEY-----',
1256
+ key_pem, re.DOTALL)
1257
+ if match_ec:
1258
+ # Extract and properly format EC key
1259
+ key_content = match_ec.group(1).strip()
1260
+ key_pem = f'-----BEGIN EC PRIVATE KEY-----\n{key_content}\n-----END EC PRIVATE KEY-----'
1261
+ else:
1262
+ # Extract content and assume EC format
1263
+ key_content = re.sub(r'-----BEGIN.*?-----', '',
1264
+ key_pem)
1265
+ key_content = re.sub(r'-----END.*?-----.*', '',
1266
+ key_content).strip()
1267
+ key_pem = f'-----BEGIN EC PRIVATE KEY-----\n{key_content}\n-----END EC PRIVATE KEY-----'
1268
+ else:
1269
+ # Handle regular private key format
1270
+ has_begin = any(marker in key_pem for marker in [
1271
+ '-----BEGIN PRIVATE KEY-----',
1272
+ '-----BEGIN RSA PRIVATE KEY-----'
1273
+ ])
1274
+ has_end = any(marker in key_pem for marker in [
1275
+ '-----END PRIVATE KEY-----',
1276
+ '-----END RSA PRIVATE KEY-----'
1277
+ ])
1278
+
1279
+ if not has_begin or not has_end:
1280
+ print(
1281
+ f'{YELLOW}Warning: Key data missing PEM markers, attempting to fix...{NC}'
1282
+ )
1283
+ # Add PEM markers if missing
1284
+ if not has_begin:
1285
+ key_pem = f'-----BEGIN PRIVATE KEY-----\n{key_pem}'
1286
+ if not has_end:
1287
+ key_pem = f'{key_pem}\n-----END PRIVATE KEY-----'
1288
+ # Remove any trailing characters after END marker
1289
+ key_pem = re.sub(
1290
+ r'(-----END PRIVATE KEY-----).*', r'\1',
1291
+ key_pem)
1292
+
1293
+ # Write the key
1294
+ with open(key_file_path, 'w',
1295
+ encoding='utf-8') as key_file:
1296
+ key_file.write(key_pem)
1297
+
1298
+ # Verify the file was written correctly
1299
+ if os.path.getsize(key_file_path) > 0:
1300
+ print(
1301
+ f'{GREEN}Successfully saved key data ({len(key_pem)} bytes){NC}'
1302
+ )
1303
+
1304
+ # Quick validation of PEM format
1305
+ with open(key_file_path, 'r',
1306
+ encoding='utf-8') as f:
1307
+ content = f.readlines()
1308
+ first_line = content[0].strip(
1309
+ ) if content else ''
1310
+ last_line = content[-1].strip(
1311
+ ) if content else ''
1312
+
1313
+ if not first_line.startswith(
1314
+ '-----BEGIN') or not last_line.startswith(
1315
+ '-----END'):
1316
+ print(
1317
+ f'{YELLOW}Warning: Key may not be in proper PEM format{NC}'
1318
+ )
1319
+ else:
1320
+ print(f'{RED}Error: Key file is empty{NC}')
1321
+ except Exception as e: # pylint: disable=broad-except
1322
+ print(f'{RED}Error processing key data: {e}{NC}')
1323
+
1324
+ # First check if context name exists and delete it if it does
1325
+ # TODO(romilb): Should we throw an error here instead?
1326
+ run_command(['kubectl', 'config', 'delete-context', context_name],
1327
+ shell=False)
1328
+ run_command(['kubectl', 'config', 'delete-cluster', context_name],
1329
+ shell=False)
1330
+ run_command(['kubectl', 'config', 'delete-user', context_name],
1331
+ shell=False)
1332
+
1333
+ # Merge the configurations using kubectl
1334
+ merged_config = os.path.join(temp_dir, 'merged_config')
1335
+ os.environ['KUBECONFIG'] = f'{kubeconfig_path}:{modified_config}'
1336
+ with open(merged_config, 'w', encoding='utf-8') as merged_file:
1337
+ kubectl_cmd = ['kubectl', 'config', 'view', '--flatten']
1338
+ result = run_command(kubectl_cmd, shell=False)
1339
+ if result:
1340
+ merged_file.write(result)
1341
+
1342
+ # Replace the kubeconfig with the merged config
1343
+ os.replace(merged_config, kubeconfig_path)
1344
+
1345
+ # Set the new context as the current context
1346
+ run_command(['kubectl', 'config', 'use-context', context_name],
1347
+ shell=False)
1348
+
1349
+ # Always set up SSH tunnel since we assume only port 22 is accessible
1350
+ setup_kubectl_ssh_tunnel(head_node,
1351
+ ssh_user,
1352
+ ssh_key,
1353
+ context_name,
1354
+ use_ssh_config=head_use_ssh_config)
1355
+
1356
+ success_message(f'kubectl configured with new context \'{context_name}\'.')
1357
+
1358
+ print(
1359
+ f'Cluster deployment completed. Kubeconfig saved to {kubeconfig_path}')
1360
+ print('You can now run \'kubectl get nodes\' to verify the setup.')
1361
+
1362
+ # Install GPU operator if a GPU was detected on any node
1363
+ if install_gpu:
1364
+ print(
1365
+ f'{YELLOW}GPU detected in the cluster. Installing Nvidia GPU Operator...{NC}'
1366
+ )
1367
+ cmd = f"""
1368
+ {askpass_block}
1369
+ curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
1370
+ chmod 700 get_helm.sh &&
1371
+ ./get_helm.sh &&
1372
+ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update &&
1373
+ kubectl create namespace gpu-operator --kubeconfig ~/.kube/config || true &&
1374
+ sudo -A ln -s /sbin/ldconfig /sbin/ldconfig.real || true &&
1375
+ helm install gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator \\
1376
+ --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \\
1377
+ --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \\
1378
+ --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \\
1379
+ --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \\
1380
+ --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \\
1381
+ --set 'toolkit.env[2].value=nvidia' &&
1382
+ echo 'Waiting for GPU operator installation...' &&
1383
+ while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:' || ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu.product'; do
1384
+ echo 'Waiting for GPU operator...'
1385
+ sleep 5
1386
+ done
1387
+ echo 'GPU operator installed successfully.'
1388
+ """
1389
+ result = run_remote(head_node,
1390
+ cmd,
1391
+ ssh_user,
1392
+ ssh_key,
1393
+ use_ssh_config=head_use_ssh_config)
1394
+ if result is None:
1395
+ print(f'{RED}Failed to install GPU Operator.{NC}')
1396
+ else:
1397
+ success_message('GPU Operator installed.')
1398
+ else:
1399
+ print(
1400
+ f'{YELLOW}No GPUs detected. Skipping GPU Operator installation.{NC}'
1401
+ )
1402
+
1403
+ # Configure SkyPilot
1404
+ progress_message('Configuring SkyPilot...')
1405
+
1406
+ # The env var KUBECONFIG ensures sky check uses the right kubeconfig
1407
+ os.environ['KUBECONFIG'] = kubeconfig_path
1408
+ run_command(['sky', 'check', 'kubernetes'], shell=False)
1409
+
1410
+ success_message('SkyPilot configured successfully.')
1411
+
1412
+ # Display final success message
1413
+ print(
1414
+ f'{GREEN}==== 🎉 Kubernetes cluster deployment completed successfully 🎉 ===={NC}'
1415
+ )
1416
+ print(
1417
+ 'You can now interact with your Kubernetes cluster through SkyPilot: ')
1418
+ print(' • List available GPUs: sky show-gpus --cloud kubernetes')
1419
+ print(
1420
+ ' • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes'
1421
+ )
1422
+ print(' • Connect to pod with VSCode: code --remote ssh-remote+devbox ')
1423
+ # Print completion marker for current cluster
1424
+ print(f'{GREEN}SKYPILOT_CLUSTER_COMPLETED: {NC}')
1425
+
1426
+ if unsuccessful_workers:
1427
+ quoted_unsuccessful_workers = [
1428
+ f'"{worker}"' for worker in unsuccessful_workers
1429
+ ]
1430
+
1431
+ print(
1432
+ f'{WARNING_YELLOW}Failed to deploy Kubernetes on the following nodes: '
1433
+ f'{", ".join(quoted_unsuccessful_workers)}. Please check '
1434
+ f'the logs for more details.{NC}')
1435
+
1436
+ return unsuccessful_workers
1437
+
1438
+
1439
+ if __name__ == '__main__':
1440
+ main()
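
For completeness, a minimal sketch of driving the same flow programmatically rather than through main() (the pool name is hypothetical, and this assumes the module is importable as sky.utils.kubernetes.deploy_remote_cluster):

    from sky.utils.kubernetes import deploy_remote_cluster as drc

    targets = drc.load_ssh_targets(drc.DEFAULT_SSH_NODE_POOLS_PATH)
    config = drc.get_cluster_config(targets, 'my-pool',
                                    file_path=drc.DEFAULT_SSH_NODE_POOLS_PATH)
    hosts = drc.prepare_hosts_info('my-pool', config['my-pool'])
    head, workers = hosts[0], hosts[1:]
    failed = drc.deploy_cluster(
        head_node=head['ip'],
        worker_nodes=[h['ip'] for h in workers],
        ssh_user=head['user'],
        ssh_key=head['identity_file'],
        context_name='ssh-my-pool',
        password=head['password'],
        head_use_ssh_config=head['use_ssh_config'],
        worker_use_ssh_config=[h['use_ssh_config'] for h in workers],
        kubeconfig_path=drc.DEFAULT_KUBECONFIG_PATH,
        cleanup=False,
        worker_hosts=workers)
    print('Workers that failed to join:', failed)

In practice main() additionally tracks deployment history under ~/.sky/ssh_node_pools_info and derives the context name as 'ssh-' plus the pool name, so it remains the supported entry point.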