skypilot-nightly 1.0.0.dev20250521__py3-none-any.whl → 1.0.0.dev20250523__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +46 -16
  3. sky/backends/cloud_vm_ray_backend.py +16 -4
  4. sky/check.py +109 -44
  5. sky/cli.py +261 -90
  6. sky/client/cli.py +261 -90
  7. sky/client/sdk.py +122 -3
  8. sky/clouds/__init__.py +5 -0
  9. sky/clouds/aws.py +4 -2
  10. sky/clouds/azure.py +4 -2
  11. sky/clouds/cloud.py +30 -6
  12. sky/clouds/cudo.py +2 -1
  13. sky/clouds/do.py +2 -1
  14. sky/clouds/fluidstack.py +2 -1
  15. sky/clouds/gcp.py +160 -23
  16. sky/clouds/ibm.py +4 -2
  17. sky/clouds/kubernetes.py +66 -22
  18. sky/clouds/lambda_cloud.py +2 -1
  19. sky/clouds/nebius.py +18 -2
  20. sky/clouds/oci.py +4 -2
  21. sky/clouds/paperspace.py +2 -1
  22. sky/clouds/runpod.py +2 -1
  23. sky/clouds/scp.py +2 -1
  24. sky/clouds/service_catalog/__init__.py +3 -0
  25. sky/clouds/service_catalog/common.py +9 -2
  26. sky/clouds/service_catalog/constants.py +2 -1
  27. sky/clouds/service_catalog/ssh_catalog.py +167 -0
  28. sky/clouds/ssh.py +203 -0
  29. sky/clouds/vast.py +2 -1
  30. sky/clouds/vsphere.py +2 -1
  31. sky/core.py +59 -17
  32. sky/dashboard/out/404.html +1 -1
  33. sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_buildManifest.js +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +1 -0
  35. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  36. sky/dashboard/out/clusters/[cluster].html +1 -1
  37. sky/dashboard/out/clusters.html +1 -1
  38. sky/dashboard/out/index.html +1 -1
  39. sky/dashboard/out/infra.html +1 -1
  40. sky/dashboard/out/jobs/[job].html +1 -1
  41. sky/dashboard/out/jobs.html +1 -1
  42. sky/data/storage.py +1 -0
  43. sky/execution.py +56 -7
  44. sky/jobs/server/core.py +4 -2
  45. sky/optimizer.py +29 -15
  46. sky/provision/__init__.py +1 -0
  47. sky/provision/aws/instance.py +17 -1
  48. sky/provision/gcp/constants.py +147 -4
  49. sky/provision/gcp/instance_utils.py +10 -0
  50. sky/provision/gcp/volume_utils.py +247 -0
  51. sky/provision/kubernetes/instance.py +16 -5
  52. sky/provision/kubernetes/utils.py +37 -19
  53. sky/provision/nebius/instance.py +3 -1
  54. sky/provision/nebius/utils.py +14 -2
  55. sky/provision/ssh/__init__.py +18 -0
  56. sky/resources.py +177 -4
  57. sky/serve/server/core.py +2 -4
  58. sky/server/common.py +46 -9
  59. sky/server/constants.py +2 -0
  60. sky/server/html/token_page.html +154 -0
  61. sky/server/requests/executor.py +3 -6
  62. sky/server/requests/payloads.py +7 -0
  63. sky/server/server.py +80 -8
  64. sky/setup_files/dependencies.py +1 -0
  65. sky/skypilot_config.py +117 -31
  66. sky/task.py +24 -1
  67. sky/templates/gcp-ray.yml.j2 +44 -1
  68. sky/templates/nebius-ray.yml.j2 +12 -2
  69. sky/utils/admin_policy_utils.py +26 -22
  70. sky/utils/context.py +36 -6
  71. sky/utils/context_utils.py +15 -0
  72. sky/utils/infra_utils.py +21 -1
  73. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  74. sky/utils/kubernetes/create_cluster.sh +1 -0
  75. sky/utils/kubernetes/deploy_remote_cluster.py +1437 -0
  76. sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
  77. sky/utils/kubernetes/ssh-tunnel.sh +387 -0
  78. sky/utils/log_utils.py +214 -1
  79. sky/utils/resources_utils.py +14 -0
  80. sky/utils/schemas.py +67 -0
  81. sky/utils/ux_utils.py +2 -1
  82. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/METADATA +6 -1
  83. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/RECORD +88 -81
  84. sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
  85. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  86. /sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_ssgManifest.js +0 -0
  87. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/WHEEL +0 -0
  88. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/entry_points.txt +0 -0
  89. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/licenses/LICENSE +0 -0
  90. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/top_level.txt +0 -0
sky/utils/kubernetes/deploy_remote_cluster.py (new file)
@@ -0,0 +1,1437 @@
1
+ """SSH-based Kubernetes Cluster Deployment Script"""
2
+ # Refer to https://docs.skypilot.co/en/latest/reservations/existing-machines.html for details on how to use this script. # pylint: disable=line-too-long
3
+ import argparse
4
+ import base64
5
+ import concurrent.futures as cf
6
+ import os
7
+ import random
8
+ import re
9
+ import shlex
10
+ import subprocess
11
+ import sys
12
+ import tempfile
13
+ from typing import Any, Dict, List, Optional, Set
14
+
15
+ import yaml
16
+
17
+ # Colors for nicer UX
18
+ RED = '\033[0;31m'
19
+ GREEN = '\033[0;32m'
20
+ YELLOW = '\033[1;33m'
21
+ WARNING_YELLOW = '\x1b[33m'
22
+ NC = '\033[0m' # No color
23
+
24
+ DEFAULT_SSH_NODE_POOLS_PATH = os.path.expanduser('~/.sky/ssh_node_pools.yaml')
25
+ DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
26
+ SSH_CONFIG_PATH = os.path.expanduser('~/.ssh/config')
27
+ NODE_POOLS_INFO_DIR = os.path.expanduser('~/.sky/ssh_node_pools_info')
28
+
29
+ # Get the directory of this script
30
+ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
31
+
32
+
33
+ class UniqueKeySafeLoader(yaml.SafeLoader):
34
+ """Custom YAML loader that raises an error if there are duplicate keys."""
35
+
36
+ def construct_mapping(self, node, deep=False):
37
+ mapping = {}
38
+ for key_node, value_node in node.value:
39
+ key = self.construct_object(key_node, deep=deep)
40
+ if key in mapping:
41
+ raise yaml.constructor.ConstructorError(
42
+ note=(f'Duplicate cluster config for cluster {key!r}.\n'
43
+ 'Please remove one of them from: '
44
+ f'{DEFAULT_SSH_NODE_POOLS_PATH}'))
45
+ value = self.construct_object(value_node, deep=deep)
46
+ mapping[key] = value
47
+ return mapping
48
+
49
+
50
+ # Register the custom constructor inside the class
51
+ UniqueKeySafeLoader.add_constructor(
52
+ yaml.resolver.BaseResolver.DEFAULT_MAPPING_TAG,
53
+ UniqueKeySafeLoader.construct_mapping)
54
+
55
+
56
+ def parse_args():
57
+ parser = argparse.ArgumentParser(
58
+ description='Deploy a Kubernetes cluster on remote machines.')
59
+ parser.add_argument(
60
+ '--infra', help='Name of the cluster in ssh_node_pools.yaml to use')
61
+ parser.add_argument(
62
+ '--ssh-node-pools-file',
63
+ dest='ssh_node_pools_file',
64
+ default=DEFAULT_SSH_NODE_POOLS_PATH,
65
+ help=
66
+ f'Path to SSH node pools YAML file (default: {DEFAULT_SSH_NODE_POOLS_PATH})'
67
+ )
68
+ parser.add_argument(
69
+ '--kubeconfig-path',
70
+ dest='kubeconfig_path',
71
+ default=DEFAULT_KUBECONFIG_PATH,
72
+ help=
73
+ f'Path to save the kubeconfig file (default: {DEFAULT_KUBECONFIG_PATH})'
74
+ )
75
+ parser.add_argument(
76
+ '--use-ssh-config',
77
+ dest='use_ssh_config',
78
+ action='store_true',
79
+ help='Use SSH config for host settings instead of explicit parameters')
80
+ # TODO(romilb): The `sky local up --ips` command is deprecated and these args are now captured in the ssh_node_pools.yaml file.
81
+ # Remove these args after 0.11.0 release.
82
+ parser.add_argument(
83
+ '--ips-file',
84
+ dest='ips_file',
85
+ help=
86
+ '[Deprecated, use --ssh-node-pools-file instead] File containing IP addresses or SSH host entries (one per line)'
87
+ )
88
+ parser.add_argument(
89
+ '--user',
90
+ help=
91
+ '[Deprecated, use --ssh-node-pools-file instead] Username to use for SSH (overridden by SSH config if host exists there)'
92
+ )
93
+ parser.add_argument(
94
+ '--ssh-key',
95
+ dest='ssh_key',
96
+ help=
97
+ '[Deprecated, use --ssh-node-pools-file instead] Path to SSH private key (overridden by SSH config if host exists there)'
98
+ )
99
+ parser.add_argument(
100
+ '--context-name',
101
+ dest='context_name',
102
+ default='default',
103
+ help=
104
+ '[Deprecated, use --ssh-node-pools-file instead] Kubernetes context name'
105
+ )
106
+ parser.add_argument('--cleanup',
107
+ action='store_true',
108
+ help='Clean up the cluster')
109
+ parser.add_argument(
110
+ '--password',
111
+ help='[Deprecated, use --ssh-node-pools-file instead] Password for sudo'
112
+ )
113
+
114
+ return parser.parse_args()
115
+
116
+
117
+ def load_ssh_targets(file_path: str) -> Dict[str, Any]:
118
+ """Load SSH targets from YAML file."""
119
+ if not os.path.exists(file_path):
120
+ print(f'{RED}Error: SSH Node Pools file not found: {file_path}{NC}',
121
+ file=sys.stderr)
122
+ sys.exit(1)
123
+
124
+ try:
125
+ with open(file_path, 'r', encoding='utf-8') as f:
126
+ targets = yaml.load(f, Loader=UniqueKeySafeLoader)
127
+ return targets
128
+ except yaml.constructor.ConstructorError as e:
129
+ print(f'{RED}{e.note}{NC}', file=sys.stderr)
130
+ sys.exit(1)
131
+ except (yaml.YAMLError, IOError, OSError) as e:
132
+ print(f'{RED}Error loading SSH Node Pools file: {e}{NC}',
133
+ file=sys.stderr)
134
+ sys.exit(1)
135
+
136
+
137
+ def check_host_in_ssh_config(hostname: str) -> bool:
138
+ """Return True iff *hostname* matches at least one `Host`/`Match` stanza
139
+ in the user's OpenSSH client configuration (including anything pulled in
140
+ via Include).
141
+
142
+ It calls: ssh -vvG <hostname> -o ConnectTimeout=0
143
+ which:
144
+ • -G expands the effective config without connecting
145
+ • -vv prints debug lines that show which stanzas are applied
146
+ • ConnectTimeout=0 avoids a DNS lookup if <hostname> is a FQDN/IP
147
+
148
+ No config files are opened or parsed manually.
149
+
150
+ Parameters
151
+ ----------
152
+ hostname : str
153
+ The alias/IP/FQDN you want to test.
154
+
155
+ Returns
156
+ -------
157
+ bool
158
+ True – a specific stanza matched the host
159
+ False – nothing but the global defaults (`Host *`) applied
160
+ """
161
+ # We direct stderr→stdout because debug output goes to stderr.
162
+ proc = subprocess.run(
163
+ ['ssh', '-vvG', hostname, '-o', 'ConnectTimeout=0'],
164
+ text=True,
165
+ stdout=subprocess.PIPE,
166
+ stderr=subprocess.STDOUT,
167
+ check=False, # we only want the text, not to raise
168
+ )
169
+
170
+ # Look for lines like:
171
+ # debug1: ~/.ssh/config line 42: Applying options for <hostname>
172
+ # Anything other than "*"
173
+ pattern = re.compile(r'^debug\d+: .*Applying options for ([^*].*)$',
174
+ re.MULTILINE)
175
+
176
+ return bool(pattern.search(proc.stdout))
177
+
178
+
179
+ def get_cluster_config(targets: Dict[str, Any],
180
+ cluster_name: Optional[str] = None,
181
+ file_path: Optional[str] = None) -> Dict[str, Any]:
182
+ """Get configuration for specific clusters or all clusters."""
183
+ if not targets:
184
+ print(
185
+ f'{RED}Error: No clusters defined in SSH Node Pools '
186
+ f'file {file_path}{NC}',
187
+ file=sys.stderr)
188
+ sys.exit(1)
189
+
190
+ if cluster_name:
191
+ if cluster_name not in targets:
192
+ print(
193
+ f'{RED}Error: Cluster {cluster_name!r} not found in '
194
+ f'SSH Node Pools file {file_path}{NC}',
195
+ file=sys.stderr)
196
+ sys.exit(1)
197
+ return {cluster_name: targets[cluster_name]}
198
+
199
+ # Return all clusters if no specific cluster is specified
200
+ return targets
201
+
202
+
203
+ def prepare_hosts_info(cluster_config: Dict[str, Any]) -> List[Dict[str, str]]:
204
+ """Prepare list of hosts with resolved user, identity_file, and password."""
205
+ if 'hosts' not in cluster_config or not cluster_config['hosts']:
206
+ print(f'{RED}Error: No hosts defined in cluster configuration{NC}',
207
+ file=sys.stderr)
208
+ sys.exit(1)
209
+
210
+ # Get cluster-level defaults
211
+ cluster_user = cluster_config.get('user', '')
212
+ cluster_identity_file = cluster_config.get('identity_file', '')
213
+ cluster_password = cluster_config.get('password', '')
214
+
215
+ hosts_info = []
216
+ for host in cluster_config['hosts']:
217
+ # Host can be a string (IP or SSH config hostname) or a dict
218
+ if isinstance(host, str):
219
+ # Check if this is an SSH config hostname
220
+ is_ssh_config_host = check_host_in_ssh_config(host)
221
+
222
+ hosts_info.append({
223
+ 'ip': host,
224
+ 'user': '' if is_ssh_config_host else cluster_user,
225
+ 'identity_file': '' if is_ssh_config_host else
226
+ cluster_identity_file,
227
+ 'password': cluster_password,
228
+ 'use_ssh_config': is_ssh_config_host
229
+ })
230
+ else:
231
+ # It's a dict with potential overrides
232
+ if 'ip' not in host:
233
+ print(
234
+ f'{RED}Warning: Host missing \'ip\' field, skipping: {host}{NC}'
235
+ )
236
+ continue
237
+
238
+ # Check if this is an SSH config hostname
239
+ is_ssh_config_host = check_host_in_ssh_config(host['ip'])
240
+
241
+ # Use host-specific values or fall back to cluster defaults
242
+ host_user = '' if is_ssh_config_host else host.get(
243
+ 'user', cluster_user)
244
+ host_identity_file = '' if is_ssh_config_host else host.get(
245
+ 'identity_file', cluster_identity_file)
246
+ host_password = host.get('password', cluster_password)
247
+
248
+ hosts_info.append({
249
+ 'ip': host['ip'],
250
+ 'user': host_user,
251
+ 'identity_file': host_identity_file,
252
+ 'password': host_password,
253
+ 'use_ssh_config': is_ssh_config_host
254
+ })
255
+
256
+ return hosts_info
257
+
258
+
259
+ def run_command(cmd, shell=False):
260
+ """Run a local command and return the output."""
261
+ process = subprocess.run(cmd,
262
+ shell=shell,
263
+ capture_output=True,
264
+ text=True,
265
+ check=False)
266
+ if process.returncode != 0:
267
+ print(f'{RED}Error executing command: {cmd}{NC}')
268
+ print(f'STDOUT: {process.stdout}')
269
+ print(f'STDERR: {process.stderr}')
270
+ return None
271
+ return process.stdout.strip()
272
+
273
+
274
+ def get_effective_host_ip(hostname: str) -> str:
275
+ """Get the effective IP for a hostname from SSH config."""
276
+ try:
277
+ result = subprocess.run(['ssh', '-G', hostname],
278
+ capture_output=True,
279
+ text=True,
280
+ check=False)
281
+ if result.returncode == 0:
282
+ for line in result.stdout.splitlines():
283
+ if line.startswith('hostname '):
284
+ return line.split(' ', 1)[1].strip()
285
+ except Exception: # pylint: disable=broad-except
286
+ pass
287
+ return hostname # Return the original hostname if lookup fails
288
+
289
+
290
+ def run_remote(node,
291
+ cmd,
292
+ user='',
293
+ ssh_key='',
294
+ connect_timeout=30,
295
+ use_ssh_config=False,
296
+ print_output=False,
297
+ use_shell=False):
298
+ """Run a command on a remote machine via SSH."""
299
+ if use_ssh_config:
300
+ # Use SSH config for connection parameters
301
+ ssh_cmd = ['ssh', node, cmd]
302
+ else:
303
+ # Use explicit parameters
304
+ ssh_cmd = [
305
+ 'ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'IdentitiesOnly=yes',
306
+ '-o', f'ConnectTimeout={connect_timeout}', '-o',
307
+ 'ServerAliveInterval=10', '-o', 'ServerAliveCountMax=3'
308
+ ]
309
+
310
+ if ssh_key:
311
+ ssh_cmd.extend(['-i', ssh_key])
312
+
313
+ ssh_cmd.append(f'{user}@{node}' if user else node)
314
+ ssh_cmd.append(cmd)
315
+
316
+ if use_shell:
317
+ ssh_cmd = ' '.join(ssh_cmd)
318
+
319
+ process = subprocess.run(ssh_cmd,
320
+ capture_output=True,
321
+ text=True,
322
+ check=False,
323
+ shell=use_shell)
324
+ if process.returncode != 0:
325
+ print(f'{RED}Error executing command {cmd} on {node}:{NC}')
326
+ print(f'STDERR: {process.stderr}')
327
+ return None
328
+ if print_output:
329
+ print(process.stdout)
330
+ return process.stdout.strip()
331
+
332
+
333
+ def create_askpass_script(password):
334
+ """Create an askpass script block for sudo with password."""
335
+ if not password:
336
+ return ''
337
+
338
+ return f"""
339
+ # Create temporary askpass script
340
+ ASKPASS_SCRIPT=$(mktemp)
341
+ trap 'rm -f $ASKPASS_SCRIPT' EXIT INT TERM ERR QUIT
342
+ cat > $ASKPASS_SCRIPT << EOF
343
+ #!/bin/bash
344
+ echo {password}
345
+ EOF
346
+ chmod 700 $ASKPASS_SCRIPT
347
+ # Use askpass
348
+ export SUDO_ASKPASS=$ASKPASS_SCRIPT
349
+ """
350
+
351
+
352
+ def progress_message(message):
353
+ """Show a progress message."""
354
+ print(f'{YELLOW}➜ {message}{NC}')
355
+
356
+
357
+ def success_message(message):
358
+ """Show a success message."""
359
+ print(f'{GREEN}✔ {message}{NC}')
360
+
361
+
362
+ def cleanup_server_node(node,
363
+ user,
364
+ ssh_key,
365
+ askpass_block,
366
+ use_ssh_config=False):
367
+ """Uninstall k3s and clean up the state on a server node."""
368
+ print(f'{YELLOW}Cleaning up head node {node}...{NC}')
369
+ cmd = f"""
370
+ {askpass_block}
371
+ echo 'Uninstalling k3s...' &&
372
+ sudo -A /usr/local/bin/k3s-uninstall.sh || true &&
373
+ sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
374
+ """
375
+ result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
376
+ if result is None:
377
+ print(f'{RED}Failed to clean up head node ({node}).{NC}')
378
+ else:
379
+ success_message(f'Node {node} cleaned up successfully.')
380
+
381
+
382
+ def cleanup_agent_node(node,
383
+ user,
384
+ ssh_key,
385
+ askpass_block,
386
+ use_ssh_config=False):
387
+ """Uninstall k3s and clean up the state on an agent node."""
388
+ print(f'{YELLOW}Cleaning up worker node {node}...{NC}')
389
+ cmd = f"""
390
+ {askpass_block}
391
+ echo 'Uninstalling k3s...' &&
392
+ sudo -A /usr/local/bin/k3s-agent-uninstall.sh || true &&
393
+ sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
394
+ """
395
+ result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
396
+ if result is None:
397
+ print(f'{RED}Failed to clean up worker node ({node}).{NC}')
398
+ else:
399
+ success_message(f'Node {node} cleaned up successfully.')
400
+
401
+
402
+ def start_agent_node(node,
403
+ master_addr,
404
+ k3s_token,
405
+ user,
406
+ ssh_key,
407
+ askpass_block,
408
+ use_ssh_config=False):
409
+ """Start a k3s agent node.
410
+ Returns: the node, whether the start succeeded, and whether it has a GPU."""
411
+ cmd = f"""
412
+ {askpass_block}
413
+ curl -sfL https://get.k3s.io | K3S_NODE_NAME={node} INSTALL_K3S_EXEC='agent --node-label skypilot-ip={node}' \
414
+ K3S_URL=https://{master_addr}:6443 K3S_TOKEN={k3s_token} sudo -E -A sh -
415
+ """
416
+ result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
417
+ if result is None:
418
+ print(f'{RED}Failed to deploy K3s on worker node ({node}).{NC}')
419
+ return node, False, False
420
+ success_message(f'Kubernetes deployed on worker node ({node}).')
421
+ # Check if worker node has a GPU
422
+ if check_gpu(node, user, ssh_key, use_ssh_config=use_ssh_config):
423
+ print(f'{YELLOW}GPU detected on worker node ({node}).{NC}')
424
+ return node, True, True
425
+ return node, True, False
426
+
427
+
428
+ def check_gpu(node, user, ssh_key, use_ssh_config=False):
429
+ """Check if a node has a GPU."""
430
+ cmd = 'command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null'
431
+ result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
432
+ return result is not None
433
+
434
+
435
+ def ensure_directory_exists(path):
436
+ """Ensure the directory for the specified file path exists."""
437
+ directory = os.path.dirname(path)
438
+ if directory and not os.path.exists(directory):
439
+ os.makedirs(directory, exist_ok=True)
440
+
441
+
442
+ def get_used_localhost_ports() -> Set[int]:
443
+ """Get SSH port forwardings already in use on localhost"""
444
+ used_ports = set()
445
+
446
+ # Get ports from netstat (works on macOS and Linux)
447
+ try:
448
+ if sys.platform == 'darwin':
449
+ # macOS
450
+ result = subprocess.run(['netstat', '-an', '-p', 'tcp'],
451
+ capture_output=True,
452
+ text=True,
453
+ check=False)
454
+ else:
455
+ # Linux and other Unix-like systems
456
+ result = subprocess.run(['netstat', '-tln'],
457
+ capture_output=True,
458
+ text=True,
459
+ check=False)
460
+
461
+ if result.returncode == 0:
462
+ # Look for lines with 'localhost:<port>' or '127.0.0.1:<port>'
463
+ for line in result.stdout.splitlines():
464
+ if '127.0.0.1:' in line or 'localhost:' in line:
465
+ match = re.search(r':(64\d\d)\s', line)
466
+ if match:
467
+ port = int(match.group(1))
468
+ if 6400 <= port <= 6500: # Only consider our range
469
+ used_ports.add(port)
470
+ except (subprocess.SubprocessError, FileNotFoundError):
471
+ # If netstat fails, try another approach
472
+ pass
473
+
474
+ # Also check ports from existing kubeconfig entries
475
+ try:
476
+ result = subprocess.run([
477
+ 'kubectl', 'config', 'view', '-o',
478
+ 'jsonpath=\'{.clusters[*].cluster.server}\''
479
+ ],
480
+ capture_output=True,
481
+ text=True,
482
+ check=False)
483
+
484
+ if result.returncode == 0:
485
+ # Look for localhost URLs with ports
486
+ for url in result.stdout.split():
487
+ if 'localhost:' in url or '127.0.0.1:' in url:
488
+ match = re.search(r':(\d+)', url)
489
+ if match:
490
+ port = int(match.group(1))
491
+ if 6400 <= port <= 6500: # Only consider our range
492
+ used_ports.add(port)
493
+ except subprocess.SubprocessError:
494
+ pass
495
+
496
+ return used_ports
497
+
498
+
499
+ def get_available_port(start: int = 6443, end: int = 6499) -> int:
500
+ """Get an available port in the given range that's not used by other tunnels"""
501
+ used_ports = get_used_localhost_ports()
502
+
503
+ # Try to use port 6443 first if available for the first cluster
504
+ if start == 6443 and start not in used_ports:
505
+ return start
506
+
507
+ # Otherwise find any available port in the range
508
+ available_ports = list(set(range(start, end + 1)) - used_ports)
509
+
510
+ if not available_ports:
511
+ # If all ports are used, pick a random one from our range
512
+ # (we'll terminate any existing connection in the setup)
513
+ return random.randint(start, end)
514
+
515
+ # Sort to get deterministic allocation
516
+ available_ports.sort()
517
+ return available_ports[0]
518
+
519
+
520
+ def setup_kubectl_ssh_tunnel(head_node,
521
+ ssh_user,
522
+ ssh_key,
523
+ context_name,
524
+ use_ssh_config=False):
525
+ """Set up kubeconfig exec credential plugin for SSH tunnel"""
526
+ progress_message('Setting up SSH tunnel for Kubernetes API access...')
527
+
528
+ # Get an available port for this cluster
529
+ port = get_available_port()
530
+
531
+ # Paths to scripts
532
+ tunnel_script = os.path.join(SCRIPT_DIR, 'ssh-tunnel.sh')
533
+
534
+ # Make sure scripts are executable
535
+ os.chmod(tunnel_script, 0o755)
536
+
537
+ # Certificate files
538
+ client_cert_file = os.path.join(NODE_POOLS_INFO_DIR,
539
+ f'{context_name}-cert.pem')
540
+ client_key_file = os.path.join(NODE_POOLS_INFO_DIR,
541
+ f'{context_name}-key.pem')
542
+
543
+ # Update kubeconfig to use localhost with the selected port
544
+ run_command([
545
+ 'kubectl', 'config', 'set-cluster', context_name,
546
+ f'--server=https://127.0.0.1:{port}', '--insecure-skip-tls-verify=true'
547
+ ])
548
+
549
+ # Build the exec args list based on auth method
550
+ exec_args = [
551
+ '--exec-command', tunnel_script, '--exec-api-version',
552
+ 'client.authentication.k8s.io/v1beta1'
553
+ ]
554
+
555
+ # Set credential TTL to force frequent tunnel checks
556
+ ttl_seconds = 30
557
+
558
+ # Verify if we have extracted certificate data files
559
+ has_cert_files = os.path.isfile(client_cert_file) and os.path.isfile(
560
+ client_key_file)
561
+ if has_cert_files:
562
+ print(
563
+ f'{GREEN}Client certificate data extracted and will be used for authentication{NC}'
564
+ )
565
+
566
+ if use_ssh_config:
567
+ run_command(
568
+ ['kubectl', 'config', 'set-credentials', context_name] + exec_args +
569
+ [
570
+ '--exec-arg=--context', f'--exec-arg={context_name}',
571
+ '--exec-arg=--port', f'--exec-arg={port}', '--exec-arg=--ttl',
572
+ f'--exec-arg={ttl_seconds}', '--exec-arg=--use-ssh-config',
573
+ '--exec-arg=--host', f'--exec-arg={head_node}'
574
+ ])
575
+ else:
576
+ run_command(['kubectl', 'config', 'set-credentials', context_name] +
577
+ exec_args + [
578
+ '--exec-arg=--context', f'--exec-arg={context_name}',
579
+ '--exec-arg=--port', f'--exec-arg={port}',
580
+ '--exec-arg=--ttl', f'--exec-arg={ttl_seconds}',
581
+ '--exec-arg=--host', f'--exec-arg={head_node}',
582
+ '--exec-arg=--user', f'--exec-arg={ssh_user}',
583
+ '--exec-arg=--ssh-key', f'--exec-arg={ssh_key}'
584
+ ])
585
+
586
+ success_message(
587
+ f'SSH tunnel configured through kubectl credential plugin on port {port}'
588
+ )
589
+ print(
590
+ f'{GREEN}Your kubectl connection is now tunneled through SSH (port {port}).{NC}'
591
+ )
592
+ print(
593
+ f'{GREEN}This tunnel will be automatically established when needed.{NC}'
594
+ )
595
+ print(
596
+ f'{GREEN}Credential TTL set to {ttl_seconds}s to ensure tunnel health is checked frequently.{NC}'
597
+ )
598
+
599
+ return port
600
+
601
+
602
+ def cleanup_kubectl_ssh_tunnel(context_name):
603
+ """Clean up the SSH tunnel for a specific context"""
604
+ progress_message(f'Cleaning up SSH tunnel for context {context_name}...')
605
+
606
+ # Path to cleanup script
607
+ cleanup_script = os.path.join(SCRIPT_DIR, 'cleanup-tunnel.sh')
608
+
609
+ # Make sure script is executable
610
+ if os.path.exists(cleanup_script):
611
+ os.chmod(cleanup_script, 0o755)
612
+
613
+ # Run the cleanup script
614
+ subprocess.run([cleanup_script, context_name],
615
+ stdout=subprocess.DEVNULL,
616
+ stderr=subprocess.DEVNULL,
617
+ check=False)
618
+
619
+ success_message(f'SSH tunnel for context {context_name} cleaned up')
620
+ else:
621
+ print(f'{YELLOW}Cleanup script not found: {cleanup_script}{NC}')
622
+
623
+
624
+ def main():
625
+ args = parse_args()
626
+
627
+ kubeconfig_path = os.path.expanduser(args.kubeconfig_path)
628
+ global_use_ssh_config = args.use_ssh_config
629
+
630
+ # Print cleanup mode marker if applicable
631
+ if args.cleanup:
632
+ print('SKYPILOT_CLEANUP_MODE: Cleanup mode activated')
633
+
634
+ # Check if using YAML configuration or command line arguments
635
+ if args.ips_file:
636
+ # Using command line arguments - legacy mode
637
+ if args.ssh_key and not os.path.isfile(
638
+ args.ssh_key) and not global_use_ssh_config:
639
+ print(f'{RED}Error: SSH key not found: {args.ssh_key}{NC}',
640
+ file=sys.stderr)
641
+ sys.exit(1)
642
+
643
+ if not os.path.isfile(args.ips_file):
644
+ print(f'{RED}Error: IPs file not found: {args.ips_file}{NC}',
645
+ file=sys.stderr)
646
+ sys.exit(1)
647
+
648
+ with open(args.ips_file, 'r', encoding='utf-8') as f:
649
+ hosts = [line.strip() for line in f if line.strip()]
650
+
651
+ if not hosts:
652
+ print(
653
+ f'{RED}Error: Hosts file is empty or not formatted correctly.{NC}',
654
+ file=sys.stderr)
655
+ sys.exit(1)
656
+
657
+ head_node = hosts[0]
658
+ worker_nodes = hosts[1:]
659
+ ssh_user = args.user if not global_use_ssh_config else ''
660
+ ssh_key = args.ssh_key if not global_use_ssh_config else ''
661
+ context_name = args.context_name
662
+ password = args.password
663
+
664
+ # Check if hosts are in SSH config
665
+ head_use_ssh_config = global_use_ssh_config or check_host_in_ssh_config(
666
+ head_node)
667
+ worker_use_ssh_config = [
668
+ global_use_ssh_config or check_host_in_ssh_config(node)
669
+ for node in worker_nodes
670
+ ]
671
+
672
+ # Single cluster deployment for legacy mode
673
+ deploy_cluster(head_node, worker_nodes, ssh_user, ssh_key, context_name,
674
+ password, head_use_ssh_config, worker_use_ssh_config,
675
+ kubeconfig_path, args.cleanup)
676
+ else:
677
+ # Using YAML configuration
678
+ targets = load_ssh_targets(args.ssh_node_pools_file)
679
+ clusters_config = get_cluster_config(targets,
680
+ args.infra,
681
+ file_path=args.ssh_node_pools_file)
682
+
683
+ # Print information about clusters being processed
684
+ num_clusters = len(clusters_config)
685
+ cluster_names = list(clusters_config.keys())
686
+ cluster_info = f'Found {num_clusters} Node Pool{"s" if num_clusters > 1 else ""}: {", ".join(cluster_names)}'
687
+ print(f'SKYPILOT_CLUSTER_INFO: {cluster_info}')
688
+
689
+ # Process each cluster
690
+ for cluster_name, cluster_config in clusters_config.items():
691
+ print(f'SKYPILOT_CURRENT_CLUSTER: {cluster_name}')
692
+ print(f'{YELLOW}==== Deploying cluster: {cluster_name} ===={NC}')
693
+ hosts_info = prepare_hosts_info(cluster_config)
694
+
695
+ if not hosts_info:
696
+ print(
697
+ f'{RED}Error: No valid hosts found for cluster {cluster_name!r}. Skipping.{NC}'
698
+ )
699
+ continue
700
+
701
+ # Generate a unique context name for each cluster
702
+ context_name = args.context_name
703
+ if context_name == 'default':
704
+ context_name = 'ssh-' + cluster_name
705
+
706
+ # Check cluster history
707
+ os.makedirs(NODE_POOLS_INFO_DIR, exist_ok=True)
708
+ history_yaml_file = os.path.join(NODE_POOLS_INFO_DIR,
709
+ f'{context_name}-history.yaml')
710
+
711
+ history = None
712
+ if os.path.exists(history_yaml_file):
713
+ print(f'{YELLOW}Loading history from {history_yaml_file}{NC}')
714
+ with open(history_yaml_file, 'r', encoding='utf-8') as f:
715
+ history = yaml.safe_load(f)
716
+ else:
717
+ print(f'{YELLOW}No history found for {context_name}.{NC}')
718
+
719
+ history_workers_info = None
720
+ history_worker_nodes = None
721
+ history_use_ssh_config = None
722
+ # Do not support changing anything besides hosts for now
723
+ if history is not None:
724
+ for key in ['user', 'identity_file', 'password']:
725
+ if history.get(key) != cluster_config.get(key):
726
+ raise ValueError(
727
+ f'Cluster configuration has changed for field {key!r}. '
728
+ f'Previous value: {history.get(key)}, '
729
+ f'Current value: {cluster_config.get(key)}')
730
+ history_hosts_info = prepare_hosts_info(history)
731
+ if history_hosts_info[0] != hosts_info[0]:
732
+ raise ValueError(
733
+ f'Cluster configuration has changed for master node. '
734
+ f'Previous value: {history_hosts_info[0]}, '
735
+ f'Current value: {hosts_info[0]}')
736
+ history_workers_info = history_hosts_info[1:] if len(
737
+ history_hosts_info) > 1 else []
738
+ history_worker_nodes = [h['ip'] for h in history_workers_info]
739
+ history_use_ssh_config = [
740
+ h.get('use_ssh_config', False) for h in history_workers_info
741
+ ]
742
+
743
+ # Use the first host as the head node and the rest as worker nodes
744
+ head_host = hosts_info[0]
745
+ worker_hosts = hosts_info[1:] if len(hosts_info) > 1 else []
746
+
747
+ head_node = head_host['ip']
748
+ worker_nodes = [h['ip'] for h in worker_hosts]
749
+ ssh_user = head_host['user']
750
+ ssh_key = head_host['identity_file']
751
+ head_use_ssh_config = global_use_ssh_config or head_host.get(
752
+ 'use_ssh_config', False)
753
+ worker_use_ssh_config = [
754
+ global_use_ssh_config or h.get('use_ssh_config', False)
755
+ for h in worker_hosts
756
+ ]
757
+ password = head_host['password']
758
+
759
+ # Deploy this cluster
760
+ unsuccessful_workers = deploy_cluster(
761
+ head_node,
762
+ worker_nodes,
763
+ ssh_user,
764
+ ssh_key,
765
+ context_name,
766
+ password,
767
+ head_use_ssh_config,
768
+ worker_use_ssh_config,
769
+ kubeconfig_path,
770
+ args.cleanup,
771
+ worker_hosts=worker_hosts,
772
+ history_worker_nodes=history_worker_nodes,
773
+ history_workers_info=history_workers_info,
774
+ history_use_ssh_config=history_use_ssh_config)
775
+
776
+ if not args.cleanup:
777
+ successful_hosts = []
778
+ for host in cluster_config['hosts']:
779
+ if isinstance(host, str):
780
+ host_node = host
781
+ else:
782
+ host_node = host['ip']
783
+ if host_node not in unsuccessful_workers:
784
+ successful_hosts.append(host)
785
+ cluster_config['hosts'] = successful_hosts
786
+ with open(history_yaml_file, 'w', encoding='utf-8') as f:
787
+ print(f'{YELLOW}Writing history to {history_yaml_file}{NC}')
788
+ yaml.dump(cluster_config, f)
789
+
790
+ print(
791
+ f'{GREEN}==== Completed deployment for cluster: {cluster_name} ===={NC}'
792
+ )
793
+
794
+
795
+ def deploy_cluster(head_node,
796
+ worker_nodes,
797
+ ssh_user,
798
+ ssh_key,
799
+ context_name,
800
+ password,
801
+ head_use_ssh_config,
802
+ worker_use_ssh_config,
803
+ kubeconfig_path,
804
+ cleanup,
805
+ worker_hosts=None,
806
+ history_worker_nodes=None,
807
+ history_workers_info=None,
808
+ history_use_ssh_config=None) -> List[str]:
809
+ """Deploy or clean up a single Kubernetes cluster.
810
+
811
+ Returns: List of unsuccessful worker nodes.
812
+ """
813
+ # Ensure SSH key is expanded for paths with ~ (home directory)
814
+ if ssh_key:
815
+ ssh_key = os.path.expanduser(ssh_key)
816
+
817
+ history_yaml_file = os.path.join(NODE_POOLS_INFO_DIR,
818
+ f'{context_name}-history.yaml')
819
+ cert_file_path = os.path.join(NODE_POOLS_INFO_DIR,
820
+ f'{context_name}-cert.pem')
821
+ key_file_path = os.path.join(NODE_POOLS_INFO_DIR, f'{context_name}-key.pem')
822
+ tunnel_log_file_path = os.path.join(NODE_POOLS_INFO_DIR,
823
+ f'{context_name}-tunnel.log')
824
+
825
+ # Generate the askpass block if password is provided
826
+ askpass_block = create_askpass_script(password)
827
+
828
+ # Token for k3s
829
+ k3s_token = 'mytoken' # Any string can be used as the token
830
+
831
+ # Pre-flight checks
832
+ print(f'{YELLOW}Checking SSH connection to head node...{NC}')
833
+ result = run_remote(
834
+ head_node,
835
+ f'echo \'SSH connection successful ({head_node})\'',
836
+ ssh_user,
837
+ ssh_key,
838
+ use_ssh_config=head_use_ssh_config,
839
+ # For SkySSHUpLineProcessor
840
+ print_output=True)
841
+ if result is None:
842
+ print(
843
+ f'{RED}Failed to SSH to head node ({head_node}). '
844
+ f'Please check the SSH configuration.{NC}',
845
+ file=sys.stderr)
846
+ sys.exit(1)
847
+
848
+ # Checking history
849
+ history_exists = (history_worker_nodes is not None and
850
+ history_workers_info is not None and
851
+ history_use_ssh_config is not None)
852
+
853
+ # Cleanup history worker nodes
854
+ worker_nodes_to_cleanup = []
855
+ remove_worker_cmds = []
856
+ if history_exists:
857
+ for history_node, history_info, use_ssh_config in zip(
858
+ history_worker_nodes, history_workers_info,
859
+ history_use_ssh_config):
860
+ if worker_hosts is not None and history_info not in worker_hosts:
861
+ print(
862
+ f'{YELLOW}Worker node {history_node} not found in YAML config. '
863
+ f'Removing from history...{NC}')
864
+ worker_nodes_to_cleanup.append(
865
+ dict(
866
+ node=history_node,
867
+ user=ssh_user
868
+ if history_info is None else history_info['user'],
869
+ ssh_key=ssh_key if history_info is None else
870
+ history_info['identity_file'],
871
+ askpass_block=(askpass_block if history_info is None
872
+ else create_askpass_script(
873
+ history_info['password'])),
874
+ use_ssh_config=use_ssh_config,
875
+ ))
876
+ remove_worker_cmds.append(
877
+ f'kubectl delete node -l skypilot-ip={history_node}')
878
+ # If this is a create operation and a stale tunnel log exists,
879
+ # remove it so a fresh file is used for the new logs.
880
+ if not cleanup and os.path.exists(tunnel_log_file_path):
881
+ os.remove(tunnel_log_file_path)
882
+
883
+ # If --cleanup flag is set, uninstall k3s and exit
884
+ if cleanup:
885
+ # Pick up all current worker nodes for cleanup
886
+ worker_nodes_to_cleanup.clear()
887
+ for node, info, use_ssh_config in zip(worker_nodes, worker_hosts or
888
+ [None] * len(worker_nodes), worker_use_ssh_config):
889
+ worker_nodes_to_cleanup.append(
890
+ dict(
891
+ node=node,
892
+ user=ssh_user if info is None else info['user'],
893
+ ssh_key=ssh_key if info is None else info['identity_file'],
894
+ askpass_block=(askpass_block if info is None else
895
+ create_askpass_script(info['password'])),
896
+ use_ssh_config=use_ssh_config,
897
+ ))
898
+
899
+ print(f'{YELLOW}Starting cleanup...{NC}')
900
+
901
+ # Clean up head node
902
+ cleanup_server_node(head_node,
903
+ ssh_user,
904
+ ssh_key,
905
+ askpass_block,
906
+ use_ssh_config=head_use_ssh_config)
907
+ # Clean up worker nodes
908
+ with cf.ThreadPoolExecutor() as executor:
909
+ executor.map(lambda kwargs: cleanup_agent_node(**kwargs),
910
+ worker_nodes_to_cleanup)
911
+
912
+ with cf.ThreadPoolExecutor() as executor:
913
+
914
+ def run_cleanup_cmd(cmd):
915
+ print('Cleaning up worker nodes:', cmd)
916
+ run_command(cmd, shell=True)
917
+
918
+ executor.map(run_cleanup_cmd, remove_worker_cmds)
919
+
920
+ if cleanup:
921
+
922
+ # Remove the context from local kubeconfig if it exists
923
+ if os.path.isfile(kubeconfig_path):
924
+ progress_message(
925
+ f'Removing context {context_name!r} from local kubeconfig...')
926
+ run_command(['kubectl', 'config', 'delete-context', context_name],
927
+ shell=False)
928
+ run_command(['kubectl', 'config', 'delete-cluster', context_name],
929
+ shell=False)
930
+ run_command(['kubectl', 'config', 'delete-user', context_name],
931
+ shell=False)
932
+
933
+ # Update the current context to the first available context
934
+ contexts = run_command([
935
+ 'kubectl', 'config', 'view', '-o',
936
+ 'jsonpath=\'{.contexts[0].name}\''
937
+ ],
938
+ shell=False)
939
+ if contexts:
940
+ run_command(['kubectl', 'config', 'use-context', contexts],
941
+ shell=False)
942
+ else:
943
+ # If no context is available, simply unset the current context
944
+ run_command(['kubectl', 'config', 'unset', 'current-context'],
945
+ shell=False)
946
+
947
+ success_message(
948
+ f'Context {context_name!r} removed from local kubeconfig.')
949
+
950
+ for file in [history_yaml_file, cert_file_path, key_file_path]:
951
+ if os.path.exists(file):
952
+ os.remove(file)
953
+
954
+ # Clean up the SSH tunnel after the kubeconfig cleanup, because any kubectl
955
+ # call would otherwise restart the tunnel if it is not running.
956
+ cleanup_kubectl_ssh_tunnel(context_name)
957
+
958
+ print(f'{GREEN}Cleanup completed successfully.{NC}')
959
+
960
+ # Print completion marker for current cluster
961
+ print(f'{GREEN}SKYPILOT_CLUSTER_COMPLETED: {NC}')
962
+
963
+ return []
964
+
965
+ print(f'{YELLOW}Checking TCP Forwarding Options...{NC}')
966
+ cmd = (
967
+ 'if [ "$(sudo sshd -T | grep allowtcpforwarding)" = "allowtcpforwarding yes" ]; then '
968
+ f'echo "TCP Forwarding already enabled on head node ({head_node})."; '
969
+ 'else '
970
+ 'sudo sed -i \'s/^#\?\s*AllowTcpForwarding.*/AllowTcpForwarding yes/\' ' # pylint: disable=anomalous-backslash-in-string
971
+ '/etc/ssh/sshd_config && sudo systemctl restart sshd && '
972
+ f'echo "Successfully enabled TCP Forwarding on head node ({head_node})."; '
973
+ 'fi')
974
+ result = run_remote(
975
+ head_node,
976
+ shlex.quote(cmd),
977
+ ssh_user,
978
+ ssh_key,
979
+ use_ssh_config=head_use_ssh_config,
980
+ # For SkySSHUpLineProcessor
981
+ print_output=True,
982
+ use_shell=True)
983
+ if result is None:
984
+ print(
985
+ f'{RED}Failed to set up TCP forwarding on head node ({head_node}). '
986
+ f'Please check the SSH configuration.{NC}',
987
+ file=sys.stderr)
988
+
989
+ # Get effective IP for master node if using SSH config - needed for workers to connect
990
+ if head_use_ssh_config:
991
+ effective_master_ip = get_effective_host_ip(head_node)
992
+ print(
993
+ f'{GREEN}Resolved head node {head_node} to {effective_master_ip} from SSH config{NC}'
994
+ )
995
+ else:
996
+ effective_master_ip = head_node
997
+
998
+ # Step 1: Install k3s on the head node
999
+ # Check if head node has a GPU
1000
+ install_gpu = False
1001
+ progress_message(f'Deploying Kubernetes on head node ({head_node})...')
1002
+ cmd = f"""
1003
+ {askpass_block}
1004
+ curl -sfL https://get.k3s.io | K3S_TOKEN={k3s_token} K3S_NODE_NAME={head_node} sudo -E -A sh - &&
1005
+ mkdir -p ~/.kube &&
1006
+ sudo -A cp /etc/rancher/k3s/k3s.yaml ~/.kube/config &&
1007
+ sudo -A chown $(id -u):$(id -g) ~/.kube/config &&
1008
+ for i in {{1..3}}; do
1009
+ if kubectl wait --for=condition=ready node --all --timeout=2m --kubeconfig ~/.kube/config; then
1010
+ break
1011
+ else
1012
+ echo 'Waiting for nodes to be ready...'
1013
+ sleep 5
1014
+ fi
1015
+ done
1016
+ if [ $i -eq 3 ]; then
1017
+ echo 'Failed to wait for nodes to be ready after 3 attempts'
1018
+ exit 1
1019
+ fi
1020
+ """
1021
+ result = run_remote(head_node,
1022
+ cmd,
1023
+ ssh_user,
1024
+ ssh_key,
1025
+ use_ssh_config=head_use_ssh_config)
1026
+ if result is None:
1027
+ print(f'{RED}Failed to deploy K3s on head node ({head_node}). {NC}',
1028
+ file=sys.stderr)
1029
+ sys.exit(1)
1030
+ success_message(f'K3s deployed on head node ({head_node}).')
1031
+
1032
+ # Check if head node has a GPU
1033
+ install_gpu = False
1034
+ if check_gpu(head_node,
1035
+ ssh_user,
1036
+ ssh_key,
1037
+ use_ssh_config=head_use_ssh_config):
1038
+ print(f'{YELLOW}GPU detected on head node ({head_node}).{NC}')
1039
+ install_gpu = True
1040
+
1041
+ # Fetch the head node's internal IP (this will be passed to worker nodes)
1042
+ master_addr = run_remote(head_node,
1043
+ 'hostname -I | awk \'{print $1}\'',
1044
+ ssh_user,
1045
+ ssh_key,
1046
+ use_ssh_config=head_use_ssh_config)
1047
+ if master_addr is None:
1048
+ print(
1049
+ f'{RED}Failed to SSH to head node ({head_node}). '
1050
+ f'Please check the SSH configuration.{NC}',
1051
+ file=sys.stderr)
1052
+ sys.exit(1)
1053
+ print(f'{GREEN}Master node internal IP: {master_addr}{NC}')
1054
+
1055
+ # Step 2: Install k3s on worker nodes and join them to the master node
1056
+ def deploy_worker(args):
1057
+ (i, node, worker_hosts, history_workers_info, ssh_user, ssh_key,
1058
+ askpass_block, worker_use_ssh_config, master_addr, k3s_token) = args
1059
+ progress_message(f'Deploying Kubernetes on worker node ({node})...')
1060
+
1061
+ # If using YAML config with specific worker info
1062
+ if worker_hosts and i < len(worker_hosts):
1063
+ if history_workers_info is not None and worker_hosts[
1064
+ i] in history_workers_info:
1065
+ print(
1066
+ f'{YELLOW}Worker node ({node}) already exists in history. '
1067
+ f'Skipping...{NC}')
1068
+ return node, True, False
1069
+ worker_user = worker_hosts[i]['user']
1070
+ worker_key = worker_hosts[i]['identity_file']
1071
+ worker_password = worker_hosts[i]['password']
1072
+ worker_askpass = create_askpass_script(worker_password)
1073
+ worker_config = worker_use_ssh_config[i]
1074
+ else:
1075
+ worker_user = ssh_user
1076
+ worker_key = ssh_key
1077
+ worker_askpass = askpass_block
1078
+ worker_config = worker_use_ssh_config[i]
1079
+
1080
+ return start_agent_node(node,
1081
+ master_addr,
1082
+ k3s_token,
1083
+ worker_user,
1084
+ worker_key,
1085
+ worker_askpass,
1086
+ use_ssh_config=worker_config)
1087
+
1088
+ unsuccessful_workers = []
1089
+
1090
+ # Deploy workers in parallel using thread pool
1091
+ with cf.ThreadPoolExecutor() as executor:
1092
+ futures = []
1093
+ for i, node in enumerate(worker_nodes):
1094
+ args = (i, node, worker_hosts, history_workers_info, ssh_user,
1095
+ ssh_key, askpass_block, worker_use_ssh_config, master_addr,
1096
+ k3s_token)
1097
+ futures.append(executor.submit(deploy_worker, args))
1098
+
1099
+ # Check if worker node has a GPU
1100
+ for future in cf.as_completed(futures):
1101
+ node, suc, has_gpu = future.result()
1102
+ install_gpu = install_gpu or has_gpu
1103
+ if not suc:
1104
+ unsuccessful_workers.append(node)
1105
+
1106
+ # Step 3: Configure local kubectl to connect to the cluster
1107
+ progress_message('Configuring local kubectl to connect to the cluster...')
1108
+
1109
+ # Create temporary directory for kubeconfig operations
1110
+ with tempfile.TemporaryDirectory() as temp_dir:
1111
+ temp_kubeconfig = os.path.join(temp_dir, 'kubeconfig')
1112
+
1113
+ # Get the kubeconfig from remote server
1114
+ if head_use_ssh_config:
1115
+ scp_cmd = ['scp', head_node + ':~/.kube/config', temp_kubeconfig]
1116
+ else:
1117
+ scp_cmd = [
1118
+ 'scp', '-o', 'StrictHostKeyChecking=no', '-o',
1119
+ 'IdentitiesOnly=yes', '-i', ssh_key,
1120
+ f'{ssh_user}@{head_node}:~/.kube/config', temp_kubeconfig
1121
+ ]
1122
+ run_command(scp_cmd, shell=False)
1123
+
1124
+ # Create the directory for the kubeconfig file if it doesn't exist
1125
+ ensure_directory_exists(kubeconfig_path)
1126
+
1127
+ # Create empty kubeconfig if it doesn't exist
1128
+ if not os.path.isfile(kubeconfig_path):
1129
+ open(kubeconfig_path, 'a', encoding='utf-8').close()
1130
+
1131
+ # Modify the temporary kubeconfig to update server address and context name
1132
+ modified_config = os.path.join(temp_dir, 'modified_config')
1133
+ with open(temp_kubeconfig, 'r', encoding='utf-8') as f_in:
1134
+ with open(modified_config, 'w', encoding='utf-8') as f_out:
1135
+ in_cluster = False
1136
+ in_user = False
1137
+ client_cert_data = None
1138
+ client_key_data = None
1139
+
1140
+ for line in f_in:
1141
+ if 'clusters:' in line:
1142
+ in_cluster = True
1143
+ in_user = False
1144
+ elif 'users:' in line:
1145
+ in_cluster = False
1146
+ in_user = True
1147
+ elif 'contexts:' in line:
1148
+ in_cluster = False
1149
+ in_user = False
1150
+
1151
+ # Skip certificate authority data in cluster section
1152
+ if in_cluster and 'certificate-authority-data:' in line:
1153
+ continue
1154
+ # Skip client certificate data in user section but extract it
1155
+ elif in_user and 'client-certificate-data:' in line:
1156
+ client_cert_data = line.split(':', 1)[1].strip()
1157
+ continue
1158
+ # Skip client key data in user section but extract it
1159
+ elif in_user and 'client-key-data:' in line:
1160
+ client_key_data = line.split(':', 1)[1].strip()
1161
+ continue
1162
+ elif in_cluster and 'server:' in line:
1163
+ # Initially just set to the effective master IP
1164
+ # (will be changed to localhost by setup_kubectl_ssh_tunnel later)
1165
+ f_out.write(
1166
+ f' server: https://{effective_master_ip}:6443\n')
1167
+ f_out.write(' insecure-skip-tls-verify: true\n')
1168
+ continue
1169
+
1170
+ # Replace default context names with user-provided context name
1171
+ line = line.replace('name: default',
1172
+ f'name: {context_name}')
1173
+ line = line.replace('cluster: default',
1174
+ f'cluster: {context_name}')
1175
+ line = line.replace('user: default',
1176
+ f'user: {context_name}')
1177
+ line = line.replace('current-context: default',
1178
+ f'current-context: {context_name}')
1179
+
1180
+ f_out.write(line)
1181
+
1182
+ # Save certificate data if available
1183
+
1184
+ if client_cert_data:
1185
+ # Decode base64 data and save as PEM
1186
+ try:
1187
+ # Clean up the certificate data by removing whitespace
1188
+ clean_cert_data = ''.join(client_cert_data.split())
1189
+ cert_pem = base64.b64decode(clean_cert_data).decode(
1190
+ 'utf-8')
1191
+
1192
+ # Check if the data already looks like a PEM file
1193
+ has_begin = '-----BEGIN CERTIFICATE-----' in cert_pem
1194
+ has_end = '-----END CERTIFICATE-----' in cert_pem
1195
+
1196
+ if not has_begin or not has_end:
1197
+ print(
1198
+ f'{YELLOW}Warning: Certificate data missing PEM markers, attempting to fix...{NC}'
1199
+ )
1200
+ # Add PEM markers if missing
1201
+ if not has_begin:
1202
+ cert_pem = f'-----BEGIN CERTIFICATE-----\n{cert_pem}'
1203
+ if not has_end:
1204
+ cert_pem = f'{cert_pem}\n-----END CERTIFICATE-----'
1205
+
1206
+ # Write the certificate
1207
+ with open(cert_file_path, 'w',
1208
+ encoding='utf-8') as cert_file:
1209
+ cert_file.write(cert_pem)
1210
+
1211
+ # Verify the file was written correctly
1212
+ if os.path.getsize(cert_file_path) > 0:
1213
+ print(
1214
+ f'{GREEN}Successfully saved certificate data ({len(cert_pem)} bytes){NC}'
1215
+ )
1216
+
1217
+ # Quick validation of PEM format
1218
+ with open(cert_file_path, 'r',
1219
+ encoding='utf-8') as f:
1220
+ content = f.readlines()
1221
+ first_line = content[0].strip(
1222
+ ) if content else ''
1223
+ last_line = content[-1].strip(
1224
+ ) if content else ''
1225
+
1226
+ if not first_line.startswith(
1227
+ '-----BEGIN') or not last_line.startswith(
1228
+ '-----END'):
1229
+ print(
1230
+ f'{YELLOW}Warning: Certificate may not be in proper PEM format{NC}'
1231
+ )
1232
+ else:
1233
+ print(f'{RED}Error: Certificate file is empty{NC}')
1234
+ except Exception as e: # pylint: disable=broad-except
1235
+ print(
1236
+ f'{RED}Error processing certificate data: {e}{NC}')
1237
+
1238
+ if client_key_data:
1239
+ # Decode base64 data and save as PEM
1240
+ try:
1241
+ # Clean up the key data by removing whitespace
1242
+ clean_key_data = ''.join(client_key_data.split())
1243
+ key_pem = base64.b64decode(clean_key_data).decode(
1244
+ 'utf-8')
1245
+
1246
+ # Check if the data already looks like a PEM file
1247
+
1248
+ # Check for EC key format
1249
+ if 'EC PRIVATE KEY' in key_pem:
1250
+ # Handle EC KEY format directly
1251
+ match_ec = re.search(
1252
+ r'-----BEGIN EC PRIVATE KEY-----(.*?)-----END EC PRIVATE KEY-----',
1253
+ key_pem, re.DOTALL)
1254
+ if match_ec:
1255
+ # Extract and properly format EC key
1256
+ key_content = match_ec.group(1).strip()
1257
+ key_pem = f'-----BEGIN EC PRIVATE KEY-----\n{key_content}\n-----END EC PRIVATE KEY-----'
1258
+ else:
1259
+ # Extract content and assume EC format
1260
+ key_content = re.sub(r'-----BEGIN.*?-----', '',
1261
+ key_pem)
1262
+ key_content = re.sub(r'-----END.*?-----.*', '',
1263
+ key_content).strip()
1264
+ key_pem = f'-----BEGIN EC PRIVATE KEY-----\n{key_content}\n-----END EC PRIVATE KEY-----'
1265
+ else:
1266
+ # Handle regular private key format
1267
+ has_begin = any(marker in key_pem for marker in [
1268
+ '-----BEGIN PRIVATE KEY-----',
1269
+ '-----BEGIN RSA PRIVATE KEY-----'
1270
+ ])
1271
+ has_end = any(marker in key_pem for marker in [
1272
+ '-----END PRIVATE KEY-----',
1273
+ '-----END RSA PRIVATE KEY-----'
1274
+ ])
1275
+
1276
+ if not has_begin or not has_end:
1277
+ print(
1278
+ f'{YELLOW}Warning: Key data missing PEM markers, attempting to fix...{NC}'
1279
+ )
1280
+ # Add PEM markers if missing
1281
+ if not has_begin:
1282
+ key_pem = f'-----BEGIN PRIVATE KEY-----\n{key_pem}'
1283
+ if not has_end:
1284
+ key_pem = f'{key_pem}\n-----END PRIVATE KEY-----'
1285
+ # Remove any trailing characters after END marker
1286
+ key_pem = re.sub(
1287
+ r'(-----END PRIVATE KEY-----).*', r'\1',
1288
+ key_pem)
1289
+
1290
+ # Write the key
1291
+ with open(key_file_path, 'w',
1292
+ encoding='utf-8') as key_file:
1293
+ key_file.write(key_pem)
1294
+
1295
+ # Verify the file was written correctly
1296
+ if os.path.getsize(key_file_path) > 0:
1297
+ print(
1298
+ f'{GREEN}Successfully saved key data ({len(key_pem)} bytes){NC}'
1299
+ )
1300
+
1301
+ # Quick validation of PEM format
1302
+ with open(key_file_path, 'r',
1303
+ encoding='utf-8') as f:
1304
+ content = f.readlines()
1305
+ first_line = content[0].strip(
1306
+ ) if content else ''
1307
+ last_line = content[-1].strip(
1308
+ ) if content else ''
1309
+
1310
+ if not first_line.startswith(
1311
+ '-----BEGIN') or not last_line.startswith(
1312
+ '-----END'):
1313
+ print(
1314
+ f'{YELLOW}Warning: Key may not be in proper PEM format{NC}'
1315
+ )
1316
+ else:
1317
+ print(f'{RED}Error: Key file is empty{NC}')
1318
+ except Exception as e: # pylint: disable=broad-except
1319
+ print(f'{RED}Error processing key data: {e}{NC}')
1320
+
1321
+ # First check if context name exists and delete it if it does
1322
+ # TODO(romilb): Should we throw an error here instead?
1323
+ run_command(['kubectl', 'config', 'delete-context', context_name],
1324
+ shell=False)
1325
+ run_command(['kubectl', 'config', 'delete-cluster', context_name],
1326
+ shell=False)
1327
+ run_command(['kubectl', 'config', 'delete-user', context_name],
1328
+ shell=False)
1329
+
1330
+ # Merge the configurations using kubectl
1331
+ merged_config = os.path.join(temp_dir, 'merged_config')
1332
+ os.environ['KUBECONFIG'] = f'{kubeconfig_path}:{modified_config}'
1333
+ with open(merged_config, 'w', encoding='utf-8') as merged_file:
1334
+ kubectl_cmd = ['kubectl', 'config', 'view', '--flatten']
1335
+ result = run_command(kubectl_cmd, shell=False)
1336
+ if result:
1337
+ merged_file.write(result)
1338
+
1339
+ # Replace the kubeconfig with the merged config
1340
+ os.replace(merged_config, kubeconfig_path)
1341
+
1342
+ # Set the new context as the current context
1343
+ run_command(['kubectl', 'config', 'use-context', context_name],
1344
+ shell=False)
1345
+
1346
+ # Always set up SSH tunnel since we assume only port 22 is accessible
1347
+ setup_kubectl_ssh_tunnel(head_node,
1348
+ ssh_user,
1349
+ ssh_key,
1350
+ context_name,
1351
+ use_ssh_config=head_use_ssh_config)
1352
+
1353
+ success_message(f'kubectl configured with new context \'{context_name}\'.')
1354
+
1355
+ print(
1356
+ f'Cluster deployment completed. Kubeconfig saved to {kubeconfig_path}')
1357
+ print('You can now run \'kubectl get nodes\' to verify the setup.')
1358
+
1359
+ # Install GPU operator if a GPU was detected on any node
1360
+ if install_gpu:
1361
+ print(
1362
+ f'{YELLOW}GPU detected in the cluster. Installing Nvidia GPU Operator...{NC}'
1363
+ )
1364
+ cmd = f"""
1365
+ {askpass_block}
1366
+ curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
1367
+ chmod 700 get_helm.sh &&
1368
+ ./get_helm.sh &&
1369
+ helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update &&
1370
+ kubectl create namespace gpu-operator --kubeconfig ~/.kube/config || true &&
1371
+ sudo -A ln -s /sbin/ldconfig /sbin/ldconfig.real || true &&
1372
+ helm install gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator \\
1373
+ --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \\
1374
+ --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \\
1375
+ --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \\
1376
+ --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \\
1377
+ --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \\
1378
+ --set 'toolkit.env[2].value=nvidia' &&
1379
+ echo 'Waiting for GPU operator installation...' &&
1380
+ while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:' || ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu.product'; do
1381
+ echo 'Waiting for GPU operator...'
1382
+ sleep 5
1383
+ done
1384
+ echo 'GPU operator installed successfully.'
1385
+ """
1386
+ result = run_remote(head_node,
1387
+ cmd,
1388
+ ssh_user,
1389
+ ssh_key,
1390
+ use_ssh_config=head_use_ssh_config)
1391
+ if result is None:
1392
+ print(f'{RED}Failed to install GPU Operator.{NC}')
1393
+ else:
1394
+ success_message('GPU Operator installed.')
1395
+ else:
1396
+ print(
1397
+ f'{YELLOW}No GPUs detected. Skipping GPU Operator installation.{NC}'
1398
+ )
1399
+
1400
+ # Configure SkyPilot
1401
+ progress_message('Configuring SkyPilot...')
1402
+
1403
+ # The env var KUBECONFIG ensures sky check uses the right kubeconfig
1404
+ os.environ['KUBECONFIG'] = kubeconfig_path
1405
+ run_command(['sky', 'check', 'kubernetes'], shell=False)
1406
+
1407
+ success_message('SkyPilot configured successfully.')
1408
+
1409
+ # Display final success message
1410
+ print(
1411
+ f'{GREEN}==== 🎉 Kubernetes cluster deployment completed successfully 🎉 ===={NC}'
1412
+ )
1413
+ print(
1414
+ 'You can now interact with your Kubernetes cluster through SkyPilot: ')
1415
+ print(' • List available GPUs: sky show-gpus --cloud kubernetes')
1416
+ print(
1417
+ ' • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes'
1418
+ )
1419
+ print(' • Connect to pod with VSCode: code --remote ssh-remote+devbox ')
1420
+ # Print completion marker for current cluster
1421
+ print(f'{GREEN}SKYPILOT_CLUSTER_COMPLETED: {NC}')
1422
+
1423
+ if unsuccessful_workers:
1424
+ quoted_unsuccessful_workers = [
1425
+ f'"{worker}"' for worker in unsuccessful_workers
1426
+ ]
1427
+
1428
+ print(
1429
+ f'{WARNING_YELLOW}Failed to deploy Kubernetes on the following nodes: '
1430
+ f'{", ".join(quoted_unsuccessful_workers)}. Please check '
1431
+ f'the logs for more details.{NC}')
1432
+
1433
+ return unsuccessful_workers
1434
+
1435
+
1436
+ if __name__ == '__main__':
1437
+ main()
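
As a minimal sketch of the input this script consumes: load_ssh_targets() and prepare_hosts_info() above expect ~/.sky/ssh_node_pools.yaml to map each pool name to cluster-level defaults plus a hosts list, where a host is either a plain string (an IP or SSH config alias) or a dict with an ip and optional per-host overrides. The pool name, addresses, and key paths below are illustrative placeholders, not values from the release.

import yaml

# Hypothetical pool definition; replace the name, IPs, and key paths with real ones.
example_pools = {
    'my-pool': {
        'user': 'ubuntu',                  # cluster-level default user
        'identity_file': '~/.ssh/id_rsa',  # cluster-level default key
        'hosts': [
            '10.0.0.1',                    # plain string: IP or SSH config alias
            {                              # dict form with per-host overrides
                'ip': '10.0.0.2',
                'user': 'root',
                'identity_file': '~/.ssh/other_key',
            },
        ],
    },
}

# Write the file locally; the script reads ~/.sky/ssh_node_pools.yaml by default
# (DEFAULT_SSH_NODE_POOLS_PATH above) and rejects duplicate pool names via
# UniqueKeySafeLoader.
with open('ssh_node_pools.yaml', 'w', encoding='utf-8') as f:
    yaml.dump(example_pools, f, sort_keys=False)

The first entry in hosts becomes the k3s head node and the remaining entries join as agents; with the file in place, the script can be invoked with --ssh-node-pools-file and --infra my-pool, or with --cleanup to tear the pool down.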