skypilot-nightly 1.0.0.dev20251203__py3-none-any.whl → 1.0.0.dev20260112__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (245)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +1 -61
  3. sky/adaptors/slurm.py +565 -0
  4. sky/backends/backend_utils.py +95 -12
  5. sky/backends/cloud_vm_ray_backend.py +224 -65
  6. sky/backends/task_codegen.py +380 -4
  7. sky/catalog/__init__.py +0 -3
  8. sky/catalog/data_fetchers/fetch_gcp.py +9 -1
  9. sky/catalog/data_fetchers/fetch_nebius.py +1 -1
  10. sky/catalog/data_fetchers/fetch_vast.py +4 -2
  11. sky/catalog/kubernetes_catalog.py +12 -4
  12. sky/catalog/seeweb_catalog.py +30 -15
  13. sky/catalog/shadeform_catalog.py +5 -2
  14. sky/catalog/slurm_catalog.py +236 -0
  15. sky/catalog/vast_catalog.py +30 -6
  16. sky/check.py +25 -11
  17. sky/client/cli/command.py +391 -32
  18. sky/client/interactive_utils.py +190 -0
  19. sky/client/sdk.py +64 -2
  20. sky/client/sdk_async.py +9 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +60 -2
  23. sky/clouds/azure.py +2 -0
  24. sky/clouds/cloud.py +7 -0
  25. sky/clouds/kubernetes.py +2 -0
  26. sky/clouds/runpod.py +38 -7
  27. sky/clouds/slurm.py +610 -0
  28. sky/clouds/ssh.py +3 -2
  29. sky/clouds/vast.py +39 -16
  30. sky/core.py +197 -37
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/3nu-b8raeKRNABZ2d4GAG/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/1871-0565f8975a7dcd10.js +6 -0
  34. sky/dashboard/out/_next/static/chunks/2109-55a1546d793574a7.js +11 -0
  35. sky/dashboard/out/_next/static/chunks/2521-099b07cd9e4745bf.js +26 -0
  36. sky/dashboard/out/_next/static/chunks/2755.a636e04a928a700e.js +31 -0
  37. sky/dashboard/out/_next/static/chunks/3495.05eab4862217c1a5.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/3785.cfc5dcc9434fd98c.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/3981.645d01bf9c8cad0c.js +21 -0
  41. sky/dashboard/out/_next/static/chunks/4083-0115d67c1fb57d6c.js +21 -0
  42. sky/dashboard/out/_next/static/chunks/{8640.5b9475a2d18c5416.js → 429.a58e9ba9742309ed.js} +2 -2
  43. sky/dashboard/out/_next/static/chunks/4555.8e221537181b5dc1.js +6 -0
  44. sky/dashboard/out/_next/static/chunks/4725.937865b81fdaaebb.js +6 -0
  45. sky/dashboard/out/_next/static/chunks/6082-edabd8f6092300ce.js +25 -0
  46. sky/dashboard/out/_next/static/chunks/6989-49cb7dca83a7a62d.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/6990-630bd2a2257275f8.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/7248-a99800d4db8edabd.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/754-cfc5d4ad1b843d29.js +18 -0
  50. sky/dashboard/out/_next/static/chunks/8050-dd8aa107b17dce00.js +16 -0
  51. sky/dashboard/out/_next/static/chunks/8056-d4ae1e0cb81e7368.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/8555.011023e296c127b3.js +6 -0
  53. sky/dashboard/out/_next/static/chunks/8821-93c25df904a8362b.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/8969-0662594b69432ade.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/9025.f15c91c97d124a5f.js +6 -0
  56. sky/dashboard/out/_next/static/chunks/9353-7ad6bd01858556f1.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/_app-5a86569acad99764.js +34 -0
  58. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8297476714acb4ac.js +6 -0
  59. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-337c3ba1085f1210.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{clusters-ee39056f9851a3ff.js → clusters-57632ff3684a8b5c.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  62. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-5fd3a453c079c2ea.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/infra-9f85c02c9c6cae9e.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90f16972cbecf354.js +1 -0
  65. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-2dd42fc37aad427a.js +16 -0
  66. sky/dashboard/out/_next/static/chunks/pages/jobs-ed806aeace26b972.js +1 -0
  67. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-449a9f5a3bb20fb3.js +1 -0
  68. sky/dashboard/out/_next/static/chunks/pages/users-bec34706b36f3524.js +1 -0
  69. sky/dashboard/out/_next/static/chunks/pages/{volumes-b84b948ff357c43e.js → volumes-a83ba9b38dff7ea9.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-84a40f8c7c627fe4.js → [name]-c781e9c3e52ef9fc.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces-91e0942f47310aae.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/webpack-cfe59cf684ee13b9.js +1 -0
  73. sky/dashboard/out/_next/static/css/b0dbca28f027cc19.css +3 -0
  74. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  75. sky/dashboard/out/clusters/[cluster].html +1 -1
  76. sky/dashboard/out/clusters.html +1 -1
  77. sky/dashboard/out/config.html +1 -1
  78. sky/dashboard/out/index.html +1 -1
  79. sky/dashboard/out/infra/[context].html +1 -1
  80. sky/dashboard/out/infra.html +1 -1
  81. sky/dashboard/out/jobs/[job].html +1 -1
  82. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  83. sky/dashboard/out/jobs.html +1 -1
  84. sky/dashboard/out/plugins/[...slug].html +1 -0
  85. sky/dashboard/out/users.html +1 -1
  86. sky/dashboard/out/volumes.html +1 -1
  87. sky/dashboard/out/workspace/new.html +1 -1
  88. sky/dashboard/out/workspaces/[name].html +1 -1
  89. sky/dashboard/out/workspaces.html +1 -1
  90. sky/data/data_utils.py +26 -12
  91. sky/data/mounting_utils.py +44 -5
  92. sky/global_user_state.py +111 -19
  93. sky/jobs/client/sdk.py +8 -3
  94. sky/jobs/controller.py +191 -31
  95. sky/jobs/recovery_strategy.py +109 -11
  96. sky/jobs/server/core.py +81 -4
  97. sky/jobs/server/server.py +14 -0
  98. sky/jobs/state.py +417 -19
  99. sky/jobs/utils.py +73 -80
  100. sky/models.py +11 -0
  101. sky/optimizer.py +8 -6
  102. sky/provision/__init__.py +12 -9
  103. sky/provision/common.py +20 -0
  104. sky/provision/docker_utils.py +15 -2
  105. sky/provision/kubernetes/utils.py +163 -20
  106. sky/provision/kubernetes/volume.py +52 -17
  107. sky/provision/provisioner.py +17 -7
  108. sky/provision/runpod/instance.py +3 -1
  109. sky/provision/runpod/utils.py +13 -1
  110. sky/provision/runpod/volume.py +25 -9
  111. sky/provision/slurm/__init__.py +12 -0
  112. sky/provision/slurm/config.py +13 -0
  113. sky/provision/slurm/instance.py +618 -0
  114. sky/provision/slurm/utils.py +689 -0
  115. sky/provision/vast/instance.py +4 -1
  116. sky/provision/vast/utils.py +11 -6
  117. sky/resources.py +135 -13
  118. sky/schemas/api/responses.py +4 -0
  119. sky/schemas/db/global_user_state/010_save_ssh_key.py +1 -1
  120. sky/schemas/db/spot_jobs/008_add_full_resources.py +34 -0
  121. sky/schemas/db/spot_jobs/009_job_events.py +32 -0
  122. sky/schemas/db/spot_jobs/010_job_events_timestamp_with_timezone.py +43 -0
  123. sky/schemas/db/spot_jobs/011_add_links.py +34 -0
  124. sky/schemas/generated/jobsv1_pb2.py +9 -5
  125. sky/schemas/generated/jobsv1_pb2.pyi +12 -0
  126. sky/schemas/generated/jobsv1_pb2_grpc.py +44 -0
  127. sky/schemas/generated/managed_jobsv1_pb2.py +32 -28
  128. sky/schemas/generated/managed_jobsv1_pb2.pyi +11 -2
  129. sky/serve/serve_utils.py +232 -40
  130. sky/serve/server/impl.py +1 -1
  131. sky/server/common.py +17 -0
  132. sky/server/constants.py +1 -1
  133. sky/server/metrics.py +6 -3
  134. sky/server/plugins.py +238 -0
  135. sky/server/requests/executor.py +5 -2
  136. sky/server/requests/payloads.py +30 -1
  137. sky/server/requests/request_names.py +4 -0
  138. sky/server/requests/requests.py +33 -11
  139. sky/server/requests/serializers/encoders.py +22 -0
  140. sky/server/requests/serializers/return_value_serializers.py +70 -0
  141. sky/server/server.py +506 -109
  142. sky/server/server_utils.py +30 -0
  143. sky/server/uvicorn.py +5 -0
  144. sky/setup_files/MANIFEST.in +1 -0
  145. sky/setup_files/dependencies.py +22 -9
  146. sky/sky_logging.py +2 -1
  147. sky/skylet/attempt_skylet.py +13 -3
  148. sky/skylet/constants.py +55 -13
  149. sky/skylet/events.py +10 -4
  150. sky/skylet/executor/__init__.py +1 -0
  151. sky/skylet/executor/slurm.py +187 -0
  152. sky/skylet/job_lib.py +91 -5
  153. sky/skylet/log_lib.py +22 -6
  154. sky/skylet/log_lib.pyi +8 -6
  155. sky/skylet/services.py +18 -3
  156. sky/skylet/skylet.py +5 -1
  157. sky/skylet/subprocess_daemon.py +2 -1
  158. sky/ssh_node_pools/constants.py +12 -0
  159. sky/ssh_node_pools/core.py +40 -3
  160. sky/ssh_node_pools/deploy/__init__.py +4 -0
  161. sky/{utils/kubernetes/deploy_ssh_node_pools.py → ssh_node_pools/deploy/deploy.py} +279 -504
  162. sky/ssh_node_pools/deploy/tunnel/ssh-tunnel.sh +379 -0
  163. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  164. sky/ssh_node_pools/deploy/utils.py +173 -0
  165. sky/ssh_node_pools/server.py +11 -13
  166. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  167. sky/templates/kubernetes-ray.yml.j2 +12 -6
  168. sky/templates/slurm-ray.yml.j2 +115 -0
  169. sky/templates/vast-ray.yml.j2 +1 -0
  170. sky/templates/websocket_proxy.py +18 -41
  171. sky/users/model.conf +1 -1
  172. sky/users/permission.py +85 -52
  173. sky/users/rbac.py +31 -3
  174. sky/utils/annotations.py +108 -8
  175. sky/utils/auth_utils.py +42 -0
  176. sky/utils/cli_utils/status_utils.py +19 -5
  177. sky/utils/cluster_utils.py +10 -3
  178. sky/utils/command_runner.py +389 -35
  179. sky/utils/command_runner.pyi +43 -4
  180. sky/utils/common_utils.py +47 -31
  181. sky/utils/context.py +32 -0
  182. sky/utils/db/db_utils.py +36 -6
  183. sky/utils/db/migration_utils.py +41 -21
  184. sky/utils/infra_utils.py +5 -1
  185. sky/utils/instance_links.py +139 -0
  186. sky/utils/interactive_utils.py +49 -0
  187. sky/utils/kubernetes/generate_kubeconfig.sh +42 -33
  188. sky/utils/kubernetes/kubernetes_deploy_utils.py +2 -94
  189. sky/utils/kubernetes/rsync_helper.sh +5 -1
  190. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  191. sky/utils/plugin_extensions/__init__.py +14 -0
  192. sky/utils/plugin_extensions/external_failure_source.py +176 -0
  193. sky/utils/resources_utils.py +10 -8
  194. sky/utils/rich_utils.py +9 -11
  195. sky/utils/schemas.py +93 -19
  196. sky/utils/status_lib.py +7 -0
  197. sky/utils/subprocess_utils.py +17 -0
  198. sky/volumes/client/sdk.py +6 -3
  199. sky/volumes/server/core.py +65 -27
  200. sky_templates/ray/start_cluster +8 -4
  201. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/METADATA +67 -59
  202. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/RECORD +208 -180
  203. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +0 -1
  204. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +0 -11
  205. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +0 -6
  206. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +0 -1
  207. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  208. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  209. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +0 -26
  210. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +0 -1
  211. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +0 -1
  213. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  214. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  216. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  217. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +0 -30
  221. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +0 -41
  222. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +0 -1
  223. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +0 -1
  224. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +0 -6
  225. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +0 -1
  226. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +0 -31
  227. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +0 -30
  228. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +0 -34
  229. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +0 -16
  230. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +0 -1
  231. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +0 -1
  232. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +0 -1
  233. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +0 -16
  234. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +0 -21
  235. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +0 -1
  236. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +0 -1
  237. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +0 -1
  238. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +0 -1
  239. sky/dashboard/out/_next/static/css/0748ce22df867032.css +0 -3
  240. /sky/dashboard/out/_next/static/{96_E2yl3QAiIJGOYCkSpB → 3nu-b8raeKRNABZ2d4GAG}/_ssgManifest.js +0 -0
  241. /sky/{utils/kubernetes → ssh_node_pools/deploy/tunnel}/cleanup-tunnel.sh +0 -0
  242. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/WHEEL +0 -0
  243. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/entry_points.txt +0 -0
  244. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/licenses/LICENSE +0 -0
  245. {skypilot_nightly-1.0.0.dev20251203.dist-info → skypilot_nightly-1.0.0.dev20260112.dist-info}/top_level.txt +0 -0
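Most of the source hunks below come from relocating the SSH Node Pools deployment logic from sky/utils/kubernetes/deploy_ssh_node_pools.py into the new sky/ssh_node_pools/deploy/ package. For orientation only, here is a hypothetical sketch of how the new run() entry point defined in sky/ssh_node_pools/deploy/deploy.py (see the hunks that follow) might be driven from Python; the import path and keyword arguments are taken from the diff, while the surrounding script, and calling it directly rather than through the sky CLI, are assumptions:

    # Hypothetical driver for the relocated deploy module shown in this diff.
    # Assumes SkyPilot is installed and ~/.sky/ssh_node_pools.yaml defines at
    # least one Node Pool.
    from sky.ssh_node_pools.deploy import deploy

    # Deploy the Node Pool(s) defined in ~/.sky/ssh_node_pools.yaml and merge
    # the resulting contexts into the default kubeconfig (~/.kube/config).
    deploy.run(cleanup=False)

    # Later, tear the same pools back down.
    deploy.run(cleanup=True)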
@@ -3,36 +3,24 @@
 import base64
 import concurrent.futures as cf
 import os
-import random
 import re
 import shlex
 import shutil
-import subprocess
-import sys
 import tempfile
-from typing import List, Optional, Set
+from typing import List, Optional

 import colorama
 import yaml

 from sky import sky_logging
+from sky.ssh_node_pools import constants
+from sky.ssh_node_pools import utils as ssh_utils
+from sky.ssh_node_pools.deploy import tunnel_utils
+from sky.ssh_node_pools.deploy import utils as deploy_utils
 from sky.utils import rich_utils
 from sky.utils import ux_utils
-from sky.utils.kubernetes import ssh_utils
-
-# Colors for nicer UX
-RED = '\033[0;31m'
-GREEN = '\033[0;32m'
-YELLOW = '\033[1;33m'
-WARNING_YELLOW = '\x1b[33m'
-NC = '\033[0m'  # No color
-DIM = colorama.Style.DIM
-CYAN = colorama.Fore.CYAN
-RESET_ALL = colorama.Style.RESET_ALL

-DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
-SSH_CONFIG_PATH = os.path.expanduser('~/.ssh/config')
-NODE_POOLS_INFO_DIR = os.path.expanduser('~/.sky/ssh_node_pools_info')
+RESET_ALL = colorama.Style.RESET_ALL

 # Get the directory of this script
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
@@ -40,113 +28,14 @@ SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 logger = sky_logging.init_logger(__name__)


-def run_command(cmd, shell=False, silent=False):
-    """Run a local command and return the output."""
-    process = subprocess.run(cmd,
-                             shell=shell,
-                             capture_output=True,
-                             text=True,
-                             check=False)
-    if process.returncode != 0:
-        if not silent:
-            logger.error(f'{RED}Error executing command: {cmd}{NC}\n'
-                         f'STDOUT: {process.stdout}\n'
-                         f'STDERR: {process.stderr}')
-        return None
-    return process.stdout.strip()
-
-
-def get_effective_host_ip(hostname: str) -> str:
-    """Get the effective IP for a hostname from SSH config."""
-    try:
-        result = subprocess.run(['ssh', '-G', hostname],
-                                capture_output=True,
-                                text=True,
-                                check=False)
-        if result.returncode == 0:
-            for line in result.stdout.splitlines():
-                if line.startswith('hostname '):
-                    return line.split(' ', 1)[1].strip()
-    except Exception:  # pylint: disable=broad-except
-        pass
-    return hostname  # Return the original hostname if lookup fails
-
-
-def run_remote(node,
-               cmd,
-               user='',
-               ssh_key='',
-               connect_timeout=30,
-               use_ssh_config=False,
-               print_output=False,
-               use_shell=False,
-               silent=False):
-    """Run a command on a remote machine via SSH.
-
-    silent is used for gpu checking (will show error logs when no gpus are found)"""
-    ssh_cmd: List[str]
-    if use_ssh_config:
-        # Use SSH config for connection parameters
-        ssh_cmd = ['ssh', node, cmd]
-    else:
-        # Use explicit parameters
-        ssh_cmd = [
-            'ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'IdentitiesOnly=yes',
-            '-o', f'ConnectTimeout={connect_timeout}', '-o',
-            'ServerAliveInterval=10', '-o', 'ServerAliveCountMax=3'
-        ]
-
-        if ssh_key:
-            if not os.path.isfile(ssh_key):
-                raise ValueError(f'SSH key not found: {ssh_key}')
-            ssh_cmd.extend(['-i', ssh_key])
-
-        ssh_cmd.append(f'{user}@{node}' if user else node)
-        ssh_cmd.append(cmd)
-
-    subprocess_cmd = ' '.join(ssh_cmd) if use_shell else ssh_cmd
-    process = subprocess.run(subprocess_cmd,
-                             capture_output=True,
-                             text=True,
-                             check=False,
-                             shell=use_shell)
-    if process.returncode != 0:
-        if not silent:
-            logger.error(f'{RED}Error executing command {cmd} on {node}:{NC} '
-                         f'{process.stderr}')
-        return None
-    if print_output:
-        logger.info(process.stdout)
-    return process.stdout.strip()
-
-
-def create_askpass_script(password):
-    """Create an askpass script block for sudo with password."""
-    if not password:
-        return ''
-
-    return f"""
-    # Create temporary askpass script
-    ASKPASS_SCRIPT=$(mktemp)
-    trap 'rm -f $ASKPASS_SCRIPT' EXIT INT TERM ERR QUIT
-    cat > $ASKPASS_SCRIPT << EOF
-#!/bin/bash
-echo {password}
-EOF
-    chmod 700 $ASKPASS_SCRIPT
-    # Use askpass
-    export SUDO_ASKPASS=$ASKPASS_SCRIPT
-    """
-
-
 def progress_message(message):
     """Show a progress message."""
-    logger.info(f'{YELLOW}➜ {message}{NC}')
+    logger.info(f'{colorama.Fore.YELLOW}➜ {message}{RESET_ALL}')


 def success_message(message):
     """Show a success message."""
-    logger.info(f'{GREEN}✔ {message}{NC}')
+    logger.info(f'{colorama.Fore.GREEN}✔ {message}{RESET_ALL}')


 def force_update_status(message):
@@ -154,283 +43,61 @@ def force_update_status(message):
     rich_utils.force_update_status(ux_utils.spinner_message(message))


-def cleanup_server_node(node,
-                        user,
-                        ssh_key,
-                        askpass_block,
-                        use_ssh_config=False):
-    """Uninstall k3s and clean up the state on a server node."""
-    force_update_status(f'Cleaning up head node ({node})...')
-    cmd = f"""
-    {askpass_block}
-    echo 'Uninstalling k3s...' &&
-    sudo -A /usr/local/bin/k3s-uninstall.sh || true &&
-    sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
-    """
-    result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
-    if result is None:
-        logger.error(f'{RED}Failed to clean up head node ({node}).{NC}')
-    else:
-        success_message(f'Node {node} cleaned up successfully.')
+def run(cleanup: bool = False,
+        infra: Optional[str] = None,
+        kubeconfig_path: str = constants.DEFAULT_KUBECONFIG_PATH):
+    """Deploy a Kubernetes cluster on SSH targets.

+    This function reads ~/.sky/ssh_node_pools.yaml and uses it to deploy a
+    Kubernetes cluster on the specified machines.

-def cleanup_agent_node(node,
-                       user,
-                       ssh_key,
-                       askpass_block,
-                       use_ssh_config=False):
-    """Uninstall k3s and clean up the state on an agent node."""
-    force_update_status(f'Cleaning up worker node ({node})...')
-    cmd = f"""
-    {askpass_block}
-    echo 'Uninstalling k3s...' &&
-    sudo -A /usr/local/bin/k3s-agent-uninstall.sh || true &&
-    sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
+    Args:
+        cleanup: Whether to clean up the cluster instead of deploying.
+        infra: Name of the cluster in ssh_node_pools.yaml to use.
+            If None, the first cluster in the file will be used.
+        kubeconfig_path: Path to save the Kubernetes configuration file.
+            If None, the default ~/.kube/config will be used.
     """
-    result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
-    if result is None:
-        logger.error(f'{RED}Failed to clean up worker node ({node}).{NC}')
-    else:
-        success_message(f'Node {node} cleaned up successfully.')
-
-
-def start_agent_node(node,
-                     master_addr,
-                     k3s_token,
-                     user,
-                     ssh_key,
-                     askpass_block,
-                     use_ssh_config=False):
-    """Start a k3s agent node.
-    Returns: if the start is successful, and if the node has a GPU."""
-    logger.info(f'Deploying worker node ({node}).')
-    cmd = f"""
-    {askpass_block}
-    curl -sfL https://get.k3s.io | K3S_NODE_NAME={node} INSTALL_K3S_EXEC='agent --node-label skypilot-ip={node}' \
-    K3S_URL=https://{master_addr}:6443 K3S_TOKEN={k3s_token} sudo -E -A sh -
-    """
-    result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
-    if result is None:
-        logger.error(
-            f'{RED}✗ Failed to deploy K3s on worker node ({node}).{NC}')
-        return node, False, False
-    success_message(
-        f'SkyPilot runtime successfully deployed on worker node ({node}).')
-    # Check if worker node has a GPU
-    if check_gpu(node, user, ssh_key, use_ssh_config=use_ssh_config):
-        logger.info(f'{YELLOW}GPU detected on worker node ({node}).{NC}')
-        return node, True, True
-    return node, True, False
-
-
-def check_gpu(node, user, ssh_key, use_ssh_config=False):
-    """Check if a node has a GPU."""
-    cmd = 'command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null'
-    result = run_remote(node,
-                        cmd,
-                        user,
-                        ssh_key,
-                        use_ssh_config=use_ssh_config,
-                        silent=True)
-    return result is not None
+    deploy_utils.check_ssh_cluster_dependencies()
+    action = 'Cleanup' if cleanup else 'Deployment'
+    msg_str = f'Initializing SSH Node Pools {action}...'

+    with rich_utils.safe_status(ux_utils.spinner_message(msg_str)):
+        try:
+            deploy_multiple_clusters(infra=infra,
+                                     cleanup=cleanup,
+                                     kubeconfig_path=kubeconfig_path)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error(str(e))
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(
+                    'Failed to deploy SkyPilot on some Node Pools.') from e

-def ensure_directory_exists(path):
-    """Ensure the directory for the specified file path exists."""
-    directory = os.path.dirname(path)
-    if directory and not os.path.exists(directory):
-        os.makedirs(directory, exist_ok=True)
-
-
-def get_used_localhost_ports() -> Set[int]:
-    """Get SSH port forwardings already in use on localhost"""
-    used_ports = set()
-
-    # Get ports from netstat (works on macOS and Linux)
-    try:
-        if sys.platform == 'darwin':
-            # macOS
-            result = subprocess.run(['netstat', '-an', '-p', 'tcp'],
-                                    capture_output=True,
-                                    text=True,
-                                    check=False)
-        else:
-            # Linux and other Unix-like systems
-            result = subprocess.run(['netstat', '-tln'],
-                                    capture_output=True,
-                                    text=True,
-                                    check=False)
-
-        if result.returncode == 0:
-            # Look for lines with 'localhost:<port>' or '127.0.0.1:<port>'
-            for line in result.stdout.splitlines():
-                if '127.0.0.1:' in line or 'localhost:' in line:
-                    match = re.search(r':(64\d\d)\s', line)
-                    if match:
-                        port = int(match.group(1))
-                        if 6400 <= port <= 6500:  # Only consider our range
-                            used_ports.add(port)
-    except (subprocess.SubprocessError, FileNotFoundError):
-        # If netstat fails, try another approach
-        pass
-
-    # Also check ports from existing kubeconfig entries
-    try:
-        result = subprocess.run([
-            'kubectl', 'config', 'view', '-o',
-            'jsonpath=\'{.clusters[*].cluster.server}\''
-        ],
-                                capture_output=True,
-                                text=True,
-                                check=False)
-
-        if result.returncode == 0:
-            # Look for localhost URLs with ports
-            for url in result.stdout.split():
-                if 'localhost:' in url or '127.0.0.1:' in url:
-                    match = re.search(r':(\d+)', url)
-                    if match:
-                        port = int(match.group(1))
-                        if 6400 <= port <= 6500:  # Only consider our range
-                            used_ports.add(port)
-    except subprocess.SubprocessError:
-        pass
-
-    return used_ports
-
-
-def get_available_port(start: int = 6443, end: int = 6499) -> int:
-    """Get an available port in the given range that's not used by other tunnels"""
-    used_ports = get_used_localhost_ports()
-
-    # Try to use port 6443 first if available for the first cluster
-    if start == 6443 and start not in used_ports:
-        return start
-
-    # Otherwise find any available port in the range
-    available_ports = list(set(range(start, end + 1)) - used_ports)
-
-    if not available_ports:
-        # If all ports are used, pick a random one from our range
-        # (we'll terminate any existing connection in the setup)
-        return random.randint(start, end)
-
-    # Sort to get deterministic allocation
-    available_ports.sort()
-    return available_ports[0]
-
-
-def setup_kubectl_ssh_tunnel(head_node,
-                             ssh_user,
-                             ssh_key,
-                             context_name,
-                             use_ssh_config=False):
-    """Set up kubeconfig exec credential plugin for SSH tunnel"""
-    progress_message('Setting up SSH tunnel for Kubernetes API access...')
-
-    # Get an available port for this cluster
-    port = get_available_port()
-
-    # Paths to scripts
-    tunnel_script = os.path.join(SCRIPT_DIR, 'ssh-tunnel.sh')
-
-    # Make sure scripts are executable
-    os.chmod(tunnel_script, 0o755)
-
-    # Certificate files
-    client_cert_file = os.path.join(NODE_POOLS_INFO_DIR,
-                                    f'{context_name}-cert.pem')
-    client_key_file = os.path.join(NODE_POOLS_INFO_DIR,
-                                   f'{context_name}-key.pem')
-
-    # Update kubeconfig to use localhost with the selected port
-    run_command([
-        'kubectl', 'config', 'set-cluster', context_name,
-        f'--server=https://127.0.0.1:{port}', '--insecure-skip-tls-verify=true'
-    ])
-
-    # Build the exec args list based on auth method
-    exec_args = [
-        '--exec-command', tunnel_script, '--exec-api-version',
-        'client.authentication.k8s.io/v1beta1'
-    ]
-
-    # Set credential TTL to force frequent tunnel checks
-    ttl_seconds = 30
-
-    # Verify if we have extracted certificate data files
-    has_cert_files = os.path.isfile(client_cert_file) and os.path.isfile(
-        client_key_file)
-    if has_cert_files:
+    # Add empty line for ux-purposes.
+    logger.info('')
+    if cleanup:
         logger.info(
-            f'{GREEN}Client certificate data extracted and will be used for authentication{NC}'
-        )
-
-    if use_ssh_config:
-        run_command(
-            ['kubectl', 'config', 'set-credentials', context_name] + exec_args +
-            [
-                '--exec-arg=--context', f'--exec-arg={context_name}',
-                '--exec-arg=--port', f'--exec-arg={port}', '--exec-arg=--ttl',
-                f'--exec-arg={ttl_seconds}', '--exec-arg=--use-ssh-config',
-                '--exec-arg=--host', f'--exec-arg={head_node}'
-            ])
+            ux_utils.finishing_message(
+                '🎉 SSH Node Pools cleaned up successfully.'))
     else:
-        run_command(['kubectl', 'config', 'set-credentials', context_name] +
-                    exec_args + [
-                        '--exec-arg=--context', f'--exec-arg={context_name}',
-                        '--exec-arg=--port', f'--exec-arg={port}',
-                        '--exec-arg=--ttl', f'--exec-arg={ttl_seconds}',
-                        '--exec-arg=--host', f'--exec-arg={head_node}',
-                        '--exec-arg=--user', f'--exec-arg={ssh_user}',
-                        '--exec-arg=--ssh-key', f'--exec-arg={ssh_key}'
-                    ])
-
-    success_message(
-        f'SSH tunnel configured through kubectl credential plugin on port {port}'
-    )
-    logger.info(
-        f'{GREEN}Your kubectl connection is now tunneled through SSH (port {port}).{NC}'
-    )
-    logger.info(
-        f'{GREEN}This tunnel will be automatically established when needed.{NC}'
-    )
-    logger.info(
-        f'{GREEN}Credential TTL set to {ttl_seconds}s to ensure tunnel health is checked frequently.{NC}'
-    )
-
-    return port
-
-
-def cleanup_kubectl_ssh_tunnel(cluster_name, context_name):
-    """Clean up the SSH tunnel for a specific context"""
-    progress_message(f'Cleaning up SSH tunnel for `{cluster_name}`...')
-
-    # Path to cleanup script
-    cleanup_script = os.path.join(SCRIPT_DIR, 'cleanup-tunnel.sh')
-
-    # Make sure script is executable
-    if os.path.exists(cleanup_script):
-        os.chmod(cleanup_script, 0o755)
-
-        # Run the cleanup script
-        subprocess.run([cleanup_script, context_name],
-                       stdout=subprocess.DEVNULL,
-                       stderr=subprocess.DEVNULL,
-                       check=False)
-
-        success_message(f'SSH tunnel for `{cluster_name}` cleaned up.')
-    else:
-        logger.error(f'{YELLOW}Cleanup script not found: {cleanup_script}{NC}')
+        logger.info(
+            ux_utils.finishing_message(
+                '🎉 SSH Node Pools set up successfully. ',
+                follow_up_message=(
+                    f'Run `{colorama.Style.BRIGHT}'
+                    f'sky check ssh'
+                    f'{colorama.Style.RESET_ALL}` to verify access, '
+                    f'`{colorama.Style.BRIGHT}sky launch --infra ssh'
+                    f'{colorama.Style.RESET_ALL}` to launch a cluster.')))


-def deploy_clusters(
+def deploy_multiple_clusters(
         infra: Optional[str],
-        ssh_node_pools_file: str = ssh_utils.DEFAULT_SSH_NODE_POOLS_PATH,
-        kubeconfig_path: Optional[str] = None,
+        ssh_node_pools_file: str = constants.DEFAULT_SSH_NODE_POOLS_PATH,
+        kubeconfig_path: str = constants.DEFAULT_KUBECONFIG_PATH,
         cleanup: bool = True):

-    kubeconfig_path = kubeconfig_path or DEFAULT_KUBECONFIG_PATH
+    kubeconfig_path = kubeconfig_path or constants.DEFAULT_KUBECONFIG_PATH
     kubeconfig_path = os.path.expanduser(kubeconfig_path)

     failed_clusters = []
@@ -445,7 +112,7 @@ def deploy_clusters(
     num_clusters = len(clusters_config)
     cluster_names = list(clusters_config.keys())
     cluster_info = f'Found {num_clusters} Node Pool{"s" if num_clusters > 1 else ""}: {", ".join(cluster_names)}'
-    logger.info(f'{colorama.Fore.CYAN}{cluster_info}{colorama.Style.RESET_ALL}')
+    logger.info(f'{colorama.Fore.CYAN}{cluster_info}{RESET_ALL}')

     # Process each cluster
     for cluster_name, cluster_config in clusters_config.items():
@@ -457,15 +124,15 @@ def deploy_clusters(

         if not hosts_info:
             logger.warning(
-                f'{RED}Error: No valid hosts found for cluster {cluster_name!r}. Skipping.{NC}'
-            )
+                f'{colorama.Fore.RED}Error: No valid hosts found '
+                f'for cluster {cluster_name!r}. Skipping.{RESET_ALL}')
             continue

         context_name = f'ssh-{cluster_name}'

         # Check cluster history
-        os.makedirs(NODE_POOLS_INFO_DIR, exist_ok=True)
-        history_yaml_file = os.path.join(NODE_POOLS_INFO_DIR,
+        os.makedirs(constants.NODE_POOLS_INFO_DIR, exist_ok=True)
+        history_yaml_file = os.path.join(constants.NODE_POOLS_INFO_DIR,
                                          f'{context_name}-history.yaml')

         history = None
@@ -517,7 +184,7 @@ def deploy_clusters(
         password = head_host['password']

         # Deploy this cluster
-        unsuccessful_workers = deploy_cluster(
+        unsuccessful_workers = deploy_single_cluster(
             cluster_name,
             head_node,
             worker_nodes,
@@ -556,67 +223,70 @@ def deploy_clusters(
         except Exception as e:  # pylint: disable=broad-except
             reason = str(e)
             failed_clusters.append((cluster_name, reason))
+            action = 'cleaning' if cleanup else 'deploying'
             logger.debug(
-                f'Error deploying SSH Node Pool `{cluster_name}`: {reason}')
+                f'Error {action} SSH Node Pool `{cluster_name}`: {reason}')

     if failed_clusters:
         action = 'clean' if cleanup else 'deploy'
-        msg = f'{GREEN}Successfully {action}ed {len(successful_clusters)} cluster(s) ({", ".join(successful_clusters)}). {NC}'
-        msg += f'{RED}Failed to {action} {len(failed_clusters)} cluster(s): {NC}'
+        msg = f'{colorama.Fore.GREEN}Successfully {action}ed {len(successful_clusters)} cluster(s) ({", ".join(successful_clusters)}). {RESET_ALL}'
+        msg += f'{colorama.Fore.RED}Failed to {action} {len(failed_clusters)} cluster(s): {RESET_ALL}'
         for cluster_name, reason in failed_clusters:
             msg += f'\n {cluster_name}: {reason}'
         raise RuntimeError(msg)


-def deploy_cluster(cluster_name,
-                   head_node,
-                   worker_nodes,
-                   ssh_user,
-                   ssh_key,
-                   context_name,
-                   password,
-                   head_use_ssh_config,
-                   worker_use_ssh_config,
-                   kubeconfig_path,
-                   cleanup,
-                   worker_hosts=None,
-                   history_worker_nodes=None,
-                   history_workers_info=None,
-                   history_use_ssh_config=None) -> List[str]:
+def deploy_single_cluster(cluster_name,
+                          head_node,
+                          worker_nodes,
+                          ssh_user,
+                          ssh_key,
+                          context_name,
+                          password,
+                          head_use_ssh_config,
+                          worker_use_ssh_config,
+                          kubeconfig_path,
+                          cleanup,
+                          worker_hosts=None,
+                          history_worker_nodes=None,
+                          history_workers_info=None,
+                          history_use_ssh_config=None) -> List[str]:
     """Deploy or clean up a single Kubernetes cluster.

     Returns: List of unsuccessful worker nodes.
     """
-    history_yaml_file = os.path.join(NODE_POOLS_INFO_DIR,
+    history_yaml_file = os.path.join(constants.NODE_POOLS_INFO_DIR,
                                      f'{context_name}-history.yaml')
-    cert_file_path = os.path.join(NODE_POOLS_INFO_DIR,
+    cert_file_path = os.path.join(constants.NODE_POOLS_INFO_DIR,
                                   f'{context_name}-cert.pem')
-    key_file_path = os.path.join(NODE_POOLS_INFO_DIR, f'{context_name}-key.pem')
-    tunnel_log_file_path = os.path.join(NODE_POOLS_INFO_DIR,
+    key_file_path = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                 f'{context_name}-key.pem')
+    tunnel_log_file_path = os.path.join(constants.NODE_POOLS_INFO_DIR,
                                         f'{context_name}-tunnel.log')

     # Generate the askpass block if password is provided
     askpass_block = create_askpass_script(password)

     # Token for k3s
-    k3s_token = 'mytoken'  # Any string can be used as the token
+    # TODO (kyuds): make this configurable?
+    k3s_token = constants.K3S_TOKEN

     # Pre-flight checks
     logger.info(f'Checking SSH connection to head node ({head_node})...')
-    result = run_remote(head_node,
-                        f'echo \'SSH connection successful ({head_node})\'',
-                        ssh_user,
-                        ssh_key,
-                        use_ssh_config=head_use_ssh_config)
-    if result.startswith('SSH connection successful'):
-        success_message(f'SSH connection established to head node {head_node}.')
-
-    if not cleanup and result is None:
+    result = deploy_utils.run_remote(
+        head_node,
+        f'echo \'SSH connection successful ({head_node})\'',
+        ssh_user,
+        ssh_key,
+        use_ssh_config=head_use_ssh_config)
+    if result is None:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(
                 f'Failed to SSH to head node ({head_node}). '
                 f'Please check the SSH configuration and logs for more details.'
             )
+    elif result.startswith('SSH connection successful'):
+        success_message(f'SSH connection established to head node {head_node}.')

     # Checking history
     history_exists = (history_worker_nodes is not None and
@@ -670,47 +340,58 @@ def deploy_cluster(cluster_name,
         ))

     # Clean up head node
-    cleanup_server_node(head_node,
-                        ssh_user,
-                        ssh_key,
-                        askpass_block,
-                        use_ssh_config=head_use_ssh_config)
+    cleanup_node(head_node,
+                 ssh_user,
+                 ssh_key,
+                 askpass_block,
+                 use_ssh_config=head_use_ssh_config,
+                 is_worker=False)
     # Clean up worker nodes
     force_update_status(f'Cleaning up worker nodes [{cluster_name}]')
     with cf.ThreadPoolExecutor() as executor:
-        executor.map(lambda kwargs: cleanup_agent_node(**kwargs),
+        executor.map(lambda kwargs: cleanup_node(**kwargs),
                      worker_nodes_to_cleanup)

     with cf.ThreadPoolExecutor() as executor:
-        executor.map(lambda cmd: run_command(cmd, shell=True),
+        executor.map(lambda cmd: deploy_utils.run_command(cmd, shell=True),
                      remove_worker_cmds)

     if cleanup:
-
         # Remove the context from local kubeconfig if it exists
         if os.path.isfile(kubeconfig_path):
             logger.debug(
                 f'Removing context {context_name!r} from local kubeconfig...')
-            run_command(['kubectl', 'config', 'delete-context', context_name],
-                        shell=False)
-            run_command(['kubectl', 'config', 'delete-cluster', context_name],
-                        shell=False)
-            run_command(['kubectl', 'config', 'delete-user', context_name],
-                        shell=False)
+            deploy_utils.run_command(
+                ['kubectl', 'config', 'delete-context', context_name],
+                shell=False,
+                silent=True)
+            deploy_utils.run_command(
+                ['kubectl', 'config', 'delete-cluster', context_name],
+                shell=False,
+                silent=True)
+            deploy_utils.run_command(
+                ['kubectl', 'config', 'delete-user', context_name],
+                shell=False,
+                silent=True)

             # Update the current context to the first available context
-            contexts = run_command([
+            contexts = deploy_utils.run_command([
                 'kubectl', 'config', 'view', '-o',
                 'jsonpath=\'{.contexts[0].name}\''
             ],
-                                   shell=False)
+                                                shell=False,
+                                                silent=True)
             if contexts:
-                run_command(['kubectl', 'config', 'use-context', contexts],
-                            shell=False)
+                deploy_utils.run_command(
+                    ['kubectl', 'config', 'use-context', contexts],
+                    shell=False,
+                    silent=True)
             else:
                 # If no context is available, simply unset the current context
-                run_command(['kubectl', 'config', 'unset', 'current-context'],
-                            shell=False)
+                deploy_utils.run_command(
+                    ['kubectl', 'config', 'unset', 'current-context'],
+                    shell=False,
+                    silent=True)

         logger.debug(
             f'Context {context_name!r} removed from local kubeconfig.')
@@ -721,7 +402,7 @@ def deploy_cluster(cluster_name,

         # Clean up SSH tunnel after clean up kubeconfig, because the kubectl
         # will restart the ssh tunnel if it's not running.
-        cleanup_kubectl_ssh_tunnel(cluster_name, context_name)
+        tunnel_utils.cleanup_kubectl_ssh_tunnel(cluster_name, context_name)

         success_message(f'Node Pool `{cluster_name}` cleaned up successfully.')
         return []
@@ -735,12 +416,12 @@ def deploy_cluster(cluster_name,
           '/etc/ssh/sshd_config && sudo systemctl restart sshd && '
           f'echo "Successfully enabled TCP Forwarding on head node ({head_node})."; '
           'fi')
-    result = run_remote(head_node,
-                        shlex.quote(cmd),
-                        ssh_user,
-                        ssh_key,
-                        use_ssh_config=head_use_ssh_config,
-                        use_shell=True)
+    result = deploy_utils.run_remote(head_node,
+                                     shlex.quote(cmd),
+                                     ssh_user,
+                                     ssh_key,
+                                     use_ssh_config=head_use_ssh_config,
+                                     use_shell=True)
     if result is None:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(
@@ -749,10 +430,9 @@ def deploy_cluster(cluster_name,

     # Get effective IP for master node if using SSH config - needed for workers to connect
     if head_use_ssh_config:
-        effective_master_ip = get_effective_host_ip(head_node)
-        logger.info(
-            f'{GREEN}Resolved head node {head_node} to {effective_master_ip} from SSH config{NC}'
-        )
+        effective_master_ip = deploy_utils.get_effective_host_ip(head_node)
+        logger.info(f'{colorama.Fore.GREEN}Resolved head node {head_node} '
+                    f'to {effective_master_ip} from SSH config{RESET_ALL}')
     else:
         effective_master_ip = head_node

@@ -780,11 +460,11 @@ def deploy_cluster(cluster_name,
     exit 1
     fi
     """
-    result = run_remote(head_node,
-                        cmd,
-                        ssh_user,
-                        ssh_key,
-                        use_ssh_config=head_use_ssh_config)
+    result = deploy_utils.run_remote(head_node,
+                                     cmd,
+                                     ssh_user,
+                                     ssh_key,
+                                     use_ssh_config=head_use_ssh_config)
     if result is None:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(
@@ -794,19 +474,19 @@ def deploy_cluster(cluster_name,

     # Check if head node has a GPU
     install_gpu = False
-    if check_gpu(head_node,
-                 ssh_user,
-                 ssh_key,
-                 use_ssh_config=head_use_ssh_config):
-        logger.info(f'{YELLOW}GPU detected on head node ({head_node}).{NC}')
+    if deploy_utils.check_gpu(head_node,
+                              ssh_user,
+                              ssh_key,
+                              use_ssh_config=head_use_ssh_config,
+                              is_head=True):
         install_gpu = True

     # Fetch the head node's internal IP (this will be passed to worker nodes)
-    master_addr = run_remote(head_node,
-                             'hostname -I | awk \'{print $1}\'',
-                             ssh_user,
-                             ssh_key,
-                             use_ssh_config=head_use_ssh_config)
+    master_addr = deploy_utils.run_remote(head_node,
+                                          'hostname -I | awk \'{print $1}\'',
+                                          ssh_user,
+                                          ssh_key,
+                                          use_ssh_config=head_use_ssh_config)
     if master_addr is None:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(f'Failed to SSH to head node ({head_node}). '
@@ -824,7 +504,7 @@ def deploy_cluster(cluster_name,
                i] in history_workers_info:
            logger.info(
                f'{colorama.Style.DIM}✔ SkyPilot runtime already deployed on worker node {node}. '
-                f'Skipping...{colorama.Style.RESET_ALL}')
+                f'Skipping...{RESET_ALL}')
            return node, True, False
        worker_user = worker_hosts[i]['user']
        worker_key = worker_hosts[i]['identity_file']
@@ -881,10 +561,10 @@ def deploy_cluster(cluster_name,
             'IdentitiesOnly=yes', '-i', ssh_key,
             f'{ssh_user}@{head_node}:~/.kube/config', temp_kubeconfig
         ]
-        run_command(scp_cmd, shell=False)
+        deploy_utils.run_command(scp_cmd, shell=False)

         # Create the directory for the kubeconfig file if it doesn't exist
-        ensure_directory_exists(kubeconfig_path)
+        deploy_utils.ensure_directory_exists(kubeconfig_path)

         # Create empty kubeconfig if it doesn't exist
         if not os.path.isfile(kubeconfig_path):
@@ -993,10 +673,12 @@ def deploy_cluster(cluster_name,
                        )
                    else:
                        logger.error(
-                            f'{RED}Error: Certificate file is empty{NC}')
+                            f'{colorama.Fore.RED}Error: '
+                            f'Certificate file is empty{RESET_ALL}')
                except Exception as e:  # pylint: disable=broad-except
-                    logger.error(
-                        f'{RED}Error processing certificate data: {e}{NC}')
+                    logger.error(f'{colorama.Fore.RED}'
+                                 f'Error processing certificate data: {e}'
+                                 f'{RESET_ALL}')

                if client_key_data:
                    # Decode base64 data and save as PEM
@@ -1077,28 +759,34 @@ def deploy_cluster(cluster_name,
                            'Warning: Key may not be in proper PEM format'
                        )
                    else:
-                        logger.error(f'{RED}Error: Key file is empty{NC}')
+                        logger.error(f'{colorama.Fore.RED}Error: '
+                                     f'Key file is empty{RESET_ALL}')
                except Exception as e:  # pylint: disable=broad-except
-                    logger.error(f'{RED}Error processing key data: {e}{NC}')
+                    logger.error(f'{colorama.Fore.RED}'
+                                 f'Error processing key data: {e}'
+                                 f'{RESET_ALL}')

        # First check if context name exists and delete it if it does
        # TODO(romilb): Should we throw an error here instead?
-        run_command(['kubectl', 'config', 'delete-context', context_name],
-                    shell=False,
-                    silent=True)
-        run_command(['kubectl', 'config', 'delete-cluster', context_name],
-                    shell=False,
-                    silent=True)
-        run_command(['kubectl', 'config', 'delete-user', context_name],
-                    shell=False,
-                    silent=True)
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'delete-context', context_name],
+            shell=False,
+            silent=True)
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'delete-cluster', context_name],
+            shell=False,
+            silent=True)
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'delete-user', context_name],
+            shell=False,
+            silent=True)

        # Merge the configurations using kubectl
        merged_config = os.path.join(temp_dir, 'merged_config')
        os.environ['KUBECONFIG'] = f'{kubeconfig_path}:{modified_config}'
        with open(merged_config, 'w', encoding='utf-8') as merged_file:
            kubectl_cmd = ['kubectl', 'config', 'view', '--flatten']
-            result = run_command(kubectl_cmd, shell=False)
+            result = deploy_utils.run_command(kubectl_cmd, shell=False)
            if result:
                merged_file.write(result)

@@ -1106,15 +794,17 @@ def deploy_cluster(cluster_name,
        shutil.move(merged_config, kubeconfig_path)

        # Set the new context as the current context
-        run_command(['kubectl', 'config', 'use-context', context_name],
-                    shell=False)
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'use-context', context_name],
+            shell=False,
+            silent=True)

    # Always set up SSH tunnel since we assume only port 22 is accessible
-    setup_kubectl_ssh_tunnel(head_node,
-                             ssh_user,
-                             ssh_key,
-                             context_name,
-                             use_ssh_config=head_use_ssh_config)
+    tunnel_utils.setup_kubectl_ssh_tunnel(head_node,
+                                          ssh_user,
+                                          ssh_key,
+                                          context_name,
+                                          use_ssh_config=head_use_ssh_config)

    logger.debug(f'kubectl configured with new context \'{context_name}\'.')
    success_message(f'SkyPilot runtime is up [{cluster_name}].')
@@ -1144,13 +834,14 @@ def deploy_cluster(cluster_name,
        done
        echo 'GPU operator installed successfully.'
        """
-        result = run_remote(head_node,
-                            cmd,
-                            ssh_user,
-                            ssh_key,
-                            use_ssh_config=head_use_ssh_config)
+        result = deploy_utils.run_remote(head_node,
+                                         cmd,
+                                         ssh_user,
+                                         ssh_key,
+                                         use_ssh_config=head_use_ssh_config)
        if result is None:
-            logger.error(f'{RED}Failed to install GPU Operator.{NC}')
+            logger.error(f'{colorama.Fore.RED}Failed to install GPU Operator.'
+                         f'{RESET_ALL}')
        else:
            success_message('GPU Operator installed.')
    else:
@@ -1158,7 +849,7 @@ def deploy_cluster(cluster_name,

    # The env var KUBECONFIG ensures sky check uses the right kubeconfig
    os.environ['KUBECONFIG'] = kubeconfig_path
-    run_command(['sky', 'check', 'ssh'], shell=False)
+    deploy_utils.run_command(['sky', 'check', 'ssh'], shell=False)

    success_message('SkyPilot configured successfully.')

@@ -1167,11 +858,95 @@ def deploy_cluster(cluster_name,
            f'"{worker}"' for worker in unsuccessful_workers
        ]

-        logger.info(
-            f'{WARNING_YELLOW}Failed to deploy Kubernetes on the following nodes: '
-            f'{", ".join(quoted_unsuccessful_workers)}. Please check '
-            f'the logs for more details.{NC}')
+        logger.info(f'{colorama.Fore.YELLOW}'
+                    'Failed to deploy Kubernetes on the following nodes: '
+                    f'{", ".join(quoted_unsuccessful_workers)}. Please check '
+                    f'the logs for more details.{RESET_ALL}')
    else:
        success_message(f'Node Pool `{cluster_name}` deployed successfully.')

    return unsuccessful_workers
+
+
+def create_askpass_script(password):
+    """Create an askpass script block for sudo with password."""
+    if not password:
+        return ''
+
+    return f"""
+    # Create temporary askpass script
+    ASKPASS_SCRIPT=$(mktemp)
+    trap 'rm -f $ASKPASS_SCRIPT' EXIT INT TERM ERR QUIT
+    cat > $ASKPASS_SCRIPT << EOF
+#!/bin/bash
+echo {password}
+EOF
+    chmod 700 $ASKPASS_SCRIPT
+    # Use askpass
+    export SUDO_ASKPASS=$ASKPASS_SCRIPT
+    """
+
+
+def cleanup_node(node,
+                 user,
+                 ssh_key,
+                 askpass_block,
+                 use_ssh_config=False,
+                 is_worker=True):
+    """Uninstall k3s and clean up the state on a node."""
+    ntype = 'worker' if is_worker else 'head'
+    force_update_status(f'Cleaning up {ntype} node ({node})...')
+    script = f'k3s{"-agent" if is_worker else ""}-uninstall.sh'
+    cmd = f"""
+    {askpass_block}
+    echo 'Uninstalling k3s...' &&
+    sudo -A /usr/local/bin/{script} || true &&
+    sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
+    """
+    result = deploy_utils.run_remote(node,
+                                     cmd,
+                                     user,
+                                     ssh_key,
+                                     use_ssh_config=use_ssh_config)
+    if result is None:
+        logger.error(f'{colorama.Fore.RED}Failed to clean up {ntype} '
+                     f'node ({node}).{RESET_ALL}')
+    else:
+        success_message(f'Node {node} cleaned up successfully.')
+
+
+def start_agent_node(node,
+                     master_addr,
+                     k3s_token,
+                     user,
+                     ssh_key,
+                     askpass_block,
+                     use_ssh_config=False):
+    """Start a k3s agent node.
+    Returns: if the start is successful, and whether the node has a GPU."""
+    logger.info(f'Deploying worker node ({node}).')
+    cmd = f"""
+    {askpass_block}
+    curl -sfL https://get.k3s.io | K3S_NODE_NAME={node} INSTALL_K3S_EXEC='agent --node-label skypilot-ip={node}' \
+    K3S_URL=https://{master_addr}:6443 K3S_TOKEN={k3s_token} sudo -E -A sh -
+    """
+    result = deploy_utils.run_remote(node,
+                                     cmd,
+                                     user,
+                                     ssh_key,
+                                     use_ssh_config=use_ssh_config)
+    if result is None:
+        logger.error(f'{colorama.Fore.RED}✗ Failed to deploy K3s on worker '
+                     f'node ({node}).{RESET_ALL}')
+        return node, False, False
+    success_message(
+        f'SkyPilot runtime successfully deployed on worker node ({node}).')
+    # Check if worker node has a GPU
+    if deploy_utils.check_gpu(node,
+                              user,
+                              ssh_key,
+                              use_ssh_config=use_ssh_config):
+        logger.info(f'{colorama.Fore.YELLOW}GPU detected on worker node '
+                    f'({node}).{RESET_ALL}')
+        return node, True, True
+    return node, True, False