skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299) hide show
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/authentication.py CHANGED
@@ -12,14 +12,12 @@ in ray yaml config as input,
12
12
  2. Setup the `authorized_keys` on the remote VM with the public key content,
13
13
  by cloud-init or directly using cloud provider's API.
14
14
 
15
- The local machine's public key should not be uploaded to the
16
- `~/.ssh/sky-key.pub` on the remote VM, because it will cause private/public
17
- key pair mismatch when the user tries to launch new VM from that remote VM
18
- using SkyPilot, e.g., the node is used as a jobs controller. (Lambda cloud
19
- is an exception, due to the limitation of the cloud provider. See the
20
- comments in setup_lambda_authentication)
15
+ The local machine's public key should not be uploaded to the remote VM, because
16
+ it will cause private/public key pair mismatch when the user tries to launch new
17
+ VM from that remote VM using SkyPilot, e.g., the node is used as a jobs
18
+ controller. (Lambda cloud is an exception, due to the limitation of the cloud
19
+ provider. See the comments in setup_lambda_authentication)
21
20
  """
22
- import base64
23
21
  import copy
24
22
  import functools
25
23
  import os
@@ -44,10 +42,12 @@ from sky.adaptors import gcp
44
42
  from sky.adaptors import ibm
45
43
  from sky.adaptors import kubernetes
46
44
  from sky.adaptors import runpod
47
- from sky.clouds.utils import lambda_utils
45
+ from sky.adaptors import vast
48
46
  from sky.provision.fluidstack import fluidstack_utils
49
47
  from sky.provision.kubernetes import utils as kubernetes_utils
48
+ from sky.provision.lambda_cloud import lambda_utils
50
49
  from sky.utils import common_utils
50
+ from sky.utils import config_utils
51
51
  from sky.utils import kubernetes_enums
52
52
  from sky.utils import subprocess_utils
53
53
  from sky.utils import ux_utils
@@ -61,9 +61,24 @@ logger = sky_logging.init_logger(__name__)
61
61
 
62
62
  MAX_TRIALS = 64
63
63
  # TODO(zhwu): Support user specified key pair.
64
- PRIVATE_SSH_KEY_PATH = '~/.ssh/sky-key'
65
- PUBLIC_SSH_KEY_PATH = '~/.ssh/sky-key.pub'
66
- _SSH_KEY_GENERATION_LOCK = '~/.sky/generated/ssh/.__internal-sky-key.lock'
64
+ # We intentionally do not store the ssh key pair in
65
+ # ~/.sky/api_server/clients, i.e. sky.server.common.API_SERVER_CLIENT_DIR,
66
+ # because the ssh key pair needs to persist across API server restarts, while
67
+ # that dir is ephemeral.
68
+ _SSH_KEY_PATH_PREFIX = '~/.sky/clients/{user_hash}/ssh'
69
+
70
+
71
+ def get_ssh_key_and_lock_path() -> Tuple[str, str, str]:
72
+ user_hash = common_utils.get_user_hash()
73
+ user_ssh_key_prefix = _SSH_KEY_PATH_PREFIX.format(user_hash=user_hash)
74
+
75
+ os.makedirs(os.path.expanduser(user_ssh_key_prefix),
76
+ exist_ok=True,
77
+ mode=0o700)
78
+ private_key_path = os.path.join(user_ssh_key_prefix, 'sky-key')
79
+ public_key_path = os.path.join(user_ssh_key_prefix, 'sky-key.pub')
80
+ lock_path = os.path.join(user_ssh_key_prefix, '.__internal-sky-key.lock')
81
+ return private_key_path, public_key_path, lock_path
67
82
 
68
83
 
69
84
  def _generate_rsa_key_pair() -> Tuple[str, str]:
@@ -106,16 +121,17 @@ def _save_key_pair(private_key_path: str, public_key_path: str,
106
121
 
107
122
  def get_or_generate_keys() -> Tuple[str, str]:
108
123
  """Returns the absolute private and public key paths."""
109
- private_key_path = os.path.expanduser(PRIVATE_SSH_KEY_PATH)
110
- public_key_path = os.path.expanduser(PUBLIC_SSH_KEY_PATH)
124
+ private_key_path, public_key_path, lock_path = get_ssh_key_and_lock_path()
125
+ private_key_path = os.path.expanduser(private_key_path)
126
+ public_key_path = os.path.expanduser(public_key_path)
127
+ lock_path = os.path.expanduser(lock_path)
111
128
 
112
- key_file_lock = os.path.expanduser(_SSH_KEY_GENERATION_LOCK)
113
- lock_dir = os.path.dirname(key_file_lock)
129
+ lock_dir = os.path.dirname(lock_path)
114
130
  # We should have the folder ~/.sky/generated/ssh to have 0o700 permission,
115
131
  # as the ssh configs will be written to this folder as well in
116
132
  # backend_utils.SSHConfigHelper
117
133
  os.makedirs(lock_dir, exist_ok=True, mode=0o700)
118
- with filelock.FileLock(key_file_lock, timeout=10):
134
+ with filelock.FileLock(lock_path, timeout=10):
119
135
  if not os.path.exists(private_key_path):
120
136
  public_key, private_key = _generate_rsa_key_pair()
121
137
  _save_key_pair(private_key_path, public_key_path, private_key,
@@ -270,43 +286,13 @@ def setup_gcp_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
270
286
  return configure_ssh_info(config)
271
287
 
272
288
 
273
- # In Azure, cloud-init script must be encoded in base64. See
274
- # https://learn.microsoft.com/en-us/azure/virtual-machines/custom-data
275
- # for more information. Here we decode it and replace the ssh user
276
- # and public key content, then encode it back.
277
- def setup_azure_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
278
- _, public_key_path = get_or_generate_keys()
279
- with open(public_key_path, 'r', encoding='utf-8') as f:
280
- public_key = f.read().strip()
281
- for node_type in config['available_node_types']:
282
- node_config = config['available_node_types'][node_type]['node_config']
283
- cloud_init = (
284
- node_config['azure_arm_parameters']['cloudInitSetupCommands'])
285
- cloud_init = base64.b64decode(cloud_init).decode('utf-8')
286
- cloud_init = cloud_init.replace('skypilot:ssh_user',
287
- config['auth']['ssh_user'])
288
- cloud_init = cloud_init.replace('skypilot:ssh_public_key_content',
289
- public_key)
290
- cloud_init = base64.b64encode(
291
- cloud_init.encode('utf-8')).decode('utf-8')
292
- node_config['azure_arm_parameters']['cloudInitSetupCommands'] = (
293
- cloud_init)
294
- config_str = common_utils.dump_yaml_str(config)
295
- config_str = config_str.replace('skypilot:ssh_user',
296
- config['auth']['ssh_user'])
297
- config_str = config_str.replace('skypilot:ssh_public_key_content',
298
- public_key)
299
- config = yaml.safe_load(config_str)
300
- return config
301
-
302
-
303
289
  def setup_lambda_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
304
290
 
305
291
  get_or_generate_keys()
306
292
 
307
293
  # Ensure ssh key is registered with Lambda Cloud
308
294
  lambda_client = lambda_utils.LambdaCloudClient()
309
- public_key_path = os.path.expanduser(PUBLIC_SSH_KEY_PATH)
295
+ _, public_key_path = get_or_generate_keys()
310
296
  with open(public_key_path, 'r', encoding='utf-8') as f:
311
297
  public_key = f.read().strip()
312
298
  prefix = f'sky-key-{common_utils.get_user_hash()}'
@@ -314,26 +300,16 @@ def setup_lambda_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
314
300
  if not exists:
315
301
  lambda_client.register_ssh_key(name, public_key)
316
302
 
317
- # Need to use ~ relative path because Ray uses the same
318
- # path for finding the public key path on both local and head node.
319
- config['auth']['ssh_public_key'] = PUBLIC_SSH_KEY_PATH
320
-
321
- # TODO(zhwu): we need to avoid uploading the public ssh key to the
322
- # nodes, as that will cause problem when the node is used as spot
323
- # controller, i.e., the public and private key on the node may
324
- # not match.
325
- file_mounts = config['file_mounts']
326
- file_mounts[PUBLIC_SSH_KEY_PATH] = PUBLIC_SSH_KEY_PATH
327
- config['file_mounts'] = file_mounts
328
-
303
+ config['auth']['remote_key_name'] = name
329
304
  return config
330
305
 
331
306
 
332
- def setup_ibm_authentication(config):
307
+ def setup_ibm_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
333
308
  """ registers keys if they do not exist in sky folder
334
309
  and updates config file.
335
310
  keys default location: '~/.sky/clients/<user_hash>/ssh/sky-key' and the matching 'sky-key.pub'
336
311
  """
312
+ private_key_path, _ = get_or_generate_keys()
337
313
 
338
314
  def _get_unique_key_name():
339
315
  suffix_len = 10
@@ -373,17 +349,11 @@ def setup_ibm_authentication(config):
373
349
  else:
374
350
  raise Exception('Failed to register a key') from e
375
351
 
376
- config['auth']['ssh_private_key'] = PRIVATE_SSH_KEY_PATH
352
+ config['auth']['ssh_private_key'] = private_key_path
377
353
 
378
354
  for node_type in config['available_node_types']:
379
355
  config['available_node_types'][node_type]['node_config'][
380
356
  'key_id'] = vpc_key_id
381
-
382
- # Add public key path to file mounts
383
- file_mounts = config['file_mounts']
384
- file_mounts[PUBLIC_SSH_KEY_PATH] = PUBLIC_SSH_KEY_PATH
385
- config['file_mounts'] = file_mounts
386
-
387
357
  return config
388
358
 
389
359
 
@@ -403,13 +373,19 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
403
373
  with ux_utils.print_exception_no_traceback():
404
374
  raise ValueError(str(e) + ' Please check: ~/.sky/config.yaml.') \
405
375
  from None
406
- get_or_generate_keys()
376
+ _, public_key_path = get_or_generate_keys()
407
377
 
408
378
  # Add the user's public key to the SkyPilot cluster.
409
- public_key_path = os.path.expanduser(PUBLIC_SSH_KEY_PATH)
410
379
  secret_name = clouds.Kubernetes.SKY_SSH_KEY_SECRET_NAME
411
380
  secret_field_name = clouds.Kubernetes().ssh_key_secret_field_name
412
- namespace = kubernetes_utils.get_current_kube_config_context_namespace()
381
+ context = config['provider'].get(
382
+ 'context', kubernetes_utils.get_current_kube_config_context_name())
383
+ if context == kubernetes.in_cluster_context_name():
384
+ # If the context is an in-cluster context name, we are running in a pod
385
+ # with in-cluster configuration. We need to set the context to None
386
+ # to use the mounted service account.
387
+ context = None
388
+ namespace = kubernetes_utils.get_namespace_from_config(config['provider'])
413
389
  k8s = kubernetes.kubernetes
414
390
  with open(public_key_path, 'r', encoding='utf-8') as f:
415
391
  public_key = f.read()
@@ -425,44 +401,71 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
425
401
  }
426
402
  custom_metadata = skypilot_config.get_nested(
427
403
  ('kubernetes', 'custom_metadata'), {})
428
- kubernetes_utils.merge_dicts(custom_metadata, secret_metadata)
404
+ config_utils.merge_k8s_configs(secret_metadata, custom_metadata)
429
405
 
430
406
  secret = k8s.client.V1Secret(
431
407
  metadata=k8s.client.V1ObjectMeta(**secret_metadata),
432
408
  string_data={secret_field_name: public_key})
433
- if kubernetes_utils.check_secret_exists(secret_name, namespace):
434
- logger.debug(f'Key {secret_name} exists in the cluster, patching it...')
435
- kubernetes.core_api().patch_namespaced_secret(secret_name, namespace,
436
- secret)
437
- else:
438
- logger.debug(
439
- f'Key {secret_name} does not exist in the cluster, creating it...')
440
- kubernetes.core_api().create_namespaced_secret(namespace, secret)
409
+ try:
410
+ if kubernetes_utils.check_secret_exists(secret_name, namespace,
411
+ context):
412
+ logger.debug(f'Key {secret_name} exists in the cluster, '
413
+ 'patching it...')
414
+ kubernetes.core_api(context).patch_namespaced_secret(
415
+ secret_name, namespace, secret)
416
+ else:
417
+ logger.debug(f'Key {secret_name} does not exist in the cluster, '
418
+ 'creating it...')
419
+ kubernetes.core_api(context).create_namespaced_secret(
420
+ namespace, secret)
421
+ except kubernetes.api_exception() as e:
422
+ if e.status == 409 and e.reason == 'AlreadyExists':
423
+ logger.debug(f'Key {secret_name} was created concurrently, '
424
+ 'patching it...')
425
+ kubernetes.core_api(context).patch_namespaced_secret(
426
+ secret_name, namespace, secret)
427
+ else:
428
+ raise e
441
429
 
442
- ssh_jump_name = clouds.Kubernetes.SKY_SSH_JUMP_NAME
430
+ private_key_path, _ = get_or_generate_keys()
443
431
  if network_mode == nodeport_mode:
432
+ ssh_jump_name = clouds.Kubernetes.SKY_SSH_JUMP_NAME
444
433
  service_type = kubernetes_enums.KubernetesServiceType.NODEPORT
434
+ # Setup service for SSH jump pod. We create the SSH jump service here
435
+ # because we need to know the service IP address and port to set the
436
+ # ssh_proxy_command in the autoscaler config.
437
+ kubernetes_utils.setup_ssh_jump_svc(ssh_jump_name, namespace, context,
438
+ service_type)
439
+ ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
440
+ ssh_jump_name,
441
+ nodeport_mode,
442
+ private_key_path=private_key_path,
443
+ context=context,
444
+ namespace=namespace)
445
445
  elif network_mode == port_forward_mode:
446
+ # Using `kubectl port-forward` creates a direct tunnel to the pod and
447
+ # does not require a ssh jump pod.
446
448
  kubernetes_utils.check_port_forward_mode_dependencies()
447
- # Using `kubectl port-forward` creates a direct tunnel to jump pod and
448
- # does not require opening any ports on Kubernetes nodes. As a result,
449
- # the service can be a simple ClusterIP service which we access with
450
- # `kubectl port-forward`.
451
- service_type = kubernetes_enums.KubernetesServiceType.CLUSTERIP
449
+ # TODO(romilb): This can be further optimized. Instead of using the
450
+ # head node as a jump pod for worker nodes, we can also directly
451
+ # set the ssh_target to the worker node. However, that requires
452
+ # changes in the downstream code to return a mapping of node IPs to
453
+ # pod names (to be used as ssh_target) and updating the upstream
454
+ # SSHConfigHelper to use a different ProxyCommand for each pod.
455
+ # This optimization can reduce SSH time from ~0.35s to ~0.25s, tested
456
+ # on GKE.
457
+ ssh_target = config['cluster_name'] + '-head'
458
+ ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
459
+ ssh_target,
460
+ port_forward_mode,
461
+ private_key_path=private_key_path,
462
+ context=context,
463
+ namespace=namespace)
452
464
  else:
453
465
  # This should never happen because we check for this in from_str above.
454
466
  raise ValueError(f'Unsupported networking mode: {network_mode_str}')
455
- # Setup service for SSH jump pod. We create the SSH jump service here
456
- # because we need to know the service IP address and port to set the
457
- # ssh_proxy_command in the autoscaler config.
458
- kubernetes_utils.setup_ssh_jump_svc(ssh_jump_name, namespace, service_type)
459
-
460
- ssh_proxy_cmd = kubernetes_utils.get_ssh_proxy_command(
461
- PRIVATE_SSH_KEY_PATH, ssh_jump_name, network_mode, namespace,
462
- clouds.Kubernetes.PORT_FORWARD_PROXY_CMD_PATH,
463
- clouds.Kubernetes.PORT_FORWARD_PROXY_CMD_TEMPLATE)
464
-
465
467
  config['auth']['ssh_proxy_command'] = ssh_proxy_cmd
468
+ config['auth']['ssh_private_key'] = private_key_path
466
469
 
467
470
  return config
468
471
 
@@ -481,15 +484,31 @@ def setup_runpod_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
481
484
  return configure_ssh_info(config)
482
485
 
483
486
 
487
+ def setup_vast_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
488
+ """Sets up SSH authentication for Vast.
489
+ - Generates a new SSH key pair if one does not exist.
490
+ - Adds the public SSH key to the user's Vast account.
491
+ """
492
+ _, public_key_path = get_or_generate_keys()
493
+ with open(public_key_path, 'r', encoding='UTF-8') as pub_key_file:
494
+ public_key = pub_key_file.read().strip()
495
+ current_key_list = vast.vast().show_ssh_keys() # pylint: disable=assignment-from-no-return
496
+ # Only add an ssh key if it hasn't already been added
497
+ if not any(x['public_key'] == public_key for x in current_key_list):
498
+ vast.vast().create_ssh_key(ssh_key=public_key)
499
+
500
+ config['auth']['ssh_public_key'] = public_key_path
501
+ return configure_ssh_info(config)
502
+
503
+
484
504
  def setup_fluidstack_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
485
505
 
486
- get_or_generate_keys()
506
+ _, public_key_path = get_or_generate_keys()
487
507
 
488
508
  client = fluidstack_utils.FluidstackClient()
489
- public_key_path = os.path.expanduser(PUBLIC_SSH_KEY_PATH)
490
509
  public_key = None
491
510
  with open(public_key_path, 'r', encoding='utf-8') as f:
492
511
  public_key = f.read()
493
512
  client.get_or_add_ssh_key(public_key)
494
- config['auth']['ssh_public_key'] = PUBLIC_SSH_KEY_PATH
513
+ config['auth']['ssh_public_key'] = public_key_path
495
514
  return configure_ssh_info(config)
sky/backends/backend.py CHANGED
@@ -2,9 +2,11 @@
2
2
  import typing
3
3
  from typing import Dict, Generic, Optional
4
4
 
5
- import sky
6
5
  from sky.usage import usage_lib
6
+ from sky.utils import cluster_utils
7
+ from sky.utils import rich_utils
7
8
  from sky.utils import timeline
9
+ from sky.utils import ux_utils
8
10
 
9
11
  if typing.TYPE_CHECKING:
10
12
  from sky import resources
@@ -43,19 +45,45 @@ class Backend(Generic[_ResourceHandleType]):
43
45
  @timeline.event
44
46
  @usage_lib.messages.usage.update_runtime('provision')
45
47
  def provision(
46
- self,
47
- task: 'task_lib.Task',
48
- to_provision: Optional['resources.Resources'],
49
- dryrun: bool,
50
- stream_logs: bool,
51
- cluster_name: Optional[str] = None,
52
- retry_until_up: bool = False) -> Optional[_ResourceHandleType]:
48
+ self,
49
+ task: 'task_lib.Task',
50
+ to_provision: Optional['resources.Resources'],
51
+ dryrun: bool,
52
+ stream_logs: bool,
53
+ cluster_name: Optional[str] = None,
54
+ retry_until_up: bool = False,
55
+ skip_unnecessary_provisioning: bool = False,
56
+ ) -> Optional[_ResourceHandleType]:
57
+ """Provisions resources for the given task.
58
+
59
+ Args:
60
+ task: The task to provision resources for.
61
+ to_provision: Resource config to provision. Should only be None if
62
+ cluster_name refers to an existing cluster, whose resources will
63
+ be used.
64
+ dryrun: If True, don't actually provision anything.
65
+ stream_logs: If True, stream additional logs to console.
66
+ cluster_name: Name of the cluster to provision. If None, a name will
67
+ be auto-generated. If the name refers to an existing cluster,
68
+ the existing cluster will be reused and re-provisioned.
69
+ retry_until_up: If True, retry provisioning until resources are
70
+ successfully launched.
71
+ skip_unnecessary_provisioning: If True, compare the cluster config to
72
+ the existing cluster_name's config. Skip provisioning if no
73
+ updates are needed for the existing cluster.
74
+
75
+ Returns:
76
+ A ResourceHandle object for the provisioned resources, or None if
77
+ dryrun is True.
78
+ """
53
79
  if cluster_name is None:
54
- cluster_name = sky.backends.backend_utils.generate_cluster_name()
80
+ cluster_name = cluster_utils.generate_cluster_name()
55
81
  usage_lib.record_cluster_name_for_current_operation(cluster_name)
56
82
  usage_lib.messages.usage.update_actual_task(task)
57
- return self._provision(task, to_provision, dryrun, stream_logs,
58
- cluster_name, retry_until_up)
83
+ with rich_utils.safe_status(ux_utils.spinner_message('Launching')):
84
+ return self._provision(task, to_provision, dryrun, stream_logs,
85
+ cluster_name, retry_until_up,
86
+ skip_unnecessary_provisioning)
59
87
 
60
88
  @timeline.event
61
89
  @usage_lib.messages.usage.update_runtime('sync_workdir')
@@ -76,7 +104,8 @@ class Backend(Generic[_ResourceHandleType]):
76
104
  @usage_lib.messages.usage.update_runtime('setup')
77
105
  def setup(self, handle: _ResourceHandleType, task: 'task_lib.Task',
78
106
  detach_setup: bool) -> None:
79
- return self._setup(handle, task, detach_setup)
107
+ with rich_utils.safe_status(ux_utils.spinner_message('Running setup')):
108
+ return self._setup(handle, task, detach_setup)
80
109
 
81
110
  def add_storage_objects(self, task: 'task_lib.Task') -> None:
82
111
  raise NotImplementedError
@@ -96,7 +125,8 @@ class Backend(Generic[_ResourceHandleType]):
96
125
  usage_lib.record_cluster_name_for_current_operation(
97
126
  handle.get_cluster_name())
98
127
  usage_lib.messages.usage.update_actual_task(task)
99
- return self._execute(handle, task, detach_run, dryrun)
128
+ with rich_utils.safe_status(ux_utils.spinner_message('Submitting job')):
129
+ return self._execute(handle, task, detach_run, dryrun)
100
130
 
101
131
  @timeline.event
102
132
  def post_execute(self, handle: _ResourceHandleType, down: bool) -> None:
@@ -121,13 +151,15 @@ class Backend(Generic[_ResourceHandleType]):
121
151
 
122
152
  # --- Implementations of the APIs ---
123
153
  def _provision(
124
- self,
125
- task: 'task_lib.Task',
126
- to_provision: Optional['resources.Resources'],
127
- dryrun: bool,
128
- stream_logs: bool,
129
- cluster_name: str,
130
- retry_until_up: bool = False) -> Optional[_ResourceHandleType]:
154
+ self,
155
+ task: 'task_lib.Task',
156
+ to_provision: Optional['resources.Resources'],
157
+ dryrun: bool,
158
+ stream_logs: bool,
159
+ cluster_name: str,
160
+ retry_until_up: bool = False,
161
+ skip_unnecessary_provisioning: bool = False,
162
+ ) -> Optional[_ResourceHandleType]:
131
163
  raise NotImplementedError
132
164
 
133
165
  def _sync_workdir(self, handle: _ResourceHandleType, workdir: Path) -> None: