skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only and reflects the package contents exactly as they appear in the public registry.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -2,7 +2,7 @@
2
2
  from datetime import datetime
3
3
  import enum
4
4
  import fnmatch
5
- import functools
5
+ import hashlib
6
6
  import os
7
7
  import pathlib
8
8
  import pprint
@@ -11,7 +11,6 @@ import shlex
11
11
  import subprocess
12
12
  import sys
13
13
  import tempfile
14
- import textwrap
15
14
  import time
16
15
  import typing
17
16
  from typing import Any, Dict, List, Optional, Sequence, Set, Tuple, Union
@@ -37,26 +36,27 @@ from sky import global_user_state
37
36
  from sky import provision as provision_lib
38
37
  from sky import sky_logging
39
38
  from sky import skypilot_config
40
- from sky import status_lib
41
- from sky.clouds import cloud_registry
42
39
  from sky.provision import instance_setup
43
40
  from sky.provision.kubernetes import utils as kubernetes_utils
44
41
  from sky.skylet import constants
45
42
  from sky.usage import usage_lib
46
- from sky.utils import cluster_yaml_utils
43
+ from sky.utils import cluster_utils
47
44
  from sky.utils import command_runner
45
+ from sky.utils import common
48
46
  from sky.utils import common_utils
49
47
  from sky.utils import controller_utils
50
48
  from sky.utils import env_options
49
+ from sky.utils import registry
51
50
  from sky.utils import resources_utils
52
51
  from sky.utils import rich_utils
53
52
  from sky.utils import schemas
53
+ from sky.utils import status_lib
54
54
  from sky.utils import subprocess_utils
55
55
  from sky.utils import timeline
56
56
  from sky.utils import ux_utils
57
57
 
58
58
  if typing.TYPE_CHECKING:
59
- from sky import resources
59
+ from sky import resources as resources_lib
60
60
  from sky import task as task_lib
61
61
  from sky.backends import cloud_vm_ray_backend
62
62
  from sky.backends import local_docker_backend
@@ -68,10 +68,6 @@ SKY_REMOTE_APP_DIR = '~/.sky/sky_app'
68
68
  # Exclude subnet mask from IP address regex.
69
69
  IP_ADDR_REGEX = r'\b\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?!/\d{1,2})\b'
70
70
  SKY_REMOTE_PATH = '~/.sky/wheels'
71
- SKY_USER_FILE_PATH = '~/.sky/generated'
72
-
73
- BOLD = '\033[1m'
74
- RESET_BOLD = '\033[0m'
75
71
 
76
72
  # Do not use /tmp because it gets cleared on VM restart.
77
73
  _SKY_REMOTE_FILE_MOUNTS_DIR = '~/.sky/file_mounts/'
@@ -103,6 +99,10 @@ DEFAULT_TASK_CPU_DEMAND = 0.5
103
99
  CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock')
104
100
  CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
105
101
 
102
+ # Time that must elapse since the last status check before we should re-check if
103
+ # the cluster has been terminated or autostopped.
104
+ _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
105
+
106
106
  # Filelocks for updating cluster's file_mounts.
107
107
  CLUSTER_FILE_MOUNTS_LOCK_PATH = os.path.expanduser(
108
108
  '~/.sky/.{}_file_mounts.lock')
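The _CLUSTER_STATUS_CACHE_DURATION_SECONDS constant added in the hunk above caps how often a cluster's liveness is re-queried from the cloud. A minimal sketch of a time-based gate of this kind, assuming a hypothetical status record with a status_updated_at timestamp (not SkyPilot's actual data model):

    import time

    def should_refresh_status(record: dict, cache_seconds: float = 2.0) -> bool:
        # Re-query the cloud only if the cached status is older than the
        # allowed cache duration; otherwise trust the cached record.
        last_checked = record.get('status_updated_at', 0.0)
        return time.time() - last_checked > cache_seconds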
@@ -114,6 +114,16 @@ _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
114
114
  _ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, '
115
115
  'please retry after a while.')
116
116
 
117
+ # If a cluster is less than LAUNCH_DOUBLE_CHECK_WINDOW seconds old, and we don't
118
+ # see any instances in the cloud, the instances might be in the proccess of
119
+ # being created. We will wait LAUNCH_DOUBLE_CHECK_DELAY seconds and then double
120
+ # check to make sure there are still no instances. LAUNCH_DOUBLE_CHECK_DELAY
121
+ # should be set longer than the delay between (sending the create instance
122
+ # request) and (the instances appearing on the cloud).
123
+ # See https://github.com/skypilot-org/skypilot/issues/4431.
124
+ _LAUNCH_DOUBLE_CHECK_WINDOW = 60
125
+ _LAUNCH_DOUBLE_CHECK_DELAY = 1
126
+
117
127
  # Include the fields that will be used for generating tags that distinguishes
118
128
  # the cluster in ray, to avoid the stopped cluster being discarded due to
119
129
  # updates in the yaml template.
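The _LAUNCH_DOUBLE_CHECK_WINDOW and _LAUNCH_DOUBLE_CHECK_DELAY constants introduced in the hunk above guard against racing with instance creation: a very young cluster with no visible instances is re-checked once after a short delay before being declared gone. A rough, self-contained sketch of that flow (the query callable is a stand-in, not a SkyPilot API):

    import time
    from typing import Callable, List

    def cluster_has_no_instances(query_instances: Callable[[], List[str]],
                                 launched_at: float,
                                 window: float = 60.0,
                                 delay: float = 1.0) -> bool:
        # Returns True only if no instances are found even after a re-check.
        if query_instances():
            return False
        if time.time() - launched_at < window:
            # The create-instance request may still be propagating; wait and
            # double-check before concluding the cluster has no instances.
            time.sleep(delay)
            return not query_instances()
        return True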
@@ -146,6 +156,7 @@ _RAY_YAML_KEYS_TO_RESTORE_EXCEPTIONS = [
146
156
  # Clouds with new provisioner has docker_login_config in the
147
157
  # docker field, instead of the provider field.
148
158
  ('docker', 'docker_login_config'),
159
+ ('docker', 'run_options'),
149
160
  # Other clouds
150
161
  ('provider', 'docker_login_config'),
151
162
  ('provider', 'firewall_rule'),
@@ -154,8 +165,21 @@ _RAY_YAML_KEYS_TO_RESTORE_EXCEPTIONS = [
154
165
  # we need to take this field from the new yaml.
155
166
  ('provider', 'tpu_node'),
156
167
  ('provider', 'security_group', 'GroupName'),
168
+ ('available_node_types', 'ray.head.default', 'node_config',
169
+ 'IamInstanceProfile'),
157
170
  ('available_node_types', 'ray.head.default', 'node_config', 'UserData'),
158
- ('available_node_types', 'ray.worker.default', 'node_config', 'UserData'),
171
+ ('available_node_types', 'ray.head.default', 'node_config',
172
+ 'azure_arm_parameters', 'cloudInitSetupCommands'),
173
+ ]
174
+ # These keys are expected to change when provisioning on an existing cluster,
175
+ # but they don't actually represent a change that requires re-provisioning the
176
+ # cluster. If the cluster yaml is the same except for these keys, we can safely
177
+ # skip reprovisioning. See _deterministic_cluster_yaml_hash.
178
+ _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
179
+ # On first launch, availability_zones will include all possible zones. Once
180
+ # the cluster exists, it will only include the zone that the cluster is
181
+ # actually in.
182
+ ('provider', 'availability_zone'),
159
183
  ]
160
184
 
161
185
 
@@ -165,13 +189,17 @@ def is_ip(s: str) -> bool:
165
189
 
166
190
 
167
191
  def _get_yaml_path_from_cluster_name(cluster_name: str,
168
- prefix: str = SKY_USER_FILE_PATH) -> str:
192
+ prefix: str = constants.SKY_USER_FILE_PATH
193
+ ) -> str:
169
194
  output_path = pathlib.Path(
170
195
  prefix).expanduser().resolve() / f'{cluster_name}.yml'
171
196
  os.makedirs(output_path.parents[0], exist_ok=True)
172
197
  return str(output_path)
173
198
 
174
199
 
200
+ # Add retry for the file mounts optimization, as the underlying cp command may
201
+ # experience transient errors, #4758.
202
+ @common_utils.retry
175
203
  def _optimize_file_mounts(yaml_path: str) -> None:
176
204
  """Optimize file mounts in the given ray yaml file.
177
205
 
@@ -181,6 +209,10 @@ def _optimize_file_mounts(yaml_path: str) -> None:
181
209
  - wheel
182
210
  - credentials
183
211
  Format is {dst: src}.
212
+
213
+ Raises:
214
+ subprocess.CalledProcessError: If the file mounts fail to be
215
+ copied.
184
216
  """
185
217
  yaml_config = common_utils.read_yaml(yaml_path)
186
218
 
@@ -276,18 +308,22 @@ def path_size_megabytes(path: str) -> int:
276
308
  If successful: the size of 'path' in megabytes, rounded down. Otherwise,
277
309
  -1.
278
310
  """
279
- resolved_path = pathlib.Path(path).expanduser().resolve()
280
311
  git_exclude_filter = ''
281
- if (resolved_path / command_runner.GIT_EXCLUDE).exists():
282
- # Ensure file exists; otherwise, rsync will error out.
283
- #
284
- # We shlex.quote() because the path may contain spaces:
285
- # 'my dir/.git/info/exclude'
286
- # Without quoting rsync fails.
287
- git_exclude_filter = command_runner.RSYNC_EXCLUDE_OPTION.format(
288
- shlex.quote(str(resolved_path / command_runner.GIT_EXCLUDE)))
312
+ resolved_path = pathlib.Path(path).expanduser().resolve()
313
+ if (resolved_path / constants.SKY_IGNORE_FILE).exists():
314
+ rsync_filter = command_runner.RSYNC_FILTER_SKYIGNORE
315
+ else:
316
+ rsync_filter = command_runner.RSYNC_FILTER_GITIGNORE
317
+ if (resolved_path / command_runner.GIT_EXCLUDE).exists():
318
+ # Ensure file exists; otherwise, rsync will error out.
319
+ #
320
+ # We shlex.quote() because the path may contain spaces:
321
+ # 'my dir/.git/info/exclude'
322
+ # Without quoting rsync fails.
323
+ git_exclude_filter = command_runner.RSYNC_EXCLUDE_OPTION.format(
324
+ shlex.quote(str(resolved_path / command_runner.GIT_EXCLUDE)))
289
325
  rsync_command = (f'rsync {command_runner.RSYNC_DISPLAY_OPTION} '
290
- f'{command_runner.RSYNC_FILTER_OPTION} '
326
+ f'{rsync_filter} '
291
327
  f'{git_exclude_filter} --dry-run {path!r}')
292
328
  rsync_output = ''
293
329
  try:
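The branch added above prefers a .skyignore file at the path root over git exclude rules when estimating transfer size with an rsync dry run. For illustration, a .skyignore might look like the following; the assumption here is that it uses .gitignore-style patterns, and the exact rsync filter strings are defined in command_runner rather than shown in this hunk:

    # .skyignore (illustrative)
    __pycache__/
    *.pyc
    .venv/
    data/raw/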
@@ -391,304 +427,6 @@ class FileMountHelper(object):
391
427
  return ' && '.join(commands)
392
428
 
393
429
 
394
- class SSHConfigHelper(object):
395
- """Helper for handling local SSH configuration."""
396
-
397
- ssh_conf_path = '~/.ssh/config'
398
- ssh_conf_lock_path = os.path.expanduser('~/.sky/ssh_config.lock')
399
- ssh_cluster_path = SKY_USER_FILE_PATH + '/ssh/{}'
400
-
401
- @classmethod
402
- def _get_generated_config(cls, autogen_comment: str, host_name: str,
403
- ip: str, username: str, ssh_key_path: str,
404
- proxy_command: Optional[str], port: int,
405
- docker_proxy_command: Optional[str]):
406
- if proxy_command is not None:
407
- # Already checked in resources
408
- assert docker_proxy_command is None, (
409
- 'Cannot specify both proxy_command and docker_proxy_command.')
410
- proxy = f'ProxyCommand {proxy_command}'
411
- elif docker_proxy_command is not None:
412
- proxy = f'ProxyCommand {docker_proxy_command}'
413
- else:
414
- proxy = ''
415
- # StrictHostKeyChecking=no skips the host key check for the first
416
- # time. UserKnownHostsFile=/dev/null and GlobalKnownHostsFile/dev/null
417
- # prevent the host key from being added to the known_hosts file and
418
- # always return an empty file for known hosts, making the ssh think
419
- # this is a first-time connection, and thus skipping the host key
420
- # check.
421
- codegen = textwrap.dedent(f"""\
422
- {autogen_comment}
423
- Host {host_name}
424
- HostName {ip}
425
- User {username}
426
- IdentityFile {ssh_key_path}
427
- IdentitiesOnly yes
428
- ForwardAgent yes
429
- StrictHostKeyChecking no
430
- UserKnownHostsFile=/dev/null
431
- GlobalKnownHostsFile=/dev/null
432
- Port {port}
433
- {proxy}
434
- """.rstrip())
435
- codegen = codegen + '\n'
436
- return codegen
437
-
438
- @classmethod
439
- @timeline.FileLockEvent(ssh_conf_lock_path)
440
- def add_cluster(
441
- cls,
442
- cluster_name: str,
443
- ips: List[str],
444
- auth_config: Dict[str, str],
445
- ports: List[int],
446
- docker_user: Optional[str] = None,
447
- ssh_user: Optional[str] = None,
448
- ):
449
- """Add authentication information for cluster to local SSH config file.
450
-
451
- If a host with `cluster_name` already exists and the configuration was
452
- not added by sky, then `ip` is used to identify the host instead in the
453
- file.
454
-
455
- If a host with `cluster_name` already exists and the configuration was
456
- added by sky (e.g. a spot instance), then the configuration is
457
- overwritten.
458
-
459
- Args:
460
- cluster_name: Cluster name (see `sky status`)
461
- ips: List of public IP addresses in the cluster. First IP is head
462
- node.
463
- auth_config: read_yaml(handle.cluster_yaml)['auth']
464
- ports: List of port numbers for SSH corresponding to ips
465
- docker_user: If not None, use this user to ssh into the docker
466
- ssh_user: Override the ssh_user in auth_config
467
- """
468
- if ssh_user is None:
469
- username = auth_config['ssh_user']
470
- else:
471
- username = ssh_user
472
- if docker_user is not None:
473
- username = docker_user
474
- key_path = os.path.expanduser(auth_config['ssh_private_key'])
475
- sky_autogen_comment = ('# Added by sky (use `sky stop/down '
476
- f'{cluster_name}` to remove)')
477
- ip = ips[0]
478
- if docker_user is not None:
479
- ip = 'localhost'
480
-
481
- config_path = os.path.expanduser(cls.ssh_conf_path)
482
-
483
- # For backward compatibility: before #2706, we wrote the config of SkyPilot clusters
484
- # directly in ~/.ssh/config. For these clusters, we remove the config in ~/.ssh/config
485
- # and write/overwrite the config in ~/.sky/ssh/<cluster_name> instead.
486
- cls._remove_stale_cluster_config_for_backward_compatibility(
487
- cluster_name, ip, auth_config, docker_user)
488
-
489
- if not os.path.exists(config_path):
490
- config = ['\n']
491
- with open(config_path,
492
- 'w',
493
- encoding='utf-8',
494
- opener=functools.partial(os.open, mode=0o644)) as f:
495
- f.writelines(config)
496
-
497
- with open(config_path, 'r', encoding='utf-8') as f:
498
- config = f.readlines()
499
-
500
- ssh_dir = cls.ssh_cluster_path.format('')
501
- os.makedirs(os.path.expanduser(ssh_dir), exist_ok=True, mode=0o700)
502
-
503
- # Handle Include on top of Config file
504
- include_str = f'Include {cls.ssh_cluster_path.format("*")}'
505
- found = False
506
- for i, line in enumerate(config):
507
- config_str = line.strip()
508
- if config_str == include_str:
509
- found = True
510
- break
511
- if 'Host' in config_str:
512
- break
513
- if not found:
514
- # Did not find Include string. Insert `Include` lines.
515
- with open(config_path, 'w', encoding='utf-8') as f:
516
- config.insert(
517
- 0,
518
- f'# Added by SkyPilot for ssh config of all clusters\n{include_str}\n'
519
- )
520
- f.write(''.join(config).strip())
521
- f.write('\n' * 2)
522
-
523
- proxy_command = auth_config.get('ssh_proxy_command', None)
524
-
525
- docker_proxy_command_generator = None
526
- if docker_user is not None:
527
- docker_proxy_command_generator = lambda ip, port: ' '.join(
528
- ['ssh'] + command_runner.ssh_options_list(
529
- key_path, ssh_control_name=None, port=port) +
530
- ['-W', '%h:%p', f'{auth_config["ssh_user"]}@{ip}'])
531
-
532
- codegen = ''
533
- # Add the nodes to the codegen
534
- for i, ip in enumerate(ips):
535
- docker_proxy_command = None
536
- port = ports[i]
537
- if docker_proxy_command_generator is not None:
538
- docker_proxy_command = docker_proxy_command_generator(ip, port)
539
- ip = 'localhost'
540
- port = constants.DEFAULT_DOCKER_PORT
541
- node_name = cluster_name if i == 0 else cluster_name + f'-worker{i}'
542
- # TODO(romilb): Update port number when k8s supports multinode
543
- codegen += cls._get_generated_config(
544
- sky_autogen_comment, node_name, ip, username, key_path,
545
- proxy_command, port, docker_proxy_command) + '\n'
546
-
547
- cluster_config_path = os.path.expanduser(
548
- cls.ssh_cluster_path.format(cluster_name))
549
-
550
- with open(cluster_config_path,
551
- 'w',
552
- encoding='utf-8',
553
- opener=functools.partial(os.open, mode=0o644)) as f:
554
- f.write(codegen)
555
-
556
- @classmethod
557
- def _remove_stale_cluster_config_for_backward_compatibility(
558
- cls,
559
- cluster_name: str,
560
- ip: str,
561
- auth_config: Dict[str, str],
562
- docker_user: Optional[str] = None,
563
- ):
564
- """Remove authentication information for cluster from local SSH config.
565
-
566
- If no existing host matching the provided specification is found, then
567
- nothing is removed.
568
-
569
- Args:
570
- ip: Head node's IP address.
571
- auth_config: read_yaml(handle.cluster_yaml)['auth']
572
- docker_user: If not None, use this user to ssh into the docker
573
- """
574
- username = auth_config['ssh_user']
575
- config_path = os.path.expanduser(cls.ssh_conf_path)
576
- cluster_config_path = os.path.expanduser(
577
- cls.ssh_cluster_path.format(cluster_name))
578
- if not os.path.exists(config_path):
579
- return
580
-
581
- with open(config_path, 'r', encoding='utf-8') as f:
582
- config = f.readlines()
583
-
584
- start_line_idx = None
585
-
586
- # Scan the config for the cluster name.
587
- for i, line in enumerate(config):
588
- next_line = config[i + 1] if i + 1 < len(config) else ''
589
- if docker_user is None:
590
- found = (line.strip() == f'HostName {ip}' and
591
- next_line.strip() == f'User {username}')
592
- else:
593
- found = (line.strip() == 'HostName localhost' and
594
- next_line.strip() == f'User {docker_user}')
595
- if found:
596
- # Find the line starting with ProxyCommand and contains the ip
597
- found = False
598
- for idx in range(i, len(config)):
599
- # Stop if we reach an empty line, which means a new host
600
- if not config[idx].strip():
601
- break
602
- if config[idx].strip().startswith('ProxyCommand'):
603
- proxy_command_line = config[idx].strip()
604
- if proxy_command_line.endswith(f'@{ip}'):
605
- found = True
606
- break
607
- if found:
608
- start_line_idx = i - 1
609
- break
610
-
611
- if start_line_idx is not None:
612
- # Scan for end of previous config.
613
- cursor = start_line_idx
614
- while cursor > 0 and len(config[cursor].strip()) > 0:
615
- cursor -= 1
616
- prev_end_line_idx = cursor
617
-
618
- # Scan for end of the cluster config.
619
- end_line_idx = None
620
- cursor = start_line_idx + 1
621
- start_line_idx -= 1 # remove auto-generated comment
622
- while cursor < len(config):
623
- if config[cursor].strip().startswith(
624
- '# ') or config[cursor].strip().startswith('Host '):
625
- end_line_idx = cursor
626
- break
627
- cursor += 1
628
-
629
- # Remove sky-generated config and update the file.
630
- config[prev_end_line_idx:end_line_idx] = [
631
- '\n'
632
- ] if end_line_idx is not None else []
633
- with open(config_path, 'w', encoding='utf-8') as f:
634
- f.write(''.join(config).strip())
635
- f.write('\n' * 2)
636
-
637
- # Delete include statement if it exists in the config.
638
- sky_autogen_comment = ('# Added by sky (use `sky stop/down '
639
- f'{cluster_name}` to remove)')
640
- with open(config_path, 'r', encoding='utf-8') as f:
641
- config = f.readlines()
642
-
643
- for i, line in enumerate(config):
644
- config_str = line.strip()
645
- if f'Include {cluster_config_path}' in config_str:
646
- with open(config_path, 'w', encoding='utf-8') as f:
647
- if i < len(config) - 1 and config[i + 1] == '\n':
648
- del config[i + 1]
649
- # Delete Include string
650
- del config[i]
651
- # Delete Sky Autogen Comment
652
- if i > 0 and sky_autogen_comment in config[i - 1].strip():
653
- del config[i - 1]
654
- f.write(''.join(config))
655
- break
656
- if 'Host' in config_str:
657
- break
658
-
659
- @classmethod
660
- # TODO: We can remove this after 0.6.0 and have a lock only per cluster.
661
- @timeline.FileLockEvent(ssh_conf_lock_path)
662
- def remove_cluster(
663
- cls,
664
- cluster_name: str,
665
- ip: str,
666
- auth_config: Dict[str, str],
667
- docker_user: Optional[str] = None,
668
- ):
669
- """Remove authentication information for cluster from ~/.sky/ssh/<cluster_name>.
670
-
671
- For backward compatibility also remove the config from ~/.ssh/config if it exists.
672
-
673
- If no existing host matching the provided specification is found, then
674
- nothing is removed.
675
-
676
- Args:
677
- ip: Head node's IP address.
678
- auth_config: read_yaml(handle.cluster_yaml)['auth']
679
- docker_user: If not None, use this user to ssh into the docker
680
- """
681
- cluster_config_path = os.path.expanduser(
682
- cls.ssh_cluster_path.format(cluster_name))
683
- common_utils.remove_file_if_exists(cluster_config_path)
684
-
685
- # Ensures backward compatibility: before #2706, we wrote the config of SkyPilot clusters
686
- # directly in ~/.ssh/config. For these clusters, we should clean up the config.
687
- # TODO: Remove this after 0.6.0
688
- cls._remove_stale_cluster_config_for_backward_compatibility(
689
- cluster_name, ip, auth_config, docker_user)
690
-
691
-
692
430
  def _replace_yaml_dicts(
693
431
  new_yaml: str, old_yaml: str, restore_key_names: Set[str],
694
432
  restore_key_names_exceptions: Sequence[Tuple[str, ...]]) -> str:
@@ -742,10 +480,46 @@ def _replace_yaml_dicts(
742
480
  return common_utils.dump_yaml_str(new_config)
743
481
 
744
482
 
483
+ def get_expirable_clouds(
484
+ enabled_clouds: Sequence[clouds.Cloud]) -> List[clouds.Cloud]:
485
+ """Returns a list of clouds that use local credentials and whose credentials can expire.
486
+
487
+ This function checks each cloud in the provided sequence to determine if it uses local credentials
488
+ and if its credentials can expire. If both conditions are met, the cloud is added to the list of
489
+ expirable clouds.
490
+
491
+ Args:
492
+ enabled_clouds (Sequence[clouds.Cloud]): A sequence of cloud objects to check.
493
+
494
+ Returns:
495
+ list[clouds.Cloud]: A list of cloud objects that use local credentials and whose credentials can expire.
496
+ """
497
+ expirable_clouds = []
498
+ local_credentials_value = schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value
499
+ for cloud in enabled_clouds:
500
+ remote_identities = skypilot_config.get_nested(
501
+ (str(cloud).lower(), 'remote_identity'), None)
502
+ if remote_identities is None:
503
+ remote_identities = schemas.get_default_remote_identity(
504
+ str(cloud).lower())
505
+
506
+ local_credential_expiring = cloud.can_credential_expire()
507
+ if isinstance(remote_identities, str):
508
+ if remote_identities == local_credentials_value and local_credential_expiring:
509
+ expirable_clouds.append(cloud)
510
+ elif isinstance(remote_identities, list):
511
+ for profile in remote_identities:
512
+ if list(profile.values(
513
+ ))[0] == local_credentials_value and local_credential_expiring:
514
+ expirable_clouds.append(cloud)
515
+ break
516
+ return expirable_clouds
517
+
518
+
745
519
  # TODO: too many things happening here - leaky abstraction. Refactor.
746
520
  @timeline.event
747
521
  def write_cluster_config(
748
- to_provision: 'resources.Resources',
522
+ to_provision: 'resources_lib.Resources',
749
523
  num_nodes: int,
750
524
  cluster_config_template: str,
751
525
  cluster_name: str,
@@ -757,11 +531,17 @@ def write_cluster_config(
757
531
  keep_launch_fields_in_existing_config: bool = True) -> Dict[str, str]:
758
532
  """Fills in cluster configuration templates and writes them out.
759
533
 
760
- Returns: {provisioner: path to yaml, the provisioning spec}.
761
- 'provisioner' can be
762
- - 'ray'
763
- - 'tpu-create-script' (if TPU is requested)
764
- - 'tpu-delete-script' (if TPU is requested)
534
+ Returns:
535
+ Dict with the following keys:
536
+ - 'ray': Path to the generated Ray yaml config file
537
+ - 'cluster_name': Name of the cluster
538
+ - 'cluster_name_on_cloud': Name of the cluster as it appears in the
539
+ cloud provider
540
+ - 'config_hash': Hash of the cluster config and file mounts contents.
541
+ Can be missing if we unexpectedly failed to calculate the hash for
542
+ some reason. In that case we will continue without the optimization to
543
+ skip provisioning.
544
+
765
545
  Raises:
766
546
  exceptions.ResourcesUnavailableError: if the region/zones requested does
767
547
  not appear in the catalog, or an ssh_proxy_command is specified but
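The rewritten Returns section above documents a new 'config_hash' entry alongside 'ray', 'cluster_name', and 'cluster_name_on_cloud'. A hedged sketch of how a caller might consume it to decide whether provisioning can be skipped (the caller-side function and previous-hash bookkeeping are hypothetical):

    from typing import Dict, Optional

    def needs_reprovisioning(config_dict: Dict[str, str],
                             previous_hash: Optional[str]) -> bool:
        new_hash = config_dict.get('config_hash')
        if new_hash is None or previous_hash is None:
            # Hash calculation failed or there is no prior hash; fall back
            # to the safe path and provision.
            return True
        return new_hash != previous_hash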
@@ -792,35 +572,76 @@ def write_cluster_config(
792
572
  # move the check out of this function, i.e. the caller should be responsible
793
573
  # for the validation.
794
574
  # TODO(tian): Move more cloud agnostic vars to resources.py.
795
- resources_vars = to_provision.make_deploy_variables(cluster_name_on_cloud,
796
- region, zones, dryrun)
575
+ resources_vars = to_provision.make_deploy_variables(
576
+ resources_utils.ClusterName(
577
+ cluster_name,
578
+ cluster_name_on_cloud,
579
+ ), region, zones, num_nodes, dryrun)
797
580
  config_dict = {}
798
581
 
799
582
  specific_reservations = set(
800
583
  skypilot_config.get_nested(
801
584
  (str(to_provision.cloud).lower(), 'specific_reservations'), set()))
802
585
 
586
+ # Remote identity handling can have 4 cases:
587
+ # 1. LOCAL_CREDENTIALS (default for most clouds): Upload local credentials
588
+ # 2. SERVICE_ACCOUNT: SkyPilot creates and manages a service account
589
+ # 3. Custom service account: Use specified service account
590
+ # 4. NO_UPLOAD: Do not upload any credentials
591
+ #
592
+ # We need to upload credentials only if LOCAL_CREDENTIALS is specified. In
593
+ # other cases, we exclude the cloud from credential file uploads after
594
+ # running required checks.
803
595
  assert cluster_name is not None
804
- excluded_clouds = []
805
- remote_identity = skypilot_config.get_nested(
806
- (str(cloud).lower(), 'remote_identity'),
807
- schemas.get_default_remote_identity(str(cloud).lower()))
808
- if remote_identity is not None and not isinstance(remote_identity, str):
809
- for profile in remote_identity:
596
+ excluded_clouds = set()
597
+ remote_identity_config = skypilot_config.get_nested(
598
+ (str(cloud).lower(), 'remote_identity'), None)
599
+ remote_identity = schemas.get_default_remote_identity(str(cloud).lower())
600
+ if isinstance(remote_identity_config, str):
601
+ remote_identity = remote_identity_config
602
+ if isinstance(remote_identity_config, list):
603
+ # Some clouds (e.g., AWS) support specifying multiple service accounts
604
+ # chosen based on the cluster name. Do the matching here to pick the
605
+ # correct one.
606
+ for profile in remote_identity_config:
810
607
  if fnmatch.fnmatchcase(cluster_name, list(profile.keys())[0]):
811
608
  remote_identity = list(profile.values())[0]
812
609
  break
813
610
  if remote_identity != schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value:
814
- if not cloud.supports_service_account_on_remote():
611
+ # If LOCAL_CREDENTIALS is not specified, we add the cloud to the
612
+ # excluded_clouds set, but we must also check if the cloud supports
613
+ # service accounts.
614
+ if remote_identity == schemas.RemoteIdentityOptions.NO_UPLOAD.value:
615
+ # If NO_UPLOAD is specified, fall back to default remote identity
616
+ # for downstream logic but add it to excluded_clouds to skip
617
+ # credential file uploads.
618
+ remote_identity = schemas.get_default_remote_identity(
619
+ str(cloud).lower())
620
+ elif not cloud.supports_service_account_on_remote():
815
621
  raise exceptions.InvalidCloudConfigs(
816
622
  'remote_identity: SERVICE_ACCOUNT is specified in '
817
623
  f'{skypilot_config.loaded_config_path!r} for {cloud}, but it '
818
624
  'is not supported by this cloud. Remove the config or set: '
819
625
  '`remote_identity: LOCAL_CREDENTIALS`.')
820
- excluded_clouds = [cloud]
626
+ if isinstance(cloud, clouds.Kubernetes):
627
+ if skypilot_config.get_nested(
628
+ ('kubernetes', 'allowed_contexts'), None) is None:
629
+ excluded_clouds.add(cloud)
630
+ else:
631
+ excluded_clouds.add(cloud)
632
+
633
+ for cloud_str, cloud_obj in registry.CLOUD_REGISTRY.items():
634
+ remote_identity_config = skypilot_config.get_nested(
635
+ (cloud_str.lower(), 'remote_identity'), None)
636
+ if remote_identity_config:
637
+ if (remote_identity_config ==
638
+ schemas.RemoteIdentityOptions.NO_UPLOAD.value):
639
+ excluded_clouds.add(cloud_obj)
640
+
821
641
  credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)
822
642
 
823
- auth_config = {'ssh_private_key': auth.PRIVATE_SSH_KEY_PATH}
643
+ private_key_path, _ = auth.get_or_generate_keys()
644
+ auth_config = {'ssh_private_key': private_key_path}
824
645
  region_name = resources_vars.get('region')
825
646
 
826
647
  yaml_path = _get_yaml_path_from_cluster_name(cluster_name)
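The remote-identity handling rewritten in this hunk distinguishes four cases: LOCAL_CREDENTIALS, SERVICE_ACCOUNT, a custom service account, and NO_UPLOAD, with per-cluster matching when a list of profiles is given. For illustration, a ~/.sky/config.yaml fragment exercising two of these options might look like the following; which clouds accept which option varies, so treat this as a sketch rather than the full schema:

    aws:
      remote_identity: SERVICE_ACCOUNT   # SkyPilot-managed service account
    kubernetes:
      remote_identity: NO_UPLOAD         # skip uploading local credential files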
@@ -854,11 +675,6 @@ def write_cluster_config(
854
675
 
855
676
  # User-supplied global instance tags from ~/.sky/config.yaml.
856
677
  labels = skypilot_config.get_nested((str(cloud).lower(), 'labels'), {})
857
- # Deprecated: instance_tags have been replaced by labels. For backward
858
- # compatibility, we support them and the schema allows them only if
859
- # `labels` are not specified. This should be removed after 0.7.0.
860
- labels = skypilot_config.get_nested((str(cloud).lower(), 'instance_tags'),
861
- labels)
862
678
  # labels is a dict, which is guaranteed by the type check in
863
679
  # schemas.py
864
680
  assert isinstance(labels, dict), labels
@@ -873,6 +689,11 @@ def write_cluster_config(
873
689
  f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", encoding="utf-8"))\''
874
690
  )
875
691
 
692
+ # We disable conda auto-activation if the user has specified a docker image
693
+ # to use, which is likely to already have a conda environment activated.
694
+ conda_auto_activate = ('true' if to_provision.extract_docker_image() is None
695
+ else 'false')
696
+
876
697
  # Use a tmp file path to avoid incomplete YAML file being re-used in the
877
698
  # future.
878
699
  tmp_yaml_path = yaml_path + '.tmp'
@@ -907,16 +728,21 @@ def write_cluster_config(
907
728
  'specific_reservations': specific_reservations,
908
729
 
909
730
  # Conda setup
910
- 'conda_installation_commands':
911
- constants.CONDA_INSTALLATION_COMMANDS,
912
731
  # We should not use `.format`, as it contains '{}' as the bash
913
732
  # syntax.
733
+ 'conda_installation_commands':
734
+ constants.CONDA_INSTALLATION_COMMANDS.replace(
735
+ '{conda_auto_activate}', conda_auto_activate),
914
736
  'ray_skypilot_installation_commands':
915
737
  (constants.RAY_SKYPILOT_INSTALLATION_COMMANDS.replace(
916
738
  '{sky_wheel_hash}',
917
739
  wheel_hash).replace('{cloud}',
918
740
  str(cloud).lower())),
919
-
741
+ 'skypilot_wheel_installation_commands':
742
+ constants.SKYPILOT_WHEEL_INSTALLATION_COMMANDS.replace(
743
+ '{sky_wheel_hash}',
744
+ wheel_hash).replace('{cloud}',
745
+ str(cloud).lower()),
920
746
  # Port of Ray (GCS server).
921
747
  # Ray's default port 6379 is conflicted with Redis.
922
748
  'ray_port': constants.SKY_REMOTE_RAY_PORT,
@@ -945,7 +771,7 @@ def write_cluster_config(
945
771
  'sky_local_path': str(local_wheel_path),
946
772
  # Add yaml file path to the template variables.
947
773
  'sky_ray_yaml_remote_path':
948
- cluster_yaml_utils.SKY_CLUSTER_YAML_REMOTE_PATH,
774
+ cluster_utils.SKY_CLUSTER_YAML_REMOTE_PATH,
949
775
  'sky_ray_yaml_local_path': tmp_yaml_path,
950
776
  'sky_version': str(version.parse(sky.__version__)),
951
777
  'sky_wheel_hash': wheel_hash,
@@ -955,17 +781,33 @@ def write_cluster_config(
955
781
  output_path=tmp_yaml_path)
956
782
  config_dict['cluster_name'] = cluster_name
957
783
  config_dict['ray'] = yaml_path
784
+
785
+ # Add kubernetes config fields from ~/.sky/config
786
+ if isinstance(cloud, clouds.Kubernetes):
787
+ kubernetes_utils.combine_pod_config_fields(
788
+ tmp_yaml_path,
789
+ cluster_config_overrides=to_provision.cluster_config_overrides)
790
+ kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
791
+ yaml_obj = common_utils.read_yaml(tmp_yaml_path)
792
+ pod_config = yaml_obj['available_node_types']['ray_head_default'][
793
+ 'node_config']
794
+ valid, message = kubernetes_utils.check_pod_config(pod_config)
795
+ if not valid:
796
+ raise exceptions.InvalidCloudConfigs(
797
+ f'Invalid pod_config. Details: {message}')
798
+
958
799
  if dryrun:
959
800
  # If dryrun, return the unfinished tmp yaml path.
960
801
  config_dict['ray'] = tmp_yaml_path
802
+ try:
803
+ config_dict['config_hash'] = _deterministic_cluster_yaml_hash(
804
+ tmp_yaml_path)
805
+ except Exception as e: # pylint: disable=broad-except
806
+ logger.warning(f'Failed to calculate config_hash: {e}')
807
+ logger.debug('Full exception:', exc_info=e)
961
808
  return config_dict
962
809
  _add_auth_to_cluster_config(cloud, tmp_yaml_path)
963
810
 
964
- # Add kubernetes config fields from ~/.sky/config
965
- if isinstance(cloud, clouds.Kubernetes):
966
- kubernetes_utils.combine_pod_config_fields(tmp_yaml_path)
967
- kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
968
-
969
811
  # Restore the old yaml content for backward compatibility.
970
812
  if os.path.exists(yaml_path) and keep_launch_fields_in_existing_config:
971
813
  with open(yaml_path, 'r', encoding='utf-8') as f:
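Earlier in this hunk, Kubernetes pod config fields from ~/.sky/config are merged into the generated YAML and validated with kubernetes_utils.check_pod_config before the dryrun return. An illustrative pod_config override is shown below; the accepted fields are whatever that validation allows, so this fragment is an assumption about typical usage rather than a complete reference:

    kubernetes:
      pod_config:
        metadata:
          labels:
            team: ml-infra
        spec:
          runtimeClassName: nvidia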
@@ -979,7 +821,22 @@ def write_cluster_config(
979
821
  with open(tmp_yaml_path, 'w', encoding='utf-8') as f:
980
822
  f.write(restored_yaml_content)
981
823
 
982
- config_dict['cluster_name_on_cloud'] = cluster_name_on_cloud
824
+ # Read the cluster name from the tmp yaml file, to take the backward
825
+ # compatibility restoration above into account.
826
+ # TODO: remove this after 2 minor releases, 0.10.0.
827
+ yaml_config = common_utils.read_yaml(tmp_yaml_path)
828
+ config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
829
+
830
+ # Make sure to do this before we optimize file mounts. Optimization is
831
+ # non-deterministic, but everything else before this point should be
832
+ # deterministic.
833
+ try:
834
+ config_dict['config_hash'] = _deterministic_cluster_yaml_hash(
835
+ tmp_yaml_path)
836
+ except Exception as e: # pylint: disable=broad-except
837
+ logger.warning('Failed to calculate config_hash: '
838
+ f'{common_utils.format_exception(e)}')
839
+ logger.debug('Full exception:', exc_info=e)
983
840
 
984
841
  # Optimization: copy the contents of source files in file_mounts to a
985
842
  # special dir, and upload that as the only file_mount instead. Delay
@@ -1004,13 +861,20 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
1004
861
  """
1005
862
  config = common_utils.read_yaml(cluster_config_file)
1006
863
  # Check the availability of the cloud type.
1007
- if isinstance(cloud, (clouds.AWS, clouds.OCI, clouds.SCP, clouds.Vsphere,
1008
- clouds.Cudo, clouds.Paperspace)):
864
+ if isinstance(cloud, (
865
+ clouds.AWS,
866
+ clouds.OCI,
867
+ clouds.SCP,
868
+ clouds.Vsphere,
869
+ clouds.Cudo,
870
+ clouds.Paperspace,
871
+ clouds.Azure,
872
+ clouds.DO,
873
+ clouds.Nebius,
874
+ )):
1009
875
  config = auth.configure_ssh_info(config)
1010
876
  elif isinstance(cloud, clouds.GCP):
1011
877
  config = auth.setup_gcp_authentication(config)
1012
- elif isinstance(cloud, clouds.Azure):
1013
- config = auth.setup_azure_authentication(config)
1014
878
  elif isinstance(cloud, clouds.Lambda):
1015
879
  config = auth.setup_lambda_authentication(config)
1016
880
  elif isinstance(cloud, clouds.Kubernetes):
@@ -1019,6 +883,8 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
1019
883
  config = auth.setup_ibm_authentication(config)
1020
884
  elif isinstance(cloud, clouds.RunPod):
1021
885
  config = auth.setup_runpod_authentication(config)
886
+ elif isinstance(cloud, clouds.Vast):
887
+ config = auth.setup_vast_authentication(config)
1022
888
  elif isinstance(cloud, clouds.Fluidstack):
1023
889
  config = auth.setup_fluidstack_authentication(config)
1024
890
  else:
@@ -1026,10 +892,6 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
1026
892
  common_utils.dump_yaml(cluster_config_file, config)
1027
893
 
1028
894
 
1029
- def get_run_timestamp() -> str:
1030
- return 'sky-' + datetime.now().strftime('%Y-%m-%d-%H-%M-%S-%f')
1031
-
1032
-
1033
895
  def get_timestamp_from_run_timestamp(run_timestamp: str) -> float:
1034
896
  return datetime.strptime(
1035
897
  run_timestamp.partition('-')[2], '%Y-%m-%d-%H-%M-%S-%f').timestamp()
@@ -1084,6 +946,135 @@ def _count_healthy_nodes_from_ray(output: str,
1084
946
  return ready_head, ready_workers
1085
947
 
1086
948
 
949
+ @timeline.event
950
+ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
951
+ """Hash the cluster yaml and contents of file mounts to a unique string.
952
+
953
+ Two invocations of this function should return the same string if and only
954
+ if the contents of the yaml are the same and the file contents of all the
955
+ file_mounts specified in the yaml are the same.
956
+
957
+ Limitations:
958
+ - This function can be expensive if the file mounts are large. (E.g. a few
959
+ seconds for ~1GB.) This should be okay since we expect that the
960
+ file_mounts in the cluster yaml (the wheel and cloud credentials) will be
961
+ small.
962
+ - Symbolic links are not explicitly handled. Some symbolic link changes may
963
+ not be detected.
964
+
965
+ Implementation: We create a byte sequence that captures the state of the
966
+ yaml file and all the files in the file mounts, then hash the byte sequence.
967
+
968
+ The format of the byte sequence is:
969
+ 32 bytes - sha256 hash of the yaml
970
+ for each file mount:
971
+ file mount remote destination (UTF-8), \0
972
+ if the file mount source is a file:
973
+ 'file' encoded to UTF-8
974
+ 32 byte sha256 hash of the file contents
975
+ if the file mount source is a directory:
976
+ 'dir' encoded to UTF-8
977
+ for each directory and subdirectory within the file mount (starting from
978
+ the root and descending recursively):
979
+ name of the directory (UTF-8), \0
980
+ name of each subdirectory within the directory (UTF-8) terminated by \0
981
+ \0
982
+ for each file in the directory:
983
+ name of the file (UTF-8), \0
984
+ 32 bytes - sha256 hash of the file contents
985
+ \0
986
+ if the file mount source is something else or does not exist, nothing
987
+ \0\0
988
+
989
+ Rather than constructing the whole byte sequence, which may be quite large,
990
+ we construct it incrementally by using hash.update() to add new bytes.
991
+ """
992
+
993
+ # Load the yaml contents so that we can directly remove keys.
994
+ yaml_config = common_utils.read_yaml(yaml_path)
995
+ for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
996
+ dict_to_remove_from = yaml_config
997
+ found_key = True
998
+ for key in key_list[:-1]:
999
+ if (not isinstance(dict_to_remove_from, dict) or
1000
+ key not in dict_to_remove_from):
1001
+ found_key = False
1002
+ break
1003
+ dict_to_remove_from = dict_to_remove_from[key]
1004
+ if found_key and key_list[-1] in dict_to_remove_from:
1005
+ dict_to_remove_from.pop(key_list[-1])
1006
+
1007
+ def _hash_file(path: str) -> bytes:
1008
+ return common_utils.hash_file(path, 'sha256').digest()
1009
+
1010
+ config_hash = hashlib.sha256()
1011
+
1012
+ yaml_hash = hashlib.sha256(
1013
+ common_utils.dump_yaml_str(yaml_config).encode('utf-8'))
1014
+ config_hash.update(yaml_hash.digest())
1015
+
1016
+ file_mounts = yaml_config.get('file_mounts', {})
1017
+ # Remove the file mounts added by the newline.
1018
+ if '' in file_mounts:
1019
+ assert file_mounts[''] == '', file_mounts['']
1020
+ file_mounts.pop('')
1021
+
1022
+ for dst, src in sorted(file_mounts.items()):
1023
+ if src == yaml_path:
1024
+ # Skip the yaml file itself. We have already hashed a modified
1025
+ # version of it. The file may include fields we don't want to hash.
1026
+ continue
1027
+
1028
+ expanded_src = os.path.expanduser(src)
1029
+ config_hash.update(dst.encode('utf-8') + b'\0')
1030
+
1031
+ # If the file mount source is a symlink, this should be true. In that
1032
+ # case we hash the contents of the symlink destination.
1033
+ if os.path.isfile(expanded_src):
1034
+ config_hash.update('file'.encode('utf-8'))
1035
+ config_hash.update(_hash_file(expanded_src))
1036
+
1037
+ # This can also be a symlink to a directory. os.walk will treat it as a
1038
+ # normal directory and list the contents of the symlink destination.
1039
+ elif os.path.isdir(expanded_src):
1040
+ config_hash.update('dir'.encode('utf-8'))
1041
+
1042
+ # Aside from expanded_src, os.walk will list symlinks to directories
1043
+ # but will not recurse into them.
1044
+ for (dirpath, dirnames, filenames) in os.walk(expanded_src):
1045
+ config_hash.update(dirpath.encode('utf-8') + b'\0')
1046
+
1047
+ # Note: inplace sort will also affect the traversal order of
1048
+ # os.walk. We need it so that the os.walk order is
1049
+ # deterministic.
1050
+ dirnames.sort()
1051
+ # This includes symlinks to directories. os.walk will recurse
1052
+ # into all the directories but not the symlinks. We don't hash
1053
+ # the link destination, so if a symlink to a directory changes,
1054
+ # we won't notice.
1055
+ for dirname in dirnames:
1056
+ config_hash.update(dirname.encode('utf-8') + b'\0')
1057
+ config_hash.update(b'\0')
1058
+
1059
+ filenames.sort()
1060
+ # This includes symlinks to files. We could hash the symlink
1061
+ # destination itself but instead just hash the destination
1062
+ # contents.
1063
+ for filename in filenames:
1064
+ config_hash.update(filename.encode('utf-8') + b'\0')
1065
+ config_hash.update(
1066
+ _hash_file(os.path.join(dirpath, filename)))
1067
+ config_hash.update(b'\0')
1068
+
1069
+ else:
1070
+ logger.debug(
1071
+ f'Unexpected file_mount that is not a file or dir: {src}')
1072
+
1073
+ config_hash.update(b'\0\0')
1074
+
1075
+ return config_hash.hexdigest()
1076
+
1077
+
1087
1078
  def get_docker_user(ip: str, cluster_config_file: str) -> str:
1088
1079
  """Find docker container username."""
1089
1080
  ssh_credentials = ssh_credential_from_yaml(cluster_config_file)
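The _deterministic_cluster_yaml_hash docstring above notes that the byte sequence is never materialized in full; it is fed to the hash incrementally via hash.update(). A minimal standard-library illustration of why the streamed and one-shot forms agree:

    import hashlib

    one_shot = hashlib.sha256(b'cluster-yaml' + b'\0' + b'file-mounts')
    streamed = hashlib.sha256()
    streamed.update(b'cluster-yaml')
    streamed.update(b'\0')
    streamed.update(b'file-mounts')
    assert one_shot.hexdigest() == streamed.hexdigest()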
@@ -1139,7 +1130,8 @@ def wait_until_ray_cluster_ready(
  runner = command_runner.SSHCommandRunner(node=(head_ip, 22),
  **ssh_credentials)
  with rich_utils.safe_status(
- '[bold cyan]Waiting for workers...') as worker_status:
+ ux_utils.spinner_message('Waiting for workers',
+ log_path=log_path)) as worker_status:
  while True:
  rc, output, stderr = runner.run(
  instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
@@ -1155,9 +1147,11 @@ def wait_until_ray_cluster_ready(
  ready_head, ready_workers = _count_healthy_nodes_from_ray(
  output, is_local_cloud=is_local_cloud)

- worker_status.update('[bold cyan]'
- f'{ready_workers} out of {num_nodes - 1} '
- 'workers ready')
+ worker_status.update(
+ ux_utils.spinner_message(
+ f'{ready_workers} out of {num_nodes - 1} '
+ 'workers ready',
+ log_path=log_path))

  # In the local case, ready_head=0 and ready_workers=num_nodes. This
  # is because there is no matching regex for _LAUNCHED_HEAD_PATTERN.
@@ -1207,7 +1201,7 @@ def wait_until_ray_cluster_ready(


  def ssh_credential_from_yaml(
- cluster_yaml: str,
+ cluster_yaml: Optional[str],
  docker_user: Optional[str] = None,
  ssh_user: Optional[str] = None,
  ) -> Dict[str, Any]:
@@ -1219,6 +1213,8 @@ def ssh_credential_from_yaml(
  the docker container.
  ssh_user: override the ssh_user in the cluster yaml.
  """
+ if cluster_yaml is None:
+ return dict()
  config = common_utils.read_yaml(cluster_yaml)
  auth_section = config['auth']
  if ssh_user is None:
@@ -1226,6 +1222,12 @@ def ssh_credential_from_yaml(
  ssh_private_key = auth_section.get('ssh_private_key')
  ssh_control_name = config.get('cluster_name', '__default__')
  ssh_proxy_command = auth_section.get('ssh_proxy_command')
+
+ # Update the ssh_user placeholder in proxy command, if required
+ if (ssh_proxy_command is not None and
+ constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
+ ssh_proxy_command = ssh_proxy_command.replace(
+ constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
  credentials = {
  'ssh_user': ssh_user,
  'ssh_private_key': ssh_private_key,
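Editor's note: the placeholder substitution added above is a plain string replace on the proxy command. A small sketch of the pattern; the placeholder value below is an assumption for illustration only (the real one lives in sky's constants module).

    # Assumed placeholder value, not SkyPilot's actual constant.
    SKY_SSH_USER_PLACEHOLDER = '{ssh_user}'

    def fill_proxy_command(proxy_command: str, ssh_user: str) -> str:
        """Replace the ssh_user placeholder in an SSH proxy command, if present."""
        if SKY_SSH_USER_PLACEHOLDER in proxy_command:
            return proxy_command.replace(SKY_SSH_USER_PLACEHOLDER, ssh_user)
        return proxy_command

    # Example:
    # fill_proxy_command('ssh -W %h:%p {ssh_user}@jump-host', 'ubuntu')
    #   -> 'ssh -W %h:%p ubuntu@jump-host'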
@@ -1242,18 +1244,18 @@ def ssh_credential_from_yaml(


  def parallel_data_transfer_to_nodes(
- runners: List[command_runner.CommandRunner],
- source: Optional[str],
- target: str,
- cmd: Optional[str],
- run_rsync: bool,
- *,
- action_message: str,
- # Advanced options.
- log_path: str = os.devnull,
- stream_logs: bool = False,
- source_bashrc: bool = False,
- ):
+ runners: List[command_runner.CommandRunner],
+ source: Optional[str],
+ target: str,
+ cmd: Optional[str],
+ run_rsync: bool,
+ *,
+ action_message: str,
+ # Advanced options.
+ log_path: str = os.devnull,
+ stream_logs: bool = False,
+ source_bashrc: bool = False,
+ num_threads: Optional[int] = None):
  """Runs a command on all nodes and optionally runs rsync from src->dst.

  Args:
@@ -1265,8 +1267,8 @@ def parallel_data_transfer_to_nodes(
  log_path: str; Path to the log file
  stream_logs: bool; Whether to stream logs to stdout
  source_bashrc: bool; Source bashrc before running the command.
+ num_threads: Optional[int]; Number of threads to use.
  """
- fore = colorama.Fore
  style = colorama.Style

  origin_source = source
@@ -1303,12 +1305,10 @@ def parallel_data_transfer_to_nodes(

  num_nodes = len(runners)
  plural = 's' if num_nodes > 1 else ''
- message = (f'{fore.CYAN}{action_message} (to {num_nodes} node{plural})'
- f': {style.BRIGHT}{origin_source}{style.RESET_ALL} -> '
- f'{style.BRIGHT}{target}{style.RESET_ALL}')
+ message = (f' {style.DIM}{action_message} (to {num_nodes} node{plural})'
+ f': {origin_source} -> {target}{style.RESET_ALL}')
  logger.info(message)
- with rich_utils.safe_status(f'[bold cyan]{action_message}[/]'):
- subprocess_utils.run_in_parallel(_sync_node, runners)
+ subprocess_utils.run_in_parallel(_sync_node, runners, num_threads)


  def check_local_gpus() -> bool:
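Editor's note: the hunk above threads a new num_threads argument through to the parallel sync call. A minimal sketch of fanning a per-node action out over a thread pool with an optional thread cap, similar in spirit to subprocess_utils.run_in_parallel; the function and names below are illustrative, not SkyPilot's actual helper.

    from concurrent.futures import ThreadPoolExecutor
    from typing import Callable, List, Optional, TypeVar

    T = TypeVar('T')
    R = TypeVar('R')

    def run_in_parallel(func: Callable[[T], R],
                        args: List[T],
                        num_threads: Optional[int] = None) -> List[R]:
        """Apply func to each element of args concurrently, keeping input order."""
        if not args:
            return []
        max_workers = num_threads if num_threads is not None else len(args)
        with ThreadPoolExecutor(max_workers=max_workers) as pool:
            return list(pool.map(func, args))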
@@ -1335,12 +1335,6 @@ def check_local_gpus() -> bool:
  return is_functional


- def generate_cluster_name():
- # TODO: change this ID formatting to something more pleasant.
- # User name is helpful in non-isolated accounts, e.g., GCP, Azure.
- return f'sky-{uuid.uuid4().hex[:4]}-{common_utils.get_cleaned_username()}'
-
-
  def _query_head_ip_with_retries(cluster_yaml: str,
  max_attempts: int = 1) -> str:
  """Returns the IP of the head node by querying the cloud.
@@ -1406,8 +1400,8 @@ def get_node_ips(cluster_yaml: str,
  """
  ray_config = common_utils.read_yaml(cluster_yaml)
  # Use the new provisioner for AWS.
- provider_name = cluster_yaml_utils.get_provider_name(ray_config)
- cloud = cloud_registry.CLOUD_REGISTRY.from_str(provider_name)
+ provider_name = cluster_utils.get_provider_name(ray_config)
+ cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
  assert cloud is not None, provider_name

  if cloud.PROVISIONER_VERSION >= clouds.ProvisionerVersion.SKYPILOT:
@@ -1506,6 +1500,7 @@ def check_network_connection():
  'Network seems down.') from e


+ @timeline.event
  def check_owner_identity(cluster_name: str) -> None:
  """Check if current user is the same as the user who created the cluster.

@@ -1525,58 +1520,65 @@ def check_owner_identity(cluster_name: str) -> None:
  return

  cloud = handle.launched_resources.cloud
- current_user_identity = cloud.get_current_user_identity()
+ user_identities = cloud.get_user_identities()
  owner_identity = record['owner']
- if current_user_identity is None:
+ if user_identities is None:
  # Skip the check if the cloud does not support user identity.
  return
  # The user identity can be None, if the cluster is created by an older
  # version of SkyPilot. In that case, we set the user identity to the
- # current one.
+ # current active one.
  # NOTE: a user who upgrades SkyPilot and switches to a new cloud identity
  # immediately without `sky status --refresh` first, will cause a leakage
  # of the existing cluster. We deem this an acceptable tradeoff mainly
  # because multi-identity is not common (at least at the moment).
  if owner_identity is None:
  global_user_state.set_owner_identity_for_cluster(
- cluster_name, current_user_identity)
+ cluster_name, user_identities[0])
  else:
  assert isinstance(owner_identity, list)
  # It is OK if the owner identity is shorter, which will happen when
  # the cluster is launched before #1808. In that case, we only check
  # the same length (zip will stop at the shorter one).
- for i, (owner,
- current) in enumerate(zip(owner_identity,
- current_user_identity)):
- # Clean up the owner identity for the backslash and newlines, caused
- # by the cloud CLI output, e.g. gcloud.
- owner = owner.replace('\n', '').replace('\\', '')
- if owner == current:
- if i != 0:
- logger.warning(
- f'The cluster was owned by {owner_identity}, but '
- f'a new identity {current_user_identity} is activated. We still '
- 'allow the operation as the two identities are likely to have '
- 'the same access to the cluster. Please be aware that this can '
- 'cause unexpected cluster leakage if the two identities are not '
- 'actually equivalent (e.g., belong to the same person).'
- )
- if i != 0 or len(owner_identity) != len(current_user_identity):
- # We update the owner of a cluster, when:
- # 1. The strictest identty (i.e. the first one) does not
- # match, but the latter ones match.
- # 2. The length of the two identities are different, which
- # will only happen when the cluster is launched before #1808.
- # Update the user identity to avoid showing the warning above
- # again.
- global_user_state.set_owner_identity_for_cluster(
- cluster_name, current_user_identity)
- return # The user identity matches.
+ for identity in user_identities:
+ for i, (owner, current) in enumerate(zip(owner_identity, identity)):
+ # Clean up the owner identity for the backslash and newlines, caused
+ # by the cloud CLI output, e.g. gcloud.
+ owner = owner.replace('\n', '').replace('\\', '')
+ if owner == current:
+ if i != 0:
+ logger.warning(
+ f'The cluster was owned by {owner_identity}, but '
+ f'a new identity {identity} is activated. We still '
+ 'allow the operation as the two identities are '
+ 'likely to have the same access to the cluster. '
+ 'Please be aware that this can cause unexpected '
+ 'cluster leakage if the two identities are not '
+ 'actually equivalent (e.g., belong to the same '
+ 'person).')
+ if i != 0 or len(owner_identity) != len(identity):
+ # We update the owner of a cluster, when:
+ # 1. The strictest identty (i.e. the first one) does not
+ # match, but the latter ones match.
+ # 2. The length of the two identities are different,
+ # which will only happen when the cluster is launched
+ # before #1808. Update the user identity to avoid
+ # showing the warning above again.
+ global_user_state.set_owner_identity_for_cluster(
+ cluster_name, identity)
+ return # The user identity matches.
+ # Generate error message if no match found
+ if len(user_identities) == 1:
+ err_msg = f'the activated identity is {user_identities[0]!r}.'
+ else:
+ err_msg = (f'available identities are {user_identities!r}.')
+ if cloud.is_same_cloud(clouds.Kubernetes()):
+ err_msg += (' Check your kubeconfig file and make sure the '
+ 'correct context is available.')
  with ux_utils.print_exception_no_traceback():
  raise exceptions.ClusterOwnerIdentityMismatchError(
  f'{cluster_name!r} ({cloud}) is owned by account '
- f'{owner_identity!r}, but the activated account '
- f'is {current_user_identity!r}.')
+ f'{owner_identity!r}, but ' + err_msg)


  def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:
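Editor's note: the check_owner_identity hunk above generalizes the old single-identity comparison to a list of active identities, accepting a cluster if any active identity matches the stored owner on any positional component. A stripped-down sketch of that matching rule; names are illustrative and the real code additionally warns and rewrites the stored owner.

    from typing import List, Optional

    def _clean(component: str) -> str:
        # Strip newlines/backslashes that some cloud CLIs (e.g. gcloud) emit.
        return component.replace('\n', '').replace('\\', '')

    def find_matching_identity(owner_identity: List[str],
                               user_identities: List[List[str]]
                               ) -> Optional[List[str]]:
        """Return the first active identity that matches the cluster owner, if any."""
        for identity in user_identities:
            for owner, current in zip(owner_identity, identity):
                if _clean(owner) == current:
                    return identity
        return None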
@@ -1648,14 +1650,14 @@ def check_can_clone_disk_and_override_task(
  The task to use and the resource handle of the source cluster.

  Raises:
- ValueError: If the source cluster does not exist.
+ exceptions.ClusterDoesNotExist: If the source cluster does not exist.
  exceptions.NotSupportedError: If the source cluster is not valid or the
  task is not compatible to clone disk from the source cluster.
  """
  source_cluster_status, handle = refresh_cluster_status_handle(cluster_name)
  if source_cluster_status is None:
  with ux_utils.print_exception_no_traceback():
- raise ValueError(
+ raise exceptions.ClusterDoesNotExist(
  f'Cannot find cluster {cluster_name!r} to clone disk from.')

  if not isinstance(handle, backends.CloudVmRayResourceHandle):
@@ -1667,7 +1669,7 @@ def check_can_clone_disk_and_override_task(
  with ux_utils.print_exception_no_traceback():
  raise exceptions.NotSupportedError(
  f'Cannot clone disk from cluster {cluster_name!r} '
- f'({source_cluster_status!r}). Please stop the '
+ f'({source_cluster_status.value!r}). Please stop the '
  f'cluster first: sky stop {cluster_name}')

  if target_cluster_name is not None:
@@ -1747,18 +1749,44 @@ def check_can_clone_disk_and_override_task(
  return task, handle


- def _update_cluster_status_no_lock(
- cluster_name: str) -> Optional[Dict[str, Any]]:
- """Updates the status of the cluster.
+ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
+ """Update the cluster status.
+
+ The cluster status is updated by checking ray cluster and real status from
+ cloud.
+
+ The function will update the cached cluster status in the global state. For
+ the design of the cluster status and transition, please refer to the
+ sky/design_docs/cluster_status.md
+
+ Note: this function is only safe to be called when the caller process is
+ holding the cluster lock, which means no other processes are modifying the
+ cluster.
+
+ Returns:
+ If the cluster is terminated or does not exist, return None. Otherwise
+ returns the input record with status and handle potentially updated.

  Raises:
+ exceptions.ClusterOwnerIdentityMismatchError: if the current user is
+ not the same as the user who created the cluster.
+ exceptions.CloudUserIdentityError: if we fail to get the current user
+ identity.
  exceptions.ClusterStatusFetchingError: the cluster status cannot be
- fetched from the cloud provider.
+ fetched from the cloud provider or there are leaked nodes causing
+ the node number larger than expected.
  """
  record = global_user_state.get_cluster_from_name(cluster_name)
  if record is None:
  return None
  handle = record['handle']
+ if handle.cluster_yaml is None:
+ # Remove cluster from db since this cluster does not have a config file
+ # or any other ongoing requests
+ global_user_state.remove_cluster(cluster_name, terminate=True)
+ logger.debug(f'Cluster {cluster_name!r} has no YAML file. '
+ 'Removing the cluster from cache.')
+ return None
  if not isinstance(handle, backends.CloudVmRayResourceHandle):
  return record
  cluster_name = handle.cluster_name
@@ -1813,13 +1841,12 @@ def _update_cluster_status_no_lock(
  logger.debug(
  f'Refreshing status ({cluster_name!r}) failed to get IPs.')
  except RuntimeError as e:
- logger.debug(str(e))
+ logger.debug(common_utils.format_exception(e))
  except Exception as e: # pylint: disable=broad-except
  # This can be raised by `external_ssh_ports()`, due to the
  # underlying call to kubernetes API.
- logger.debug(
- f'Refreshing status ({cluster_name!r}) failed: '
- f'{common_utils.format_exception(e, use_bracket=True)}')
+ logger.debug(f'Refreshing status ({cluster_name!r}) failed: ',
+ exc_info=e)
  return False

  # Determining if the cluster is healthy (UP):
@@ -1843,9 +1870,27 @@ def _update_cluster_status_no_lock(
  requested_resources=None,
  ready=True,
  is_launch=False)
- return record
+ return global_user_state.get_cluster_from_name(cluster_name)

  # All cases below are transitioning the cluster to non-UP states.
+
+ if (not node_statuses and handle.launched_resources.cloud.STATUS_VERSION >=
+ clouds.StatusVersion.SKYPILOT):
+ # Note: launched_at is set during sky launch, even on an existing
+ # cluster. This will catch the case where the cluster was terminated on
+ # the cloud and restarted by sky launch.
+ time_since_launch = time.time() - record['launched_at']
+ if (record['status'] == status_lib.ClusterStatus.INIT and
+ time_since_launch < _LAUNCH_DOUBLE_CHECK_WINDOW):
+ # It's possible the instances for this cluster were just created,
+ # and haven't appeared yet in the cloud API/console. Wait for a bit
+ # and check again. This is a best-effort leak prevention check.
+ # See https://github.com/skypilot-org/skypilot/issues/4431.
+ time.sleep(_LAUNCH_DOUBLE_CHECK_DELAY)
+ node_statuses = _query_cluster_status_via_cloud_api(handle)
+ # Note: even if all the node_statuses are UP now, we will still
+ # consider this cluster abnormal, and its status will be INIT.
+
  if len(node_statuses) > handle.launched_nodes:
  # Unexpected: in the queried region more than 1 cluster with the same
  # constructed name tag returned. This will typically not happen unless
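Editor's note: the leak-prevention hunk above re-queries the cloud once when a freshly launched cluster reports no nodes, before concluding the cluster is gone. A compact sketch of that "double check" pattern; the constants and the query callback are stand-ins for the module-level values used in the hunk.

    import time
    from typing import Callable, List

    LAUNCH_DOUBLE_CHECK_WINDOW = 60   # assumed: seconds since launch to stay suspicious
    LAUNCH_DOUBLE_CHECK_DELAY = 1     # assumed: seconds to wait before re-querying

    def double_check_node_statuses(node_statuses: List[str],
                                   launched_at: float,
                                   query_fn: Callable[[], List[str]]) -> List[str]:
        """Re-query the cloud once if an apparently-new cluster shows no nodes."""
        time_since_launch = time.time() - launched_at
        if not node_statuses and time_since_launch < LAUNCH_DOUBLE_CHECK_WINDOW:
            time.sleep(LAUNCH_DOUBLE_CHECK_DELAY)
            node_statuses = query_fn()
        return node_statuses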
@@ -1874,13 +1919,15 @@ def _update_cluster_status_no_lock(
  f'{colorama.Style.RESET_ALL}')
  assert len(node_statuses) <= handle.launched_nodes

- # If the node_statuses is empty, all the nodes are terminated. We can
- # safely set the cluster status to TERMINATED. This handles the edge case
- # where the cluster is terminated by the user manually through the UI.
+ # If the node_statuses is empty, it should mean that all the nodes are
+ # terminated and we can set the cluster status to TERMINATED. This handles
+ # the edge case where the cluster is terminated by the user manually through
+ # the UI.
  to_terminate = not node_statuses

- # A cluster is considered "abnormal", if not all nodes are TERMINATED or
- # not all nodes are STOPPED. We check that with the following logic:
+ # A cluster is considered "abnormal", if some (but not all) nodes are
+ # TERMINATED, or not all nodes are STOPPED. We check that with the following
+ # logic:
  # * Not all nodes are terminated and there's at least one node
  # terminated; or
  # * Any of the non-TERMINATED nodes is in a non-STOPPED status.
@@ -1892,6 +1939,8 @@ def _update_cluster_status_no_lock(
  # cluster is probably down.
  # * The cluster is partially terminated or stopped should be considered
  # abnormal.
+ # * The cluster is partially or completely in the INIT state, which means
+ # that provisioning was interrupted. This is considered abnormal.
  #
  # An abnormal cluster will transition to INIT and have any autostop setting
  # reset (unless it's autostopping/autodowning).
@@ -1921,7 +1970,8 @@ def _update_cluster_status_no_lock(
  except exceptions.CommandError as e:
  success = False
  if e.returncode == 255:
- logger.debug(f'The cluster is likely {noun}ed.')
+ word = 'autostopped' if noun == 'autostop' else 'autodowned'
+ logger.debug(f'The cluster is likely {word}.')
  reset_local_autostop = False
  except (Exception, SystemExit) as e: # pylint: disable=broad-except
  success = False
@@ -1973,52 +2023,22 @@ def _update_cluster_status_no_lock(
  return global_user_state.get_cluster_from_name(cluster_name)


- def _update_cluster_status(
- cluster_name: str,
- acquire_per_cluster_status_lock: bool,
- cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
- ) -> Optional[Dict[str, Any]]:
- """Update the cluster status.
-
- The cluster status is updated by checking ray cluster and real status from
- cloud.
-
- The function will update the cached cluster status in the global state. For
- the design of the cluster status and transition, please refer to the
- sky/design_docs/cluster_status.md
-
- Args:
- cluster_name: The name of the cluster.
- acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
- before updating the status.
- cluster_status_lock_timeout: The timeout to acquire the per-cluster
- lock.
-
- Returns:
- If the cluster is terminated or does not exist, return None. Otherwise
- returns the input record with status and handle potentially updated.
+ def _must_refresh_cluster_status(
+ record: Dict[str, Any],
+ force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]]
+ ) -> bool:
+ force_refresh_for_cluster = (force_refresh_statuses is not None and
+ record['status'] in force_refresh_statuses)

- Raises:
- exceptions.ClusterOwnerIdentityMismatchError: if the current user is
- not the same as the user who created the cluster.
- exceptions.CloudUserIdentityError: if we fail to get the current user
- identity.
- exceptions.ClusterStatusFetchingError: the cluster status cannot be
- fetched from the cloud provider or there are leaked nodes causing
- the node number larger than expected.
- """
- if not acquire_per_cluster_status_lock:
- return _update_cluster_status_no_lock(cluster_name)
+ use_spot = record['handle'].launched_resources.use_spot
+ has_autostop = (record['status'] != status_lib.ClusterStatus.STOPPED and
+ record['autostop'] >= 0)
+ recently_refreshed = (record['status_updated_at'] is not None and
+ time.time() - record['status_updated_at'] <
+ _CLUSTER_STATUS_CACHE_DURATION_SECONDS)
+ is_stale = (use_spot or has_autostop) and not recently_refreshed

- try:
- with filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name),
- timeout=cluster_status_lock_timeout):
- return _update_cluster_status_no_lock(cluster_name)
- except filelock.Timeout:
- logger.debug('Refreshing status: Failed get the lock for cluster '
- f'{cluster_name!r}. Using the cached status.')
- record = global_user_state.get_cluster_from_name(cluster_name)
- return record
+ return force_refresh_for_cluster or is_stale


  def refresh_cluster_record(
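Editor's note: _must_refresh_cluster_status, added above, decides when a cached record can be trusted. Restated as a standalone helper so the decision table is easier to read: a refresh is needed if the caller forces it for the cluster's current status, or if the cluster can change underneath us (spot instance, or autostop armed on a non-stopped cluster) and the cached status is older than the cache window. Field names and the cache duration below are illustrative stand-ins for the record dict and module constant.

    import time
    from typing import Any, Dict, Optional, Set

    CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2  # assumed value for the sketch

    def must_refresh(record: Dict[str, Any],
                     force_refresh_statuses: Optional[Set[str]]) -> bool:
        forced = (force_refresh_statuses is not None and
                  record['status'] in force_refresh_statuses)
        volatile = record['use_spot'] or (record['status'] != 'STOPPED' and
                                          record['autostop'] >= 0)
        recently_refreshed = (record['status_updated_at'] is not None and
                              time.time() - record['status_updated_at'] <
                              CLUSTER_STATUS_CACHE_DURATION_SECONDS)
        return forced or (volatile and not recently_refreshed)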
@@ -2030,22 +2050,28 @@ def refresh_cluster_record(
  ) -> Optional[Dict[str, Any]]:
  """Refresh the cluster, and return the possibly updated record.

- This function will also check the owner identity of the cluster, and raise
- exceptions if the current user is not the same as the user who created the
- cluster.
+ The function will update the cached cluster status in the global state. For
+ the design of the cluster status and transition, please refer to the
+ sky/design_docs/cluster_status.md

  Args:
  cluster_name: The name of the cluster.
- force_refresh_statuses: if specified, refresh the cluster if it has one of
- the specified statuses. Additionally, clusters satisfying the
- following conditions will always be refreshed no matter the
- argument is specified or not:
- 1. is a spot cluster, or
- 2. is a non-spot cluster, is not STOPPED, and autostop is set.
+ force_refresh_statuses: if specified, refresh the cluster if it has one
+ of the specified statuses. Additionally, clusters satisfying the
+ following conditions will be refreshed no matter the argument is
+ specified or not:
+ - the most latest available status update is more than
+ _CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
+ 1. the cluster is a spot cluster, or
+ 2. cluster autostop is set and the cluster is not STOPPED.
  acquire_per_cluster_status_lock: Whether to acquire the per-cluster lock
- before updating the status.
+ before updating the status. Even if this is True, the lock may not be
+ acquired if the status does not need to be refreshed.
  cluster_status_lock_timeout: The timeout to acquire the per-cluster
- lock. If timeout, the function will use the cached status.
+ lock. If timeout, the function will use the cached status. If the
+ value is <0, do not timeout (wait for the lock indefinitely). By
+ default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS. Warning:
+ if correctness is required, you must set this to -1.

  Returns:
  If the cluster is terminated or does not exist, return None.
@@ -2066,19 +2092,55 @@ def refresh_cluster_record(
  return None
  check_owner_identity(cluster_name)

- handle = record['handle']
- if isinstance(handle, backends.CloudVmRayResourceHandle):
- use_spot = handle.launched_resources.use_spot
- has_autostop = (record['status'] != status_lib.ClusterStatus.STOPPED and
- record['autostop'] >= 0)
- force_refresh_for_cluster = (force_refresh_statuses is not None and
- record['status'] in force_refresh_statuses)
- if force_refresh_for_cluster or has_autostop or use_spot:
- record = _update_cluster_status(
- cluster_name,
- acquire_per_cluster_status_lock=acquire_per_cluster_status_lock,
- cluster_status_lock_timeout=cluster_status_lock_timeout)
- return record
+ if not isinstance(record['handle'], backends.CloudVmRayResourceHandle):
+ return record
+
+ # The loop logic allows us to notice if the status was updated in the
+ # global_user_state by another process and stop trying to get the lock.
+ # The core loop logic is adapted from FileLock's implementation.
+ lock = filelock.FileLock(CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
+ start_time = time.perf_counter()
+
+ # Loop until we have an up-to-date status or until we acquire the lock.
+ while True:
+ # Check to see if we can return the cached status.
+ if not _must_refresh_cluster_status(record, force_refresh_statuses):
+ return record
+
+ if not acquire_per_cluster_status_lock:
+ return _update_cluster_status(cluster_name)
+
+ # Try to acquire the lock so we can fetch the status.
+ try:
+ with lock.acquire(blocking=False):
+ # Check the cluster status again, since it could have been
+ # updated between our last check and acquiring the lock.
+ record = global_user_state.get_cluster_from_name(cluster_name)
+ if record is None or not _must_refresh_cluster_status(
+ record, force_refresh_statuses):
+ return record
+ # Update and return the cluster status.
+ return _update_cluster_status(cluster_name)
+ except filelock.Timeout:
+ # lock.acquire() will throw a Timeout exception if the lock is not
+ # available and we have blocking=False.
+ pass
+
+ # Logic adapted from FileLock.acquire().
+ # If cluster_status_lock_time is <0, we will never hit this. No timeout.
+ # Otherwise, if we have timed out, return the cached status. This has
+ # the potential to cause correctness issues, but if so it is the
+ # caller's responsibility to set the timeout to -1.
+ if 0 <= cluster_status_lock_timeout < time.perf_counter() - start_time:
+ logger.debug('Refreshing status: Failed get the lock for cluster '
+ f'{cluster_name!r}. Using the cached status.')
+ return record
+ time.sleep(0.05)
+
+ # Refresh for next loop iteration.
+ record = global_user_state.get_cluster_from_name(cluster_name)
+ if record is None:
+ return None


  @timeline.event
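Editor's note: the refresh loop added above polls a file lock without blocking so that it can re-check shared state between attempts and bail out early if another process already refreshed it, falling back to the cached value after an optional timeout. A self-contained sketch of the same lock/retry shape using the filelock package; the callbacks and default timeout are illustrative assumptions.

    import time
    from typing import Any, Callable

    import filelock

    def refresh_with_lock(lock_path: str,
                          needs_refresh: Callable[[], bool],
                          do_refresh: Callable[[], Any],
                          cached: Callable[[], Any],
                          timeout: float = 2.0) -> Any:
        lock = filelock.FileLock(lock_path)
        start = time.perf_counter()
        while True:
            if not needs_refresh():
                return cached()          # someone else refreshed it for us
            try:
                with lock.acquire(blocking=False):
                    if not needs_refresh():
                        return cached()  # re-check after winning the lock
                    return do_refresh()
            except filelock.Timeout:
                pass                     # lock is held elsewhere; keep polling
            if 0 <= timeout < time.perf_counter() - start:
                return cached()          # give up and trust the cached value
            time.sleep(0.05)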
@@ -2141,7 +2203,7 @@ def check_cluster_available(
  """Check if the cluster is available.

  Raises:
- ValueError: if the cluster does not exist.
+ exceptions.ClusterDoesNotExist: if the cluster does not exist.
  exceptions.ClusterNotUpError: if the cluster is not UP.
  exceptions.NotSupportedError: if the cluster is not based on
  CloudVmRayBackend.
@@ -2206,7 +2268,8 @@ def check_cluster_available(
  error_msg += message

  with ux_utils.print_exception_no_traceback():
- raise ValueError(f'{colorama.Fore.YELLOW}{error_msg}{reset}')
+ raise exceptions.ClusterDoesNotExist(
+ f'{colorama.Fore.YELLOW}{error_msg}{reset}')
  assert cluster_status is not None, 'handle is not None but status is None'
  backend = get_backend_from_handle(handle)
  if check_cloud_vm_ray_backend and not isinstance(
@@ -2380,10 +2443,21 @@ class CloudFilter(enum.Enum):
  LOCAL = 'local'


+ def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]:
+ """Returns a list of clusters that match the glob pattern."""
+ glob_clusters = []
+ for cluster in clusters:
+ glob_cluster = global_user_state.get_glob_cluster_names(cluster)
+ if len(glob_cluster) == 0 and not silent:
+ logger.info(f'Cluster {cluster} not found.')
+ glob_clusters.extend(glob_cluster)
+ return list(set(glob_clusters))
+
+
  def get_clusters(
- include_controller: bool,
- refresh: bool,
+ refresh: common.StatusRefreshMode,
  cluster_names: Optional[Union[str, List[str]]] = None,
+ all_users: bool = True,
  ) -> List[Dict[str, Any]]:
  """Returns a list of cached or optionally refreshed cluster records.

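Editor's note: _get_glob_clusters, added above, expands user-supplied glob patterns against the cluster names stored in the global state. A tiny sketch of the same expansion using fnmatch against an in-memory list instead of the state database; names below are illustrative.

    import fnmatch
    from typing import List

    def expand_cluster_globs(patterns: List[str],
                             known_clusters: List[str]) -> List[str]:
        """Expand each glob pattern and return the de-duplicated matches."""
        matched: List[str] = []
        for pattern in patterns:
            hits = fnmatch.filter(known_clusters, pattern)
            if not hits:
                print(f'Cluster {pattern} not found.')
            matched.extend(hits)
        return list(set(matched))

    # Example: expand_cluster_globs(['sky-*'], ['sky-1234-alice', 'dev'])
    #   -> ['sky-1234-alice']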
@@ -2408,20 +2482,55 @@ def get_clusters(
  terminated, the record will be omitted from the returned list.
  """
  records = global_user_state.get_clusters()
-
- if not include_controller:
+ if not all_users:
+ current_user_hash = common_utils.get_user_hash()
  records = [
  record for record in records
- if controller_utils.Controllers.from_name(record['name']) is None
+ if record['user_hash'] == current_user_hash
  ]

  yellow = colorama.Fore.YELLOW
  bright = colorama.Style.BRIGHT
  reset = colorama.Style.RESET_ALL

+ def _update_record_with_credentials_and_resources_str(
+ record: Optional[Dict[str, Any]]) -> None:
+ """Add the credentials to the record.
+
+ This is useful for the client side to setup the ssh config of the
+ cluster.
+ """
+ if record is None:
+ return
+ handle = record['handle']
+ if handle is None:
+ return
+ record['resources_str'] = resources_utils.get_readable_resources_repr(
+ handle)
+ credentials = ssh_credential_from_yaml(handle.cluster_yaml,
+ handle.docker_user,
+ handle.ssh_user)
+
+ if not credentials:
+ return
+ ssh_private_key_path = credentials.get('ssh_private_key', None)
+ if ssh_private_key_path is not None:
+ with open(os.path.expanduser(ssh_private_key_path),
+ 'r',
+ encoding='utf-8') as f:
+ credentials['ssh_private_key_content'] = f.read()
+ else:
+ private_key_path, _ = auth.get_or_generate_keys()
+ with open(os.path.expanduser(private_key_path),
+ 'r',
+ encoding='utf-8') as f:
+ credentials['ssh_private_key_content'] = f.read()
+ record['credentials'] = credentials
+
  if cluster_names is not None:
  if isinstance(cluster_names, str):
  cluster_names = [cluster_names]
+ cluster_names = _get_glob_clusters(cluster_names, silent=True)
  new_records = []
  not_exist_cluster_names = []
  for cluster_name in cluster_names:
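Editor's note: the helper added above enriches each cluster record with the contents of its SSH private key so a remote client can rebuild an SSH config locally. A minimal sketch of that enrichment step; the fallback key path is an assumption for illustration, not SkyPilot's constant.

    import os
    from typing import Any, Dict

    DEFAULT_KEY_PATH = '~/.ssh/sky-key'  # assumed fallback path, illustrative only

    def attach_key_content(record: Dict[str, Any]) -> None:
        """Read the private key referenced by a record and inline its contents."""
        credentials = record.get('credentials')
        if not credentials:
            return
        key_path = credentials.get('ssh_private_key') or DEFAULT_KEY_PATH
        with open(os.path.expanduser(key_path), 'r', encoding='utf-8') as f:
            credentials['ssh_private_key_content'] = f.read()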
@@ -2436,23 +2545,33 @@ def get_clusters(
  logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
  records = new_records

- if not refresh:
+ # Add auth_config to the records
+ for record in records:
+ _update_record_with_credentials_and_resources_str(record)
+
+ if refresh == common.StatusRefreshMode.NONE:
  return records

  plural = 's' if len(records) > 1 else ''
  progress = rich_progress.Progress(transient=True,
  redirect_stdout=False,
  redirect_stderr=False)
- task = progress.add_task(
- f'[bold cyan]Refreshing status for {len(records)} cluster{plural}[/]',
- total=len(records))
+ task = progress.add_task(ux_utils.spinner_message(
+ f'Refreshing status for {len(records)} cluster{plural}'),
+ total=len(records))
+
+ if refresh == common.StatusRefreshMode.FORCE:
+ force_refresh_statuses = set(status_lib.ClusterStatus)
+ else:
+ force_refresh_statuses = None

  def _refresh_cluster(cluster_name):
  try:
  record = refresh_cluster_record(
  cluster_name,
- force_refresh_statuses=set(status_lib.ClusterStatus),
+ force_refresh_statuses=force_refresh_statuses,
  acquire_per_cluster_status_lock=True)
+ _update_record_with_credentials_and_resources_str(record)
  except (exceptions.ClusterStatusFetchingError,
  exceptions.CloudUserIdentityError,
  exceptions.ClusterOwnerIdentityMismatchError) as e:
@@ -2464,9 +2583,11 @@ def get_clusters(
  return record

  cluster_names = [record['name'] for record in records]
- with progress:
- updated_records = subprocess_utils.run_in_parallel(
- _refresh_cluster, cluster_names)
+ updated_records = []
+ if len(cluster_names) > 0:
+ with progress:
+ updated_records = subprocess_utils.run_in_parallel(
+ _refresh_cluster, cluster_names)

  # Show information for removed clusters.
  kept_records = []
@@ -2503,6 +2624,7 @@ def get_clusters(
  f'{len(failed_clusters)} cluster{plural}:{reset}')
  for cluster_name, e in failed_clusters:
  logger.warning(f' {bright}{cluster_name}{reset}: {e}')
+
  return kept_records


@@ -2579,10 +2701,12 @@ def get_task_resources_str(task: 'task_lib.Task',
  the accelerator demands (if any). Otherwise, the CPU demand is shown.
  """
  spot_str = ''
+ is_controller_task = task.is_controller_task()
  task_cpu_demand = (str(constants.CONTROLLER_PROCESS_CPU_DEMAND)
- if task.is_controller_task() else
- str(DEFAULT_TASK_CPU_DEMAND))
- if task.best_resources is not None:
+ if is_controller_task else str(DEFAULT_TASK_CPU_DEMAND))
+ if is_controller_task:
+ resources_str = f'CPU:{task_cpu_demand}'
+ elif task.best_resources is not None:
  accelerator_dict = task.best_resources.accelerators
  if is_managed_job:
  if task.best_resources.use_spot:
@@ -2650,27 +2774,6 @@ def stop_handler(signum, frame):
  raise KeyboardInterrupt(exceptions.SIGTSTP_CODE)


- def run_command_and_handle_ssh_failure(runner: command_runner.SSHCommandRunner,
- command: str,
- failure_message: str) -> str:
- """Runs command remotely and returns output with proper error handling."""
- rc, stdout, stderr = runner.run(command,
- require_outputs=True,
- stream_logs=False)
- if rc == 255:
- # SSH failed
- raise RuntimeError(
- f'SSH with user {runner.ssh_user} and key {runner.ssh_private_key} '
- f'to {runner.ip} failed. This is most likely due to incorrect '
- 'credentials or incorrect permissions for the key file. Check '
- 'your credentials and try again.')
- subprocess_utils.handle_returncode(rc,
- command,
- failure_message,
- stderr=stderr)
- return stdout
-
-
  def check_rsync_installed() -> None:
  """Checks if rsync is installed.

@@ -2703,15 +2806,18 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str,
  pattern = re.compile(r'AttributeError: module \'sky\.(.*)\' has no '
  r'attribute \'(.*)\'')
  if returncode != 0:
+ # TODO(zhwu): Backward compatibility for old SkyPilot runtime version on
+ # the remote cluster. Remove this after 0.10.0 is released.
  attribute_error = re.findall(pattern, stderr)
- if attribute_error:
+ if attribute_error or 'SkyPilot runtime is too old' in stderr:
  with ux_utils.print_exception_no_traceback():
  raise RuntimeError(
  f'{colorama.Fore.RED}SkyPilot runtime needs to be updated '
- 'on the remote cluster. To update, run (existing jobs are '
- f'not interrupted): {colorama.Style.BRIGHT}sky start -f -y '
+ f'on the remote cluster: {cluster_name}. To update, run '
+ '(existing jobs will not be interrupted): '
+ f'{colorama.Style.BRIGHT}sky start -f -y '
  f'{cluster_name}{colorama.Style.RESET_ALL}'
- f'\n--- Details ---\n{stderr.strip()}\n')
+ f'\n--- Details ---\n{stderr.strip()}\n') from None


  def get_endpoints(cluster: str,
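Editor's note: the hunk above widens the stale-runtime detection so that either the AttributeError pattern or an explicit "SkyPilot runtime is too old" marker in stderr triggers the update prompt. A small sketch of that detection as a standalone predicate; the regex mirrors the one compiled in the hunk, the function name is illustrative.

    import re

    _STALE_PATTERN = re.compile(r"AttributeError: module 'sky\.(.*)' has no "
                                r"attribute '(.*)'")

    def is_stale_runtime(returncode: int, stderr: str) -> bool:
        """Heuristically detect an outdated SkyPilot runtime from remote stderr."""
        if returncode == 0:
            return False
        return (bool(_STALE_PATTERN.search(stderr)) or
                'SkyPilot runtime is too old' in stderr)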
@@ -2748,16 +2854,22 @@ def get_endpoints(cluster: str,
  except ValueError:
  with ux_utils.print_exception_no_traceback():
  raise ValueError(f'Invalid endpoint {port!r}.') from None
- cluster_records = get_clusters(include_controller=True,
- refresh=False,
+ cluster_records = get_clusters(refresh=common.StatusRefreshMode.NONE,
  cluster_names=[cluster])
+ if not cluster_records:
+ with ux_utils.print_exception_no_traceback():
+ raise exceptions.ClusterNotUpError(
+ f'Cluster {cluster!r} not found.', cluster_status=None)
+ assert len(cluster_records) == 1, cluster_records
  cluster_record = cluster_records[0]
  if (not skip_status_check and
  cluster_record['status'] != status_lib.ClusterStatus.UP):
  with ux_utils.print_exception_no_traceback():
  raise exceptions.ClusterNotUpError(
  f'Cluster {cluster_record["name"]!r} '
- 'is not in UP status.', cluster_record['status'])
+ 'is not in UP status.',
+ cluster_status=cluster_record['status'],
+ handle=cluster_record['handle'])
  handle = cluster_record['handle']
  if not isinstance(handle, backends.CloudVmRayResourceHandle):
  with ux_utils.print_exception_no_traceback():
@@ -2773,7 +2885,7 @@ def get_endpoints(cluster: str,
  except exceptions.NotSupportedError:
  with ux_utils.print_exception_no_traceback():
  raise ValueError('Querying endpoints is not supported '
- f'for cluster {cluster!r} on {cloud}.') from None
+ f'for {cluster!r} on {cloud}.') from None

  config = common_utils.read_yaml(handle.cluster_yaml)
  port_details = provision_lib.query_ports(repr(cloud),