skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,6 @@
1
1
  """Backend: runs on cloud virtual machines, managed by Ray."""
2
2
  import copy
3
3
  import enum
4
- import functools
5
- import getpass
6
4
  import inspect
7
5
  import json
8
6
  import math
@@ -10,6 +8,7 @@ import os
10
8
  import pathlib
11
9
  import re
12
10
  import shlex
11
+ import shutil
13
12
  import signal
14
13
  import subprocess
15
14
  import sys
@@ -18,13 +17,15 @@ import textwrap
18
17
  import threading
19
18
  import time
20
19
  import typing
21
- from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union
20
+ from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
21
+ Union)
22
22
 
23
23
  import colorama
24
24
  import filelock
25
25
 
26
26
  import sky
27
27
  from sky import backends
28
+ from sky import check as sky_check
28
29
  from sky import cloud_stores
29
30
  from sky import clouds
30
31
  from sky import exceptions
@@ -33,9 +34,7 @@ from sky import jobs as managed_jobs
33
34
  from sky import optimizer
34
35
  from sky import provision as provision_lib
35
36
  from sky import resources as resources_lib
36
- from sky import serve as serve_lib
37
37
  from sky import sky_logging
38
- from sky import status_lib
39
38
  from sky import task as task_lib
40
39
  from sky.backends import backend_utils
41
40
  from sky.backends import wheel_utils
@@ -47,18 +46,26 @@ from sky.provision import common as provision_common
47
46
  from sky.provision import instance_setup
48
47
  from sky.provision import metadata_utils
49
48
  from sky.provision import provisioner
49
+ from sky.provision.kubernetes import utils as kubernetes_utils
50
+ from sky.server.requests import requests as requests_lib
50
51
  from sky.skylet import autostop_lib
51
52
  from sky.skylet import constants
52
53
  from sky.skylet import job_lib
53
54
  from sky.skylet import log_lib
54
55
  from sky.usage import usage_lib
55
56
  from sky.utils import accelerator_registry
57
+ from sky.utils import annotations
58
+ from sky.utils import cluster_utils
56
59
  from sky.utils import command_runner
60
+ from sky.utils import common
57
61
  from sky.utils import common_utils
58
62
  from sky.utils import controller_utils
59
63
  from sky.utils import log_utils
64
+ from sky.utils import message_utils
65
+ from sky.utils import registry
60
66
  from sky.utils import resources_utils
61
67
  from sky.utils import rich_utils
68
+ from sky.utils import status_lib
62
69
  from sky.utils import subprocess_utils
63
70
  from sky.utils import timeline
64
71
  from sky.utils import ux_utils
@@ -81,9 +88,10 @@ _NODES_LAUNCHING_PROGRESS_TIMEOUT = {
81
88
  clouds.AWS: 90,
82
89
  clouds.Azure: 90,
83
90
  clouds.GCP: 240,
84
- clouds.Lambda: 150,
91
+ clouds.Lambda: 300,
85
92
  clouds.IBM: 160,
86
93
  clouds.OCI: 300,
94
+ clouds.Paperspace: 600,
87
95
  clouds.Kubernetes: 300,
88
96
  clouds.Vsphere: 240,
89
97
  }
@@ -95,6 +103,11 @@ _RETRY_UNTIL_UP_INIT_GAP_SECONDS = 30
95
103
  # The maximum retry count for fetching IP address.
96
104
  _FETCH_IP_MAX_ATTEMPTS = 3
97
105
 
106
+ # How many times to query the cloud provider to make sure instances are
107
+ # stopping/terminating, and how long to wait between each query.
108
+ _TEARDOWN_WAIT_MAX_ATTEMPTS = 10
109
+ _TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS = 1
110
+
98
111
  _TEARDOWN_FAILURE_MESSAGE = (
99
112
  f'\n{colorama.Fore.RED}Failed to terminate '
100
113
  '{cluster_name}. {extra_reason}'
@@ -119,9 +132,6 @@ _RSYNC_NOT_FOUND_MESSAGE = (
119
132
 
120
133
  _TPU_NOT_FOUND_ERROR = 'ERROR: (gcloud.compute.tpus.delete) NOT_FOUND'
121
134
 
122
- _CTRL_C_TIP_MESSAGE = ('INFO: Tip: use Ctrl-C to exit log streaming '
123
- '(task will not be killed).')
124
-
125
135
  _MAX_RAY_UP_RETRY = 5
126
136
 
127
137
  # Number of retries for getting zones.
@@ -145,9 +155,24 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
145
155
  # If the command is too long, we instead write it to a file, rsync and execute
146
156
  # it.
147
157
  #
148
- # We use 120KB as a threshold to be safe for other arguments that
158
+ # We use 100KB as a threshold to be safe for other arguments that
149
159
  # might be added during ssh.
150
- _MAX_INLINE_SCRIPT_LENGTH = 120 * 1024
160
+ _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
161
+
162
+ _RESOURCES_UNAVAILABLE_LOG = (
163
+ 'Reasons for provision failures (for details, please check the log above):')
164
+
165
+
166
+ def _is_command_length_over_limit(command: str) -> bool:
167
+ """Check if the length of the command exceeds the limit.
168
+
169
+ We calculate the length of the command after quoting the command twice as
170
+ when it is executed by the CommandRunner, the command will be quoted twice
171
+ to ensure the correctness, which will add significant length to the command.
172
+ """
173
+
174
+ quoted_length = len(shlex.quote(shlex.quote(command)))
175
+ return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
151
176
 
152
177
 
153
178
  def _get_cluster_config_template(cloud):
@@ -161,16 +186,19 @@ def _get_cluster_config_template(cloud):
161
186
  clouds.SCP: 'scp-ray.yml.j2',
162
187
  clouds.OCI: 'oci-ray.yml.j2',
163
188
  clouds.Paperspace: 'paperspace-ray.yml.j2',
189
+ clouds.DO: 'do-ray.yml.j2',
164
190
  clouds.RunPod: 'runpod-ray.yml.j2',
165
191
  clouds.Kubernetes: 'kubernetes-ray.yml.j2',
166
192
  clouds.Vsphere: 'vsphere-ray.yml.j2',
167
- clouds.Fluidstack: 'fluidstack-ray.yml.j2'
193
+ clouds.Vast: 'vast-ray.yml.j2',
194
+ clouds.Fluidstack: 'fluidstack-ray.yml.j2',
195
+ clouds.Nebius: 'nebius-ray.yml.j2'
168
196
  }
169
197
  return cloud_to_template[type(cloud)]
170
198
 
171
199
 
172
200
  def write_ray_up_script_with_patched_launch_hash_fn(
173
- cluster_config_path: str,
201
+ cluster_config_path: Optional[str],
174
202
  ray_up_kwargs: Dict[str, bool],
175
203
  ) -> str:
176
204
  """Writes a Python script that runs `ray up` with our launch hash func.
@@ -257,6 +285,13 @@ class RayCodeGen:
257
285
  import time
258
286
  from typing import Dict, List, Optional, Tuple, Union
259
287
 
288
+ # Set the environment variables to avoid deduplicating logs and
289
+ # scheduler events. This should be set in driver code, since we are
290
+ # not using `ray job submit` anymore, and the environment variables
291
+ # from the ray cluster is not inherited.
292
+ os.environ['RAY_DEDUP_LOGS'] = '0'
293
+ os.environ['RAY_SCHEDULER_EVENTS'] = '0'
294
+
260
295
  import ray
261
296
  import ray.util as ray_util
262
297
 
@@ -264,12 +299,14 @@ class RayCodeGen:
264
299
  from sky.skylet import constants
265
300
  from sky.skylet import job_lib
266
301
  from sky.utils import log_utils
302
+ from sky.utils import subprocess_utils
267
303
 
268
304
  SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
269
305
 
270
306
  kwargs = dict()
271
- # Only set the `_temp_dir` to SkyPilot's ray cluster directory when the directory
272
- # exists for backward compatibility for the VM launched before #1790.
307
+ # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
308
+ # the directory exists for backward compatibility for the VM
309
+ # launched before #1790.
273
310
  if os.path.exists({constants.SKY_REMOTE_RAY_TEMPDIR!r}):
274
311
  kwargs['_temp_dir'] = {constants.SKY_REMOTE_RAY_TEMPDIR!r}
275
312
  ray.init(
@@ -280,6 +317,8 @@ class RayCodeGen:
280
317
  )
281
318
  def get_or_fail(futures, pg) -> List[int]:
282
319
  \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
320
+ if not futures:
321
+ return []
283
322
  returncodes = [1] * len(futures)
284
323
  # Wait for 1 task to be ready.
285
324
  ready = []
@@ -307,8 +346,9 @@ class RayCodeGen:
307
346
  ready, unready = ray.wait(unready)
308
347
  idx = futures.index(ready[0])
309
348
  returncodes[idx] = ray.get(ready[0])
310
- # Remove the placement group after all tasks are done, so that the
311
- # next job can be scheduled on the released resources immediately.
349
+ # Remove the placement group after all tasks are done, so that
350
+ # the next job can be scheduled on the released resources
351
+ # immediately.
312
352
  ray_util.remove_placement_group(pg)
313
353
  sys.stdout.flush()
314
354
  return returncodes
@@ -347,9 +387,9 @@ class RayCodeGen:
347
387
  num_nodes: int,
348
388
  resources_dict: Dict[str, float],
349
389
  stable_cluster_internal_ips: List[str],
390
+ env_vars: Dict[str, str],
350
391
  setup_cmd: Optional[str] = None,
351
392
  setup_log_path: Optional[str] = None,
352
- env_vars: Optional[Dict[str, str]] = None,
353
393
  ) -> None:
354
394
  """Create the gang scheduling placement group for a Task.
355
395
 
@@ -388,27 +428,42 @@ class RayCodeGen:
388
428
  **gpu_dict,
389
429
  })
390
430
 
431
+ streaming_message = (
432
+ f'{ux_utils.INDENT_LAST_SYMBOL}Job started. Streaming logs... '
433
+ f'{colorama.Style.DIM}(Ctrl-C to exit log streaming; job will not '
434
+ f'be killed){colorama.Style.RESET_ALL}')
391
435
  self._code += [
392
436
  textwrap.dedent(f"""\
393
437
  pg = ray_util.placement_group({json.dumps(bundles)}, 'STRICT_SPREAD')
394
438
  plural = 's' if {num_nodes} > 1 else ''
395
439
  node_str = f'{num_nodes} node{{plural}}'
396
440
 
397
- message = {_CTRL_C_TIP_MESSAGE!r} + '\\n'
398
- message += f'INFO: Waiting for task resources on {{node_str}}. This will block if the cluster is full.'
399
- print(message,
400
- flush=True)
441
+ # We have this `INFO: Tip:` message only for backward
442
+ # compatibility, because if a cluster has the old SkyPilot version,
443
+ # it relies on this message to start log streaming.
444
+ # This message will be skipped for new clusters, because we use
445
+ # start_streaming_at for the `Waiting for task resources on`
446
+ # message.
447
+ # TODO: Remove this message in v0.9.0.
448
+ message = ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}INFO: '
449
+ 'Tip: use Ctrl-C to exit log streaming, not kill '
450
+ 'the job.{colorama.Style.RESET_ALL}\\n')
451
+ message += ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
452
+ 'Waiting for task resources on '
453
+ f'{{node_str}}.{colorama.Style.RESET_ALL}')
454
+ print(message, flush=True)
401
455
  # FIXME: This will print the error message from autoscaler if
402
456
  # it is waiting for other task to finish. We should hide the
403
457
  # error message.
404
458
  ray.get(pg.ready())
405
- print('INFO: All task resources reserved.',
406
- flush=True)
459
+ print({streaming_message!r}, flush=True)
407
460
  """)
408
461
  ]
409
462
 
410
463
  job_id = self.job_id
411
464
  if setup_cmd is not None:
465
+ setup_envs = env_vars.copy()
466
+ setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
412
467
  self._code += [
413
468
  textwrap.dedent(f"""\
414
469
  setup_cmd = {setup_cmd!r}
@@ -438,7 +493,7 @@ class RayCodeGen:
438
493
  .remote(
439
494
  setup_cmd,
440
495
  os.path.expanduser({setup_log_path!r}),
441
- env_vars={env_vars!r},
496
+ env_vars={setup_envs!r},
442
497
  stream_logs=True,
443
498
  with_ray=True,
444
499
  ) for i in range(total_num_nodes)]
@@ -477,7 +532,6 @@ class RayCodeGen:
477
532
  )).remote()
478
533
  for i in range(pg.bundle_count)
479
534
  ])
480
- print('INFO: Reserved IPs:', gang_scheduling_id_to_ip)
481
535
 
482
536
  cluster_ips_to_node_id = {{ip: i for i, ip in enumerate({stable_cluster_internal_ips!r})}}
483
537
  job_ip_rank_list = sorted(gang_scheduling_id_to_ip, key=cluster_ips_to_node_id.get)
@@ -549,11 +603,13 @@ class RayCodeGen:
549
603
  f'placement_group_bundle_index={gang_scheduling_id})')
550
604
 
551
605
  sky_env_vars_dict_str = [
552
- textwrap.dedent("""\
553
- sky_env_vars_dict = {}
554
- sky_env_vars_dict['SKYPILOT_NODE_IPS'] = job_ip_list_str
555
- # Environment starting with `SKY_` is deprecated.
606
+ textwrap.dedent(f"""\
607
+ sky_env_vars_dict = {{}}
608
+ sky_env_vars_dict['{constants.SKYPILOT_NODE_IPS}'] = job_ip_list_str
609
+ # Backward compatibility: Environment starting with `SKY_` is
610
+ # deprecated. Remove it in v0.9.0.
556
611
  sky_env_vars_dict['SKY_NODE_IPS'] = job_ip_list_str
612
+ sky_env_vars_dict['{constants.SKYPILOT_NUM_NODES}'] = len(job_ip_rank_list)
557
613
  """)
558
614
  ]
559
615
 
@@ -574,8 +630,9 @@ class RayCodeGen:
574
630
 
575
631
 
576
632
  if script is not None:
577
- sky_env_vars_dict['SKYPILOT_NUM_GPUS_PER_NODE'] = {int(math.ceil(num_gpus))!r}
578
- # Environment starting with `SKY_` is deprecated.
633
+ sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
634
+ # Backward compatibility: Environment starting with `SKY_` is
635
+ # deprecated. Remove it in v0.9.0.
579
636
  sky_env_vars_dict['SKY_NUM_GPUS_PER_NODE'] = {int(math.ceil(num_gpus))!r}
580
637
 
581
638
  ip = gang_scheduling_id_to_ip[{gang_scheduling_id!r}]
@@ -592,12 +649,14 @@ class RayCodeGen:
592
649
  node_name = f'worker{{idx_in_cluster}}'
593
650
  name_str = f'{{node_name}}, rank={{rank}},'
594
651
  log_path = os.path.expanduser(os.path.join({log_dir!r}, f'{{rank}}-{{node_name}}.log'))
595
- sky_env_vars_dict['SKYPILOT_NODE_RANK'] = rank
596
- # Environment starting with `SKY_` is deprecated.
652
+ sky_env_vars_dict['{constants.SKYPILOT_NODE_RANK}'] = rank
653
+ # Backward compatibility: Environment starting with `SKY_` is
654
+ # deprecated. Remove it in v0.9.0.
597
655
  sky_env_vars_dict['SKY_NODE_RANK'] = rank
598
656
 
599
657
  sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
600
- # Environment starting with `SKY_` is deprecated.
658
+ # Backward compatibility: Environment starting with `SKY_` is
659
+ # deprecated. Remove it in v0.9.0.
601
660
  sky_env_vars_dict['SKY_INTERNAL_JOB_ID'] = {self.job_id}
602
661
 
603
662
  futures.append(run_bash_command_with_log \\
@@ -680,56 +739,38 @@ class FailoverCloudErrorHandlerV1:
680
739
  """
681
740
 
682
741
  @staticmethod
683
- def _azure_handler(blocked_resources: Set['resources_lib.Resources'],
684
- launchable_resources: 'resources_lib.Resources',
685
- region: 'clouds.Region',
686
- zones: Optional[List['clouds.Zone']], stdout: str,
687
- stderr: str):
688
- del zones # Unused.
689
- # The underlying ray autoscaler will try all zones of a region at once.
690
- style = colorama.Style
742
+ def _handle_errors(stdout: str, stderr: str,
743
+ is_error_str_known: Callable[[str], bool]) -> List[str]:
691
744
  stdout_splits = stdout.split('\n')
692
745
  stderr_splits = stderr.split('\n')
693
746
  errors = [
694
747
  s.strip()
695
748
  for s in stdout_splits + stderr_splits
696
- if ('Exception Details:' in s.strip() or 'InvalidTemplateDeployment'
697
- in s.strip() or '(ReadOnlyDisabledSubscription)' in s.strip())
749
+ if is_error_str_known(s.strip())
698
750
  ]
699
- if not errors:
700
- if 'Head node fetch timed out' in stderr:
701
- # Example: click.exceptions.ClickException: Head node fetch
702
- # timed out. Failed to create head node.
703
- # This is a transient error, but we have retried in need_ray_up
704
- # and failed. So we skip this region.
705
- logger.info('Got \'Head node fetch timed out\' in '
706
- f'{region.name}.')
707
- _add_to_blocked_resources(
708
- blocked_resources,
709
- launchable_resources.copy(region=region.name))
710
- elif 'rsync: command not found' in stderr:
711
- with ux_utils.print_exception_no_traceback():
712
- raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
713
- logger.info('====== stdout ======')
714
- for s in stdout_splits:
715
- print(s)
716
- logger.info('====== stderr ======')
717
- for s in stderr_splits:
718
- print(s)
751
+ if errors:
752
+ return errors
753
+ if 'rsync: command not found' in stderr:
719
754
  with ux_utils.print_exception_no_traceback():
720
- raise RuntimeError('Errors occurred during provision; '
721
- 'check logs above.')
722
-
723
- logger.warning(f'Got error(s) in {region.name}:')
724
- messages = '\n\t'.join(errors)
725
- logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
726
- if any('(ReadOnlyDisabledSubscription)' in s for s in errors):
727
- _add_to_blocked_resources(
728
- blocked_resources,
729
- resources_lib.Resources(cloud=clouds.Azure()))
730
- else:
731
- _add_to_blocked_resources(blocked_resources,
732
- launchable_resources.copy(zone=None))
755
+ e = RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
756
+ setattr(e, 'detailed_reason',
757
+ f'stdout: {stdout}\nstderr: {stderr}')
758
+ raise e
759
+ detailed_reason = textwrap.dedent(f"""\
760
+ ====== stdout ======
761
+ {stdout}
762
+ ====== stderr ======
763
+ {stderr}
764
+ """)
765
+ logger.info('====== stdout ======')
766
+ print(stdout)
767
+ logger.info('====== stderr ======')
768
+ print(stderr)
769
+ with ux_utils.print_exception_no_traceback():
770
+ e = RuntimeError('Errors occurred during provision; '
771
+ 'check logs above.')
772
+ setattr(e, 'detailed_reason', detailed_reason)
773
+ raise e
733
774
 
734
775
  @staticmethod
735
776
  def _lambda_handler(blocked_resources: Set['resources_lib.Resources'],
@@ -737,32 +778,14 @@ class FailoverCloudErrorHandlerV1:
737
778
  region: 'clouds.Region',
738
779
  zones: Optional[List['clouds.Zone']], stdout: str,
739
780
  stderr: str):
740
- del zones # Unused.
781
+ del region, zones # Unused.
782
+ errors = FailoverCloudErrorHandlerV1._handle_errors(
783
+ stdout,
784
+ stderr,
785
+ is_error_str_known=lambda x: 'LambdaCloudError:' in x.strip())
786
+ messages = '\n '.join(errors)
741
787
  style = colorama.Style
742
- stdout_splits = stdout.split('\n')
743
- stderr_splits = stderr.split('\n')
744
- errors = [
745
- s.strip()
746
- for s in stdout_splits + stderr_splits
747
- if 'LambdaCloudError:' in s.strip()
748
- ]
749
- if not errors:
750
- if 'rsync: command not found' in stderr:
751
- with ux_utils.print_exception_no_traceback():
752
- raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
753
- logger.info('====== stdout ======')
754
- for s in stdout_splits:
755
- print(s)
756
- logger.info('====== stderr ======')
757
- for s in stderr_splits:
758
- print(s)
759
- with ux_utils.print_exception_no_traceback():
760
- raise RuntimeError('Errors occurred during provision; '
761
- 'check logs above.')
762
-
763
- logger.warning(f'Got error(s) in {region.name}:')
764
- messages = '\n\t'.join(errors)
765
- logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
788
+ logger.warning(f' {style.DIM}{messages}{style.RESET_ALL}')
766
789
  _add_to_blocked_resources(blocked_resources,
767
790
  launchable_resources.copy(zone=None))
768
791
 
@@ -775,65 +798,21 @@ class FailoverCloudErrorHandlerV1:
775
798
  blocked_resources,
776
799
  launchable_resources.copy(region=r.name, zone=None))
777
800
 
778
- @staticmethod
779
- def _kubernetes_handler(blocked_resources: Set['resources_lib.Resources'],
780
- launchable_resources: 'resources_lib.Resources',
781
- region, zones, stdout, stderr):
782
- del zones # Unused.
783
- style = colorama.Style
784
- stdout_splits = stdout.split('\n')
785
- stderr_splits = stderr.split('\n')
786
- errors = [
787
- s.strip()
788
- for s in stdout_splits + stderr_splits
789
- if 'KubernetesError:' in s.strip()
790
- ]
791
- if not errors:
792
- logger.info('====== stdout ======')
793
- for s in stdout_splits:
794
- print(s)
795
- logger.info('====== stderr ======')
796
- for s in stderr_splits:
797
- print(s)
798
- with ux_utils.print_exception_no_traceback():
799
- raise RuntimeError('Errors occurred during provisioning; '
800
- 'check logs above.')
801
-
802
- logger.warning(f'Got error(s) in {region.name}:')
803
- messages = '\n\t'.join(errors)
804
- logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
805
- _add_to_blocked_resources(blocked_resources,
806
- launchable_resources.copy(zone=None))
807
-
808
801
  @staticmethod
809
802
  def _scp_handler(blocked_resources: Set['resources_lib.Resources'],
810
- launchable_resources: 'resources_lib.Resources', region,
811
- zones, stdout, stderr):
803
+ launchable_resources: 'resources_lib.Resources',
804
+ region: 'clouds.Region',
805
+ zones: Optional[List['clouds.Zone']], stdout: str,
806
+ stderr: str):
812
807
  del zones # Unused.
813
- style = colorama.Style
814
- stdout_splits = stdout.split('\n')
815
- stderr_splits = stderr.split('\n')
816
- errors = [
817
- s.strip()
818
- for s in stdout_splits + stderr_splits
819
- if 'SCPError:' in s.strip()
820
- ]
821
- if not errors:
822
- if 'rsync: command not found' in stderr:
823
- with ux_utils.print_exception_no_traceback():
824
- raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
825
- logger.info('====== stdout ======')
826
- for s in stdout_splits:
827
- print(s)
828
- logger.info('====== stderr ======')
829
- for s in stderr_splits:
830
- print(s)
831
- with ux_utils.print_exception_no_traceback():
832
- raise RuntimeError('Errors occurred during provision; '
833
- 'check logs above.')
808
+ errors = FailoverCloudErrorHandlerV1._handle_errors(
809
+ stdout,
810
+ stderr,
811
+ is_error_str_known=lambda x: 'SCPError:' in x.strip())
834
812
 
835
813
  logger.warning(f'Got error(s) in {region.name}:')
836
814
  messages = '\n\t'.join(errors)
815
+ style = colorama.Style
837
816
  logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
838
817
  _add_to_blocked_resources(blocked_resources,
839
818
  launchable_resources.copy(zone=None))
@@ -854,29 +833,13 @@ class FailoverCloudErrorHandlerV1:
854
833
  zones: Optional[List['clouds.Zone']], stdout: str,
855
834
  stderr: str):
856
835
 
857
- style = colorama.Style
858
- stdout_splits = stdout.split('\n')
859
- stderr_splits = stderr.split('\n')
860
- errors = [
861
- s.strip()
862
- for s in stdout_splits + stderr_splits
863
- if 'ERR' in s.strip() or 'PANIC' in s.strip()
864
- ]
865
- if not errors:
866
- if 'rsync: command not found' in stderr:
867
- with ux_utils.print_exception_no_traceback():
868
- raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
869
- logger.info('====== stdout ======')
870
- for s in stdout_splits:
871
- print(s)
872
- logger.info('====== stderr ======')
873
- for s in stderr_splits:
874
- print(s)
875
- with ux_utils.print_exception_no_traceback():
876
- raise RuntimeError('Errors occurred during provision; '
877
- 'check logs above.')
836
+ errors = FailoverCloudErrorHandlerV1._handle_errors(
837
+ stdout, stderr,
838
+ lambda x: 'ERR' in x.strip() or 'PANIC' in x.strip())
839
+
878
840
  logger.warning(f'Got error(s) on IBM cluster, in {region.name}:')
879
841
  messages = '\n\t'.join(errors)
842
+ style = colorama.Style
880
843
  logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
881
844
 
882
845
  for zone in zones: # type: ignore[union-attr]
@@ -890,35 +853,17 @@ class FailoverCloudErrorHandlerV1:
890
853
  region: 'clouds.Region',
891
854
  zones: Optional[List['clouds.Zone']], stdout: str,
892
855
  stderr: str):
893
-
894
- style = colorama.Style
895
- stdout_splits = stdout.split('\n')
896
- stderr_splits = stderr.split('\n')
897
- errors = [
898
- s.strip()
899
- for s in stdout_splits + stderr_splits
900
- if ('VcnSubnetNotFound' in s.strip()) or
901
- ('oci.exceptions.ServiceError' in s.strip() and
902
- ('NotAuthorizedOrNotFound' in s.strip() or 'CannotParseRequest' in
903
- s.strip() or 'InternalError' in s.strip() or
904
- 'LimitExceeded' in s.strip() or 'NotAuthenticated' in s.strip()))
856
+ known_service_errors = [
857
+ 'NotAuthorizedOrNotFound', 'CannotParseRequest', 'InternalError',
858
+ 'LimitExceeded', 'NotAuthenticated'
905
859
  ]
906
- if not errors:
907
- if 'rsync: command not found' in stderr:
908
- with ux_utils.print_exception_no_traceback():
909
- raise RuntimeError(_RSYNC_NOT_FOUND_MESSAGE)
910
- logger.info('====== stdout ======')
911
- for s in stdout_splits:
912
- print(s)
913
- logger.info('====== stderr ======')
914
- for s in stderr_splits:
915
- print(s)
916
- with ux_utils.print_exception_no_traceback():
917
- raise RuntimeError('Errors occurred during provision; '
918
- 'check logs above.')
919
-
860
+ errors = FailoverCloudErrorHandlerV1._handle_errors(
861
+ stdout, stderr, lambda x: 'VcnSubnetNotFound' in x.strip() or
862
+ ('oci.exceptions.ServiceError' in x.strip() and any(
863
+ known_err in x.strip() for known_err in known_service_errors)))
920
864
  logger.warning(f'Got error(s) in {region.name}:')
921
865
  messages = '\n\t'.join(errors)
866
+ style = colorama.Style
922
867
  logger.warning(f'{style.DIM}\t{messages}{style.RESET_ALL}')
923
868
 
924
869
  if zones is not None:
@@ -1000,6 +945,29 @@ class FailoverCloudErrorHandlerV2:
1000
945
  stdout and stderr.
1001
946
  """
1002
947
 
948
+ @staticmethod
949
+ def _azure_handler(blocked_resources: Set['resources_lib.Resources'],
950
+ launchable_resources: 'resources_lib.Resources',
951
+ region: 'clouds.Region', zones: List['clouds.Zone'],
952
+ err: Exception):
953
+ del region, zones # Unused.
954
+ if '(ReadOnlyDisabledSubscription)' in str(err):
955
+ logger.info(
956
+ f'{colorama.Style.DIM}Azure subscription is read-only. '
957
+ 'Skip provisioning on Azure. Please check the subscription set '
958
+ 'with az account set -s <subscription_id>.'
959
+ f'{colorama.Style.RESET_ALL}')
960
+ _add_to_blocked_resources(
961
+ blocked_resources,
962
+ resources_lib.Resources(cloud=clouds.Azure()))
963
+ elif 'ClientAuthenticationError' in str(err):
964
+ _add_to_blocked_resources(
965
+ blocked_resources,
966
+ resources_lib.Resources(cloud=clouds.Azure()))
967
+ else:
968
+ _add_to_blocked_resources(blocked_resources,
969
+ launchable_resources.copy(zone=None))
970
+
1003
971
  @staticmethod
1004
972
  def _gcp_handler(blocked_resources: Set['resources_lib.Resources'],
1005
973
  launchable_resources: 'resources_lib.Resources',
@@ -1135,7 +1103,7 @@ class FailoverCloudErrorHandlerV2:
1135
1103
  'having the required permissions and the user '
1136
1104
  'account does not have enough permission to '
1137
1105
  'update it. Please contact your administrator and '
1138
- 'check out: https://skypilot.readthedocs.io/en/latest/cloud-setup/cloud-permissions/gcp.html\n' # pylint: disable=line-too-long
1106
+ 'check out: https://docs.skypilot.co/en/latest/cloud-setup/cloud-permissions/gcp.html\n' # pylint: disable=line-too-long
1139
1107
  f'Details: {message}')
1140
1108
  _add_to_blocked_resources(
1141
1109
  blocked_resources,
@@ -1203,6 +1171,7 @@ class RetryingVmProvisioner(object):
1203
1171
  prev_cluster_status: Optional[status_lib.ClusterStatus],
1204
1172
  prev_handle: Optional['CloudVmRayResourceHandle'],
1205
1173
  prev_cluster_ever_up: bool,
1174
+ prev_config_hash: Optional[str],
1206
1175
  ) -> None:
1207
1176
  assert cluster_name is not None, 'cluster_name must be specified.'
1208
1177
  self.cluster_name = cluster_name
@@ -1211,11 +1180,12 @@ class RetryingVmProvisioner(object):
1211
1180
  self.prev_cluster_status = prev_cluster_status
1212
1181
  self.prev_handle = prev_handle
1213
1182
  self.prev_cluster_ever_up = prev_cluster_ever_up
1183
+ self.prev_config_hash = prev_config_hash
1214
1184
 
1215
1185
  def __init__(self,
1216
1186
  log_dir: str,
1217
1187
  dag: 'dag.Dag',
1218
- optimize_target: 'optimizer.OptimizeTarget',
1188
+ optimize_target: 'common.OptimizeTarget',
1219
1189
  requested_features: Set[clouds.CloudImplementationFeatures],
1220
1190
  local_wheel_path: pathlib.Path,
1221
1191
  wheel_hash: str,
@@ -1294,9 +1264,10 @@ class RetryingVmProvisioner(object):
1294
1264
 
1295
1265
  if prev_cluster_status != status_lib.ClusterStatus.UP:
1296
1266
  logger.info(
1297
- f'Cluster {cluster_name!r} (status: '
1298
- f'{prev_cluster_status.value}) was previously launched '
1299
- f'in {cloud} {region.name}. Relaunching in that region.')
1267
+ f'{colorama.Style.DIM}Cluster {cluster_name!r} (status: '
1268
+ f'{prev_cluster_status.value}) was previously in '
1269
+ f'{cloud} ({region.name}). Restarting.'
1270
+ f'{colorama.Style.RESET_ALL}')
1300
1271
  yield zones
1301
1272
 
1302
1273
  # If it reaches here: the cluster status in the database gets
@@ -1371,19 +1342,29 @@ class RetryingVmProvisioner(object):
1371
1342
  prev_cluster_status: Optional[status_lib.ClusterStatus],
1372
1343
  prev_handle: Optional['CloudVmRayResourceHandle'],
1373
1344
  prev_cluster_ever_up: bool,
1345
+ skip_if_config_hash_matches: Optional[str],
1374
1346
  ) -> Dict[str, Any]:
1375
- """The provision retry loop."""
1376
- style = colorama.Style
1377
- fore = colorama.Fore
1347
+ """The provision retry loop.
1348
+
1349
+ Returns a config_dict with the following fields:
1350
+ All fields from backend_utils.write_cluster_config(). See its
1351
+ docstring.
1352
+ - 'provisioning_skipped': True if provisioning was short-circuited
1353
+ by skip_if_config_hash_matches, False otherwise.
1354
+ - 'handle': The provisioned cluster handle.
1355
+ - 'provision_record': (Only if using the new skypilot provisioner) The
1356
+ record returned by provisioner.bulk_provision().
1357
+ - 'resources_vars': (Only if using the new skypilot provisioner) The
1358
+ resources variables given by make_deploy_resources_variables().
1359
+ """
1378
1360
  # Get log_path name
1379
1361
  log_path = os.path.join(self.log_dir, 'provision.log')
1380
1362
  log_abs_path = os.path.abspath(log_path)
1381
1363
  if not dryrun:
1382
1364
  os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
1383
1365
  os.system(f'touch {log_path}')
1384
- tail_cmd = f'tail -n100 -f {log_path}'
1385
- logger.info('To view detailed progress: '
1386
- f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
1366
+ rich_utils.force_update_status(
1367
+ ux_utils.spinner_message('Launching', log_path))
1387
1368
 
1388
1369
  # Get previous cluster status
1389
1370
  cluster_exists = prev_cluster_status is not None
@@ -1419,8 +1400,7 @@ class RetryingVmProvisioner(object):
1419
1400
  f'in {to_provision.cloud}. '
1420
1401
  f'{colorama.Style.RESET_ALL}'
1421
1402
  f'To request quotas, check the instruction: '
1422
- f'https://skypilot.readthedocs.io/en/latest/cloud-setup/quota.html.' # pylint: disable=line-too-long
1423
- )
1403
+ f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.')
1424
1404
 
1425
1405
  for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
1426
1406
  prev_cluster_status,
@@ -1484,8 +1464,18 @@ class RetryingVmProvisioner(object):
1484
1464
  raise exceptions.ResourcesUnavailableError(
1485
1465
  f'Failed to provision on cloud {to_provision.cloud} due to '
1486
1466
  f'invalid cloud config: {common_utils.format_exception(e)}')
1467
+
1468
+ if ('config_hash' in config_dict and
1469
+ skip_if_config_hash_matches == config_dict['config_hash']):
1470
+ logger.debug('Skipping provisioning of cluster with matching '
1471
+ 'config hash.')
1472
+ config_dict['provisioning_skipped'] = True
1473
+ return config_dict
1474
+ config_dict['provisioning_skipped'] = False
1475
+
1487
1476
  if dryrun:
1488
1477
  return config_dict
1478
+
1489
1479
  cluster_config_file = config_dict['ray']
1490
1480
 
1491
1481
  launched_resources = to_provision.copy(region=region.name)
@@ -1540,24 +1530,55 @@ class RetryingVmProvisioner(object):
1540
1530
  assert to_provision.region == region.name, (to_provision,
1541
1531
  region)
1542
1532
  num_nodes = handle.launched_nodes
1533
+ # Some clouds, like RunPod, only support exposing ports during
1534
+ # launch. For those clouds, we pass the ports to open in the
1535
+ # `bulk_provision` to expose the ports during provisioning.
1536
+ # If the `bulk_provision` is to apply on an existing cluster,
1537
+ # it should be ignored by the underlying provisioner impl
1538
+ # as it will only apply to newly-created instances.
1539
+ ports_to_open_on_launch = (
1540
+ list(resources_utils.port_ranges_to_set(to_provision.ports))
1541
+ if to_provision.cloud.OPEN_PORTS_VERSION <=
1542
+ clouds.OpenPortsVersion.LAUNCH_ONLY else None)
1543
1543
  try:
1544
+ controller = controller_utils.Controllers.from_name(
1545
+ cluster_name)
1546
+ controller_str = ('' if controller is None else
1547
+ f' {controller.value.name}')
1548
+ if isinstance(to_provision.cloud, clouds.Kubernetes):
1549
+ # Omit the region name for Kubernetes.
1550
+ logger.info(
1551
+ ux_utils.starting_message(
1552
+ f'Launching{controller_str} on '
1553
+ f'{to_provision.cloud}.'))
1554
+ else:
1555
+ logger.info(
1556
+ ux_utils.starting_message(
1557
+ f'Launching{controller_str} on '
1558
+ f'{to_provision.cloud} '
1559
+ f'{region.name}{colorama.Style.RESET_ALL}'
1560
+ f'{zone_str}.'))
1561
+ assert handle.cluster_yaml is not None
1544
1562
  provision_record = provisioner.bulk_provision(
1545
1563
  to_provision.cloud,
1546
1564
  region,
1547
1565
  zones,
1548
- provisioner.ClusterName(cluster_name,
1549
- handle.cluster_name_on_cloud),
1566
+ resources_utils.ClusterName(
1567
+ cluster_name, handle.cluster_name_on_cloud),
1550
1568
  num_nodes=num_nodes,
1551
1569
  cluster_yaml=handle.cluster_yaml,
1552
1570
  prev_cluster_ever_up=prev_cluster_ever_up,
1553
- log_dir=self.log_dir)
1571
+ log_dir=self.log_dir,
1572
+ ports_to_open_on_launch=ports_to_open_on_launch)
1554
1573
  # NOTE: We will handle the logic of '_ensure_cluster_ray_started' #pylint: disable=line-too-long
1555
1574
  # in 'provision_utils.post_provision_runtime_setup()' in the
1556
1575
  # caller.
1557
1576
  resources_vars = (
1558
1577
  to_provision.cloud.make_deploy_resources_variables(
1559
- to_provision, handle.cluster_name_on_cloud, region,
1560
- zones))
1578
+ to_provision,
1579
+ resources_utils.ClusterName(
1580
+ cluster_name, handle.cluster_name_on_cloud),
1581
+ region, zones, num_nodes))
1561
1582
  config_dict['provision_record'] = provision_record
1562
1583
  config_dict['resources_vars'] = resources_vars
1563
1584
  config_dict['handle'] = handle
@@ -1570,7 +1591,9 @@ class RetryingVmProvisioner(object):
1570
1591
  # cluster does not exist. Also we are fast at
1571
1592
  # cleaning up clusters now if there is no existing node..
1572
1593
  CloudVmRayBackend().post_teardown_cleanup(
1573
- handle, terminate=not prev_cluster_ever_up)
1594
+ handle,
1595
+ terminate=not prev_cluster_ever_up,
1596
+ remove_from_db=False)
1574
1597
  # TODO(suquark): other clouds may have different zone
1575
1598
  # blocking strategy. See '_update_blocklist_on_error'
1576
1599
  # for details.
@@ -1585,6 +1608,7 @@ class RetryingVmProvisioner(object):
1585
1608
  'region_name': region.name,
1586
1609
  'zone_str': zone_str,
1587
1610
  }
1611
+
1588
1612
  status, stdout, stderr, head_internal_ip, head_external_ip = (
1589
1613
  self._gang_schedule_ray_up(to_provision.cloud,
1590
1614
  cluster_config_file, handle,
@@ -1623,9 +1647,9 @@ class RetryingVmProvisioner(object):
1623
1647
  self._ensure_cluster_ray_started(handle, log_abs_path)
1624
1648
 
1625
1649
  config_dict['handle'] = handle
1626
- plural = '' if num_nodes == 1 else 's'
1627
- logger.info(f'{fore.GREEN}Successfully provisioned or found'
1628
- f' existing VM{plural}.{style.RESET_ALL}')
1650
+ logger.info(
1651
+ ux_utils.finishing_message(
1652
+ f'Cluster launched: {cluster_name!r}.', log_path))
1629
1653
  return config_dict
1630
1654
 
1631
1655
  # The cluster is not ready. We must perform error recording and/or
@@ -1686,21 +1710,20 @@ class RetryingVmProvisioner(object):
1686
1710
  # autoscaler proceeds to setup commands, which may fail:
1687
1711
  # ERR updater.py:138 -- New status: update-failed
1688
1712
  CloudVmRayBackend().teardown_no_lock(handle,
1689
- terminate=terminate_or_stop)
1713
+ terminate=terminate_or_stop,
1714
+ remove_from_db=False)
1690
1715
 
1691
1716
  if to_provision.zone is not None:
1692
1717
  message = (
1693
- f'Failed to acquire resources in {to_provision.zone}. '
1694
- 'Try changing resource requirements or use another zone.')
1718
+ f'Failed to acquire resources in {to_provision.zone} for '
1719
+ f'{requested_resources}. ')
1695
1720
  elif to_provision.region is not None:
1696
1721
  # For public clouds, provision.region is always set.
1697
1722
  message = ('Failed to acquire resources in all zones in '
1698
- f'{to_provision.region}. Try changing resource '
1699
- 'requirements or use another region.')
1723
+ f'{to_provision.region} for {requested_resources}. ')
1700
1724
  else:
1701
- message = (f'Failed to acquire resources in {to_provision.cloud}. '
1702
- 'Try changing resource requirements or use another '
1703
- 'cloud provider.')
1725
+ message = (f'Failed to acquire resources in {to_provision.cloud} '
1726
+ f'for {requested_resources}. ')
1704
1727
  # Do not failover to other locations if the cluster was ever up, since
1705
1728
  # the user can have some data on the cluster.
1706
1729
  raise exceptions.ResourcesUnavailableError(
@@ -1751,7 +1774,7 @@ class RetryingVmProvisioner(object):
1751
1774
  log_abs_path,
1752
1775
  stream_logs=False,
1753
1776
  start_streaming_at='Shared connection to',
1754
- line_processor=log_utils.RayUpLineProcessor(),
1777
+ line_processor=log_utils.RayUpLineProcessor(log_abs_path),
1755
1778
  # Reduce BOTO_MAX_RETRIES from 12 to 5 to avoid long hanging
1756
1779
  # time during 'ray up' if insufficient capacity occurs.
1757
1780
  env=dict(
@@ -1771,13 +1794,14 @@ class RetryingVmProvisioner(object):
1771
1794
 
1772
1795
  region_name = logging_info['region_name']
1773
1796
  zone_str = logging_info['zone_str']
1774
- style = colorama.Style
1775
1797
  if isinstance(to_provision_cloud, clouds.Kubernetes):
1776
- logger.info(f'{style.BRIGHT}Launching on {to_provision_cloud} '
1777
- f'{style.RESET_ALL}')
1798
+ logger.info(
1799
+ ux_utils.starting_message(
1800
+ f'Launching on {to_provision_cloud}.'))
1778
1801
  else:
1779
- logger.info(f'{style.BRIGHT}Launching on {to_provision_cloud} '
1780
- f'{region_name}{style.RESET_ALL}{zone_str}')
1802
+ logger.info(
1803
+ ux_utils.starting_message(f'Launching on {to_provision_cloud} '
1804
+ f'{region_name}{zone_str}.'))
1781
1805
  start = time.time()
1782
1806
 
1783
1807
  # Edge case: /tmp/ray does not exist, so autoscaler can't create/store
@@ -1802,19 +1826,6 @@ class RetryingVmProvisioner(object):
1802
1826
  if returncode == 0:
1803
1827
  return False
1804
1828
 
1805
- if isinstance(to_provision_cloud, clouds.Azure):
1806
- if 'Failed to invoke the Azure CLI' in stderr:
1807
- logger.info(
1808
- 'Retrying head node provisioning due to Azure CLI '
1809
- 'issues.')
1810
- return True
1811
- if ('Head node fetch timed out. Failed to create head node.'
1812
- in stderr):
1813
- logger.info(
1814
- 'Retrying head node provisioning due to head fetching '
1815
- 'timeout.')
1816
- return True
1817
-
1818
1829
  if isinstance(to_provision_cloud, clouds.Lambda):
1819
1830
  if 'Your API requests are being rate limited.' in stderr:
1820
1831
  logger.info(
@@ -1892,11 +1903,6 @@ class RetryingVmProvisioner(object):
1892
1903
  head_internal_ip, head_external_ip)
1893
1904
 
1894
1905
  # All code below is handling num_nodes > 1.
1895
- provision_str = ('Successfully provisioned or found existing head '
1896
- 'instance.')
1897
- logger.info(f'{style.BRIGHT}{provision_str} '
1898
- f'Waiting for workers.{style.RESET_ALL}')
1899
-
1900
1906
  # FIXME(zongheng): the below requires ray processes are up on head. To
1901
1907
  # repro it failing: launch a 2-node cluster, log into head and ray
1902
1908
  # stop, then launch again.
@@ -1985,8 +1991,13 @@ class RetryingVmProvisioner(object):
1985
1991
  to_provision_config: ToProvisionConfig,
1986
1992
  dryrun: bool,
1987
1993
  stream_logs: bool,
1994
+ skip_unnecessary_provisioning: bool,
1988
1995
  ) -> Dict[str, Any]:
1989
- """Provision with retries for all launchable resources."""
1996
+ """Provision with retries for all launchable resources.
1997
+
1998
+ Returns the config_dict from _retry_zones() - see its docstring for
1999
+ details.
2000
+ """
1990
2001
  cluster_name = to_provision_config.cluster_name
1991
2002
  to_provision = to_provision_config.resources
1992
2003
  num_nodes = to_provision_config.num_nodes
@@ -1995,10 +2006,28 @@ class RetryingVmProvisioner(object):
1995
2006
  prev_cluster_ever_up = to_provision_config.prev_cluster_ever_up
1996
2007
  launchable_retries_disabled = (self._dag is None or
1997
2008
  self._optimize_target is None)
2009
+ skip_if_config_hash_matches = (to_provision_config.prev_config_hash if
2010
+ skip_unnecessary_provisioning else None)
1998
2011
 
1999
2012
  failover_history: List[Exception] = list()
2013
+ resource_exceptions: Dict[resources_lib.Resources, Exception] = dict()
2014
+ # If the user is using local credentials which may expire, the
2015
+ # controller may leak resources if the credentials expire while a job
2016
+ # is running. Here we check the enabled clouds and expiring credentials
2017
+ # and raise a warning to the user.
2018
+ if task.is_controller_task():
2019
+ enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh()
2020
+ expirable_clouds = backend_utils.get_expirable_clouds(
2021
+ enabled_clouds)
2022
+
2023
+ if len(expirable_clouds) > 0:
2024
+ warnings = (f'\033[93mWarning: Credentials used for '
2025
+ f'{expirable_clouds} may expire. Clusters may be '
2026
+ f'leaked if the credentials expire while jobs '
2027
+ f'are running. It is recommended to use credentials'
2028
+ f' that never expire or a service account.\033[0m')
2029
+ logger.warning(warnings)
2000
2030
 
2001
- style = colorama.Style
2002
2031
  # Retrying launchable resources.
2003
2032
  while True:
2004
2033
  try:
@@ -2008,11 +2037,12 @@ class RetryingVmProvisioner(object):
2008
2037
  if dryrun:
2009
2038
  cloud_user = None
2010
2039
  else:
2011
- cloud_user = to_provision.cloud.get_current_user_identity()
2040
+ cloud_user = to_provision.cloud.get_active_user_identity()
2012
2041
 
2013
2042
  requested_features = self._requested_features.copy()
2014
- # Skip stop feature for Kubernetes controllers.
2015
- if (isinstance(to_provision.cloud, clouds.Kubernetes) and
2043
+ # Skip stop feature for Kubernetes and RunPod controllers.
2044
+ if (isinstance(to_provision.cloud,
2045
+ (clouds.Kubernetes, clouds.RunPod)) and
2016
2046
  controller_utils.Controllers.from_name(cluster_name)
2017
2047
  is not None):
2018
2048
  assert (clouds.CloudImplementationFeatures.STOP
@@ -2034,7 +2064,8 @@ class RetryingVmProvisioner(object):
2034
2064
  cloud_user_identity=cloud_user,
2035
2065
  prev_cluster_status=prev_cluster_status,
2036
2066
  prev_handle=prev_handle,
2037
- prev_cluster_ever_up=prev_cluster_ever_up)
2067
+ prev_cluster_ever_up=prev_cluster_ever_up,
2068
+ skip_if_config_hash_matches=skip_if_config_hash_matches)
2038
2069
  if dryrun:
2039
2070
  return config_dict
2040
2071
  except (exceptions.InvalidClusterNameError,
@@ -2067,17 +2098,12 @@ class RetryingVmProvisioner(object):
2067
2098
  # Provisioning succeeded.
2068
2099
  break
2069
2100
 
2070
- if to_provision.zone is None:
2071
- region_or_zone_str = str(to_provision.region)
2072
- else:
2073
- region_or_zone_str = str(to_provision.zone)
2074
- logger.warning(f'\n{style.BRIGHT}Provision failed for {num_nodes}x '
2075
- f'{to_provision} in {region_or_zone_str}. '
2076
- f'Trying other locations (if any).{style.RESET_ALL}')
2077
2101
  if prev_cluster_status is None:
2078
2102
  # Add failed resources to the blocklist, only when it
2079
2103
  # is in fallback mode.
2080
2104
  _add_to_blocked_resources(self._blocked_resources, to_provision)
2105
+ assert len(failover_history) > 0
2106
+ resource_exceptions[to_provision] = failover_history[-1]
2081
2107
  else:
2082
2108
  # If we reach here, it means that the existing cluster must have
2083
2109
  # a previous status of INIT, because other statuses (UP,
@@ -2088,8 +2114,10 @@ class RetryingVmProvisioner(object):
2088
2114
  ), prev_cluster_status
2089
2115
  assert global_user_state.get_handle_from_cluster_name(
2090
2116
  cluster_name) is None, cluster_name
2091
- logger.info('Retrying provisioning with requested resources '
2092
- f'{task.num_nodes}x {task.resources}')
2117
+ logger.info(
2118
+ ux_utils.retry_message(
2119
+ f'Retrying provisioning with requested resources: '
2120
+ f'{task.num_nodes}x {task.resources}'))
2093
2121
  # Retry with the current, potentially "smaller" resources:
2094
2122
  # to_provision == the current new resources (e.g., V100:1),
2095
2123
  # which may be "smaller" than the original (V100:8).
@@ -2099,12 +2127,18 @@ class RetryingVmProvisioner(object):
2099
2127
  prev_cluster_status = None
2100
2128
  prev_handle = None
2101
2129
 
2130
+ retry_message = ux_utils.retry_message(
2131
+ 'Trying other potential resources.')
2132
+ logger.warning(f'\n{retry_message}')
2133
+ log_path = os.path.join(self.log_dir, 'provision.log')
2134
+ rich_utils.force_update_status(
2135
+ ux_utils.spinner_message('Looking for resources', log_path))
2102
2136
  # Set to None so that sky.optimize() will assign a new one
2103
2137
  # (otherwise will skip re-optimizing this task).
2104
2138
  # TODO: set all remaining tasks' best_resources to None.
2105
2139
  task.best_resources = None
2106
2140
  try:
2107
- self._dag = sky.optimize(
2141
+ self._dag = optimizer.Optimizer.optimize(
2108
2142
  self._dag,
2109
2143
  minimize=self._optimize_target,
2110
2144
  blocked_resources=self._blocked_resources)
@@ -2114,7 +2148,14 @@ class RetryingVmProvisioner(object):
2114
2148
  # possible resources or the requested resources is too
2115
2149
  # restrictive. If we reach here, our failover logic finally
2116
2150
  # ends here.
2117
- raise e.with_failover_history(failover_history)
2151
+ table = log_utils.create_table(['Resource', 'Reason'])
2152
+ for (resource, exception) in resource_exceptions.items():
2153
+ table.add_row(
2154
+ [resources_utils.format_resource(resource), exception])
2155
+ table.max_table_width = shutil.get_terminal_size().columns
2156
+ raise exceptions.ResourcesUnavailableError(
2157
+ _RESOURCES_UNAVAILABLE_LOG + '\n' + table.get_string(),
2158
+ failover_history=failover_history)
2118
2159
  to_provision = task.best_resources
2119
2160
  assert task in self._dag.tasks, 'Internal logic error.'
2120
2161
  assert to_provision is not None, task
@@ -2143,31 +2184,30 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2143
2184
  """
2144
2185
  # Bump if any fields get added/removed/changed, and add backward
2145
2186
  # compaitibility logic in __setstate__.
2146
- _VERSION = 8
2187
+ _VERSION = 10
2147
2188
 
2148
2189
  def __init__(
2149
2190
  self,
2150
2191
  *,
2151
2192
  cluster_name: str,
2152
2193
  cluster_name_on_cloud: str,
2153
- cluster_yaml: str,
2194
+ cluster_yaml: Optional[str],
2154
2195
  launched_nodes: int,
2155
2196
  launched_resources: resources_lib.Resources,
2156
2197
  stable_internal_external_ips: Optional[List[Tuple[str,
2157
2198
  str]]] = None,
2158
2199
  stable_ssh_ports: Optional[List[int]] = None,
2159
- cluster_info: Optional[provision_common.ClusterInfo] = None,
2160
- # The following 2 fields are deprecated. SkyPilot new provisioner
2161
- # API handles the TPU node creation/deletion.
2162
- # Backward compatibility for TPU nodes created before #2943.
2163
- # TODO (zhwu): Remove this after 0.6.0.
2164
- tpu_create_script: Optional[str] = None,
2165
- tpu_delete_script: Optional[str] = None) -> None:
2200
+ cluster_info: Optional[provision_common.ClusterInfo] = None
2201
+ ) -> None:
2166
2202
  self._version = self._VERSION
2167
2203
  self.cluster_name = cluster_name
2168
2204
  self.cluster_name_on_cloud = cluster_name_on_cloud
2169
- self._cluster_yaml = cluster_yaml.replace(os.path.expanduser('~'), '~',
2170
- 1)
2205
+ # Replace the home directory with ~ for better robustness across systems
2206
+ # with different home directories.
2207
+ if cluster_yaml is not None and cluster_yaml.startswith(
2208
+ os.path.expanduser('~')):
2209
+ cluster_yaml = cluster_yaml.replace(os.path.expanduser('~'), '~', 1)
2210
+ self._cluster_yaml = cluster_yaml
2171
2211
  # List of (internal_ip, feasible_ip) tuples for all the nodes in the
2172
2212
  # cluster, sorted by the feasible ips. The feasible ips can be either
2173
2213
  # internal or external ips, depending on the use_internal_ips flag.
@@ -2177,12 +2217,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2177
2217
  self.launched_nodes = launched_nodes
2178
2218
  self.launched_resources = launched_resources
2179
2219
  self.docker_user: Optional[str] = None
2180
- # Deprecated. SkyPilot new provisioner API handles the TPU node
2181
- # creation/deletion.
2182
- # Backward compatibility for TPU nodes created before #2943.
2183
- # TODO (zhwu): Remove this after 0.6.0.
2184
- self.tpu_create_script = tpu_create_script
2185
- self.tpu_delete_script = tpu_delete_script
2186
2220
 
2187
2221
  def __repr__(self):
2188
2222
  return (f'ResourceHandle('
@@ -2198,10 +2232,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2198
2232
  f'\n\tlaunched_resources={self.launched_nodes}x '
2199
2233
  f'{self.launched_resources}, '
2200
2234
  f'\n\tdocker_user={self.docker_user},'
2201
- f'\n\tssh_user={self.ssh_user},'
2202
- # TODO (zhwu): Remove this after 0.6.0.
2203
- f'\n\ttpu_create_script={self.tpu_create_script}, '
2204
- f'\n\ttpu_delete_script={self.tpu_delete_script})')
2235
+ f'\n\tssh_user={self.ssh_user}')
2205
2236
 
2206
2237
  def get_cluster_name(self):
2207
2238
  return self.cluster_name
@@ -2214,26 +2245,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2214
2245
  return common_utils.read_yaml(self.cluster_yaml).get(
2215
2246
  'provider', {}).get('use_internal_ips', False)
2216
2247
 
2217
- def _update_cluster_region(self):
2218
- """Update the region in handle.launched_resources.
2219
-
2220
- This is for backward compatibility to handle the clusters launched
2221
- long before. We should remove this after 0.6.0.
2222
- """
2223
- if self.launched_resources.region is not None:
2224
- return
2225
-
2226
- config = common_utils.read_yaml(self.cluster_yaml)
2227
- provider = config['provider']
2228
- cloud = self.launched_resources.cloud
2229
- if cloud.is_same_cloud(clouds.Azure()):
2230
- region = provider['location']
2231
- elif cloud.is_same_cloud(clouds.GCP()) or cloud.is_same_cloud(
2232
- clouds.AWS()):
2233
- region = provider['region']
2234
-
2235
- self.launched_resources = self.launched_resources.copy(region=region)
2236
-
2237
2248
  def update_ssh_ports(self, max_attempts: int = 1) -> None:
2238
2249
  """Fetches and sets the SSH ports for the cluster nodes.
2239
2250
 
@@ -2322,9 +2333,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2322
2333
  """
2323
2334
  if cluster_info is not None:
2324
2335
  self.cached_cluster_info = cluster_info
2325
- use_internal_ips = self._use_internal_ips()
2326
- cluster_feasible_ips = self.cached_cluster_info.get_feasible_ips(
2327
- use_internal_ips)
2336
+ cluster_feasible_ips = self.cached_cluster_info.get_feasible_ips()
2328
2337
  cluster_internal_ips = self.cached_cluster_info.get_feasible_ips(
2329
2338
  force_internal_ips=True)
2330
2339
  else:
@@ -2403,7 +2412,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2403
2412
  internal_external_ips[1:], key=lambda x: x[1])
2404
2413
  self.stable_internal_external_ips = stable_internal_external_ips
2405
2414
 
2406
- @functools.lru_cache()
2415
+ @annotations.lru_cache(scope='global')
2407
2416
  @timeline.event
2408
2417
  def get_command_runners(self,
2409
2418
  force_cached: bool = False,
@@ -2414,8 +2423,20 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2414
2423
  self.cluster_yaml, self.docker_user, self.ssh_user)
2415
2424
  if avoid_ssh_control:
2416
2425
  ssh_credentials.pop('ssh_control_name', None)
2426
+ updated_to_skypilot_provisioner_after_provisioned = (
2427
+ self.launched_resources.cloud.PROVISIONER_VERSION >=
2428
+ clouds.ProvisionerVersion.SKYPILOT and
2429
+ self.cached_external_ips is not None and
2430
+ self.cached_cluster_info is None)
2431
+ if updated_to_skypilot_provisioner_after_provisioned:
2432
+ logger.debug(
2433
+ f'{self.launched_resources.cloud} has been updated to the new '
2434
+ f'provisioner after cluster {self.cluster_name} was '
2435
+ f'provisioned. Cached IPs are used for connecting to the '
2436
+ 'cluster.')
2417
2437
  if (clouds.ProvisionerVersion.RAY_PROVISIONER_SKYPILOT_TERMINATOR >=
2418
- self.launched_resources.cloud.PROVISIONER_VERSION):
2438
+ self.launched_resources.cloud.PROVISIONER_VERSION or
2439
+ updated_to_skypilot_provisioner_after_provisioned):
2419
2440
  ip_list = (self.cached_external_ips
2420
2441
  if force_cached else self.external_ips())
2421
2442
  if ip_list is None:
@@ -2428,7 +2449,17 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2428
2449
  zip(ip_list, port_list), **ssh_credentials)
2429
2450
  return runners
2430
2451
  if self.cached_cluster_info is None:
2431
- assert not force_cached, 'cached_cluster_info is None.'
2452
+ # We have `and self.cached_external_ips is None` here, because
2453
+ # when a cluster's cloud is just upgraded to the new provisioner,
2454
+ # although it has the cached_external_ips, the cached_cluster_info
2455
+ # can be None. We need to update it here, even when force_cached is
2456
+ # set to True.
2457
+ # TODO: We can remove `self.cached_external_ips is None` after
2458
+ # all clouds moved to new provisioner.
2459
+ if force_cached and self.cached_external_ips is None:
2460
+ raise RuntimeError(
2461
+ 'Tried to use cached cluster info, but it\'s missing for '
2462
+ f'cluster "{self.cluster_name}"')
2432
2463
  self._update_cluster_info()
2433
2464
  assert self.cached_cluster_info is not None, self
2434
2465
  runners = provision_lib.get_command_runners(
@@ -2498,9 +2529,15 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2498
2529
  self.docker_user = docker_user
2499
2530
 
2500
2531
  @property
2501
- def cluster_yaml(self):
2532
+ def cluster_yaml(self) -> Optional[str]:
2533
+ if self._cluster_yaml is None:
2534
+ return None
2502
2535
  return os.path.expanduser(self._cluster_yaml)
2503
2536
 
2537
+ @cluster_yaml.setter
2538
+ def cluster_yaml(self, value: Optional[str]):
2539
+ self._cluster_yaml = value
2540
+
2504
2541
  @property
2505
2542
  def ssh_user(self):
2506
2543
  if self.cached_cluster_info is not None:
@@ -2530,7 +2567,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2530
2567
  """Returns number of IPs per node in the cluster, handling TPU Pod."""
2531
2568
  is_tpu_vm_pod = gcp_utils.is_tpu_vm_pod(self.launched_resources)
2532
2569
  if is_tpu_vm_pod:
2533
- num_ips = gcp_utils.get_num_tpu_devices(self.launched_resources)
2570
+ num_ips = len(self.internal_ips())
2534
2571
  else:
2535
2572
  num_ips = 1
2536
2573
  return num_ips
@@ -2559,6 +2596,35 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2559
2596
  if version < 8:
2560
2597
  self.cached_cluster_info = None
2561
2598
 
2599
+ if version < 9:
2600
+ # For backward compatibility, we should update the region of a
2601
+ # SkyPilot cluster on Kubernetes to the actual context it is using.
2602
+ # pylint: disable=import-outside-toplevel
2603
+ launched_resources = state['launched_resources']
2604
+ if isinstance(launched_resources.cloud, clouds.Kubernetes):
2605
+ yaml_config = common_utils.read_yaml(
2606
+ os.path.expanduser(state['_cluster_yaml']))
2607
+ context = kubernetes_utils.get_context_from_config(
2608
+ yaml_config['provider'])
2609
+ state['launched_resources'] = launched_resources.copy(
2610
+ region=context)
2611
+
2612
+ if version < 10:
2613
+ # In #4660, we keep the cluster entry in the database even when it
2614
+ # is in the transition from one region to another during the
2615
+ # failover. We allow `handle.cluster_yaml` to be None to indicate
2616
+ # that the cluster yaml is intentionally removed. Before that PR,
2617
+ # the `handle.cluster_yaml` was never None, even if it was
2618
+ # intentionally removed.
2619
+ #
2620
+ # For backward compatibility, we set the `_cluster_yaml` to None
2621
+ # if the file does not exist, assuming that every removal of the
2622
+ # _cluster_yaml for an existing cluster was done intentionally
2623
+ # by SkyPilot.
2624
+ if state['_cluster_yaml'] is not None and not os.path.exists(
2625
+ os.path.expanduser(state['_cluster_yaml'])):
2626
+ state['_cluster_yaml'] = None
2627
+
2562
2628
  self.__dict__.update(state)
2563
2629
 
2564
2630
  # Because the update_cluster_ips and update_ssh_ports
@@ -2574,8 +2640,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2574
2640
  if version < 4:
2575
2641
  self.update_ssh_ports()
2576
2642
 
2577
- self._update_cluster_region()
2578
-
2579
2643
  if version < 8:
2580
2644
  try:
2581
2645
  self._update_cluster_info()
@@ -2585,6 +2649,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2585
2649
  pass
2586
2650
 
2587
2651
 
2652
+ @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
2588
2653
  class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2589
2654
  """Backend: runs on cloud virtual machines, managed by Ray.
2590
2655
 
@@ -2599,7 +2664,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2599
2664
  ResourceHandle = CloudVmRayResourceHandle # pylint: disable=invalid-name
2600
2665
 
2601
2666
  def __init__(self):
2602
- self.run_timestamp = backend_utils.get_run_timestamp()
2667
+ self.run_timestamp = sky_logging.get_run_timestamp()
2603
2668
  # NOTE: do not expanduser() here, as this '~/...' path is used for
2604
2669
  # remote as well to be expanded on the remote side.
2605
2670
  self.log_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
@@ -2614,7 +2679,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2614
2679
 
2615
2680
  # Command for running the setup script. It is only set when the
2616
2681
  # setup needs to be run outside the self._setup() and as part of
2617
- # a job (--detach-setup).
2682
+ # a job (detach_setup, default).
2618
2683
  self._setup_cmd = None
2619
2684
 
2620
2685
  # --- Implementation of Backend APIs ---
@@ -2623,10 +2688,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2623
2688
  self._dag = kwargs.pop('dag', self._dag)
2624
2689
  self._optimize_target = kwargs.pop(
2625
2690
  'optimize_target',
2626
- self._optimize_target) or optimizer.OptimizeTarget.COST
2691
+ self._optimize_target) or common.OptimizeTarget.COST
2627
2692
  self._requested_features = kwargs.pop('requested_features',
2628
2693
  self._requested_features)
2629
- assert len(kwargs) == 0, f'Unexpected kwargs: {kwargs}'
2694
+ assert not kwargs, f'Unexpected kwargs: {kwargs}'
2630
2695
 
2631
2696
  def check_resources_fit_cluster(
2632
2697
  self,
@@ -2656,8 +2721,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2656
2721
  if record is not None:
2657
2722
  usage_lib.messages.usage.update_cluster_status(record['status'])
2658
2723
 
2659
- # Backward compatibility: the old launched_resources without region info
2660
- # was handled by ResourceHandle._update_cluster_region.
2661
2724
  assert launched_resources.region is not None, handle
2662
2725
 
2663
2726
  mismatch_str = (f'To fix: specify a new cluster name, or down the '
@@ -2720,17 +2783,39 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2720
2783
  f' Existing:\t{handle.launched_nodes}x '
2721
2784
  f'{handle.launched_resources}\n'
2722
2785
  f'{mismatch_str}')
2786
+ else:
2787
+ # For fractional acc count clusters, we round up the number of accs
2788
+ # to 1 (sky/utils/resources_utils.py::make_ray_custom_resources_str)
2789
+ # Here we scale the required acc count to (required / launched) * 1
2790
+ # so the total number of accs is the same as the requested number.
2791
+ launched_accs = launched_resources.accelerators
2792
+ if (launched_accs is not None and
2793
+ valid_resource.accelerators is not None):
2794
+ for _, count in launched_accs.items():
2795
+ if isinstance(count, float) and not count.is_integer():
2796
+ valid_resource = valid_resource.copy(
2797
+ accelerators={
2798
+ k: v / count
2799
+ for k, v in valid_resource.accelerators.items()
2800
+ })
2723
2801
  return valid_resource
2724
2802
 
2725
2803
  def _provision(
2726
- self,
2727
- task: task_lib.Task,
2728
- to_provision: Optional[resources_lib.Resources],
2729
- dryrun: bool,
2730
- stream_logs: bool,
2731
- cluster_name: str,
2732
- retry_until_up: bool = False) -> Optional[CloudVmRayResourceHandle]:
2733
- """Provisions using 'ray up'.
2804
+ self,
2805
+ task: task_lib.Task,
2806
+ to_provision: Optional[resources_lib.Resources],
2807
+ dryrun: bool,
2808
+ stream_logs: bool,
2809
+ cluster_name: str,
2810
+ retry_until_up: bool = False,
2811
+ skip_unnecessary_provisioning: bool = False,
2812
+ ) -> Optional[CloudVmRayResourceHandle]:
2813
+ """Provisions the cluster, or re-provisions an existing cluster.
2814
+
2815
+ Use the SKYPILOT provisioner if it's supported by the cloud, otherwise
2816
+ use 'ray up'.
2817
+
2818
+ See also docstring for Backend.provision().
2734
2819
 
2735
2820
  Raises:
2736
2821
  exceptions.ClusterOwnerIdentityMismatchError: if the cluster
@@ -2744,7 +2829,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2744
2829
  (e.g., cluster name invalid) or a region/zone throwing
2745
2830
  resource unavailability.
2746
2831
  exceptions.CommandError: any ssh command error.
2747
- RuntimeErorr: raised when 'rsync' is not installed.
2832
+ RuntimeError: raised when 'rsync' is not installed.
2748
2833
  # TODO(zhwu): complete the list of exceptions.
2749
2834
  """
2750
2835
  # FIXME: ray up for Azure with different cluster_names will overwrite
@@ -2811,55 +2896,78 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2811
2896
  local_wheel_path,
2812
2897
  wheel_hash,
2813
2898
  blocked_resources=task.blocked_resources)
2899
+ log_path = os.path.join(self.log_dir, 'provision.log')
2900
+ rich_utils.force_update_status(
2901
+ ux_utils.spinner_message('Launching', log_path))
2814
2902
  config_dict = retry_provisioner.provision_with_retries(
2815
- task, to_provision_config, dryrun, stream_logs)
2903
+ task, to_provision_config, dryrun, stream_logs,
2904
+ skip_unnecessary_provisioning)
2816
2905
  break
2817
2906
  except exceptions.ResourcesUnavailableError as e:
2818
- # Do not remove the stopped cluster from the global state
2819
- # if failed to start.
2907
+ log_path = retry_provisioner.log_dir + '/provision.log'
2908
+ error_message = (
2909
+ f'{colorama.Fore.RED}Failed to provision all '
2910
+ f'possible launchable resources.'
2911
+ f'{colorama.Style.RESET_ALL}'
2912
+ ' Relax the task\'s resource requirements: '
2913
+ f'{task.num_nodes}x {list(task.resources)[0]}')
2820
2914
  if e.no_failover:
2821
2915
  error_message = str(e)
2822
- else:
2823
- # Clean up the cluster's entry in `sky status`.
2824
- global_user_state.remove_cluster(cluster_name,
2825
- terminate=True)
2826
- usage_lib.messages.usage.update_final_cluster_status(
2827
- None)
2828
- error_message = (
2829
- 'Failed to provision all possible launchable '
2830
- 'resources.'
2831
- f' Relax the task\'s resource requirements: '
2832
- f'{task.num_nodes}x {list(task.resources)[0]}')
2916
+
2833
2917
  if retry_until_up:
2834
2918
  logger.error(error_message)
2835
2919
  # Sleep and retry.
2836
2920
  gap_seconds = backoff.current_backoff()
2837
2921
  plural = 's' if attempt_cnt > 1 else ''
2838
- logger.info(
2839
- f'{colorama.Style.BRIGHT}=== Retry until up ==='
2840
- f'{colorama.Style.RESET_ALL}\n'
2841
- f'Retrying provisioning after {gap_seconds:.0f}s '
2842
- '(backoff with random jittering). '
2843
- f'Already tried {attempt_cnt} attempt{plural}.')
2922
+ retry_message = ux_utils.retry_message(
2923
+ f'Retry after {gap_seconds:.0f}s '
2924
+ f'({attempt_cnt} attempt{plural}). ')
2925
+ logger.info(f'\n{retry_message} '
2926
+ f'{ux_utils.log_path_hint(log_path)}'
2927
+ f'{colorama.Style.RESET_ALL}')
2844
2928
  attempt_cnt += 1
2845
2929
  time.sleep(gap_seconds)
2846
2930
  continue
2931
+ # Clean up the cluster's entry in `sky status`.
2932
+ # Do not remove the stopped cluster from the global state
2933
+ # if failed to start.
2934
+ if not e.no_failover:
2935
+ global_user_state.remove_cluster(cluster_name,
2936
+ terminate=True)
2937
+ usage_lib.messages.usage.update_final_cluster_status(
2938
+ None)
2939
+ logger.error(
2940
+ ux_utils.error_message(
2941
+ 'Failed to provision resources. '
2942
+ f'{ux_utils.log_path_hint(log_path)}'))
2847
2943
  error_message += (
2848
- '\nTo keep retrying until the cluster is up, use the '
2849
- '`--retry-until-up` flag.')
2944
+ '\nTo keep retrying until the cluster is up, use '
2945
+ 'the `--retry-until-up` flag.')
2850
2946
  with ux_utils.print_exception_no_traceback():
2851
2947
  raise exceptions.ResourcesUnavailableError(
2852
- error_message,
2948
+ error_message + '\n' + str(e),
2853
2949
  failover_history=e.failover_history) from None
2854
2950
  if dryrun:
2855
2951
  record = global_user_state.get_cluster_from_name(cluster_name)
2856
2952
  return record['handle'] if record is not None else None
2857
2953
 
2954
+ if config_dict['provisioning_skipped']:
2955
+ # Skip further provisioning.
2956
+ # In this case, we won't have certain fields in the config_dict
2957
+ # ('handle', 'provision_record', 'resources_vars')
2958
+ # We need to return the handle - but it should be the existing
2959
+ # handle for the cluster.
2960
+ record = global_user_state.get_cluster_from_name(cluster_name)
2961
+ assert record is not None and record['handle'] is not None, (
2962
+ cluster_name, record)
2963
+ return record['handle']
2964
+
2858
2965
  if 'provision_record' in config_dict:
2859
2966
  # New provisioner is used here.
2860
2967
  handle = config_dict['handle']
2861
2968
  provision_record = config_dict['provision_record']
2862
2969
  resources_vars = config_dict['resources_vars']
2970
+ config_hash = config_dict.get('config_hash', None)
2863
2971
 
2864
2972
  # Setup SkyPilot runtime after the cluster is provisioned
2865
2973
  # 1. Wait for SSH to be ready.
@@ -2869,8 +2977,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2869
2977
  # 4. Starting ray cluster and skylet.
2870
2978
  cluster_info = provisioner.post_provision_runtime_setup(
2871
2979
  repr(handle.launched_resources.cloud),
2872
- provisioner.ClusterName(handle.cluster_name,
2873
- handle.cluster_name_on_cloud),
2980
+ resources_utils.ClusterName(handle.cluster_name,
2981
+ handle.cluster_name_on_cloud),
2874
2982
  handle.cluster_yaml,
2875
2983
  provision_record=provision_record,
2876
2984
  custom_resource=resources_vars.get('custom_resources'),
@@ -2893,8 +3001,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2893
3001
 
2894
3002
  self._update_after_cluster_provisioned(
2895
3003
  handle, to_provision_config.prev_handle, task,
2896
- prev_cluster_status, handle.external_ips(),
2897
- handle.external_ssh_ports(), lock_path)
3004
+ prev_cluster_status, lock_path, config_hash)
2898
3005
  return handle
2899
3006
 
2900
3007
  cluster_config_file = config_dict['ray']
@@ -2957,7 +3064,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2957
3064
  # and restarted if necessary.
2958
3065
  logger.debug('Checking if skylet is running on the head node.')
2959
3066
  with rich_utils.safe_status(
2960
- '[bold cyan]Preparing SkyPilot runtime'):
3067
+ ux_utils.spinner_message('Preparing SkyPilot runtime')):
2961
3068
  # We need to source bashrc for skylet to make sure the autostop
2962
3069
  # event can access the path to the cloud CLIs.
2963
3070
  self.run_on_head(handle,
@@ -2966,7 +3073,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2966
3073
 
2967
3074
  self._update_after_cluster_provisioned(
2968
3075
  handle, to_provision_config.prev_handle, task,
2969
- prev_cluster_status, ip_list, ssh_port_list, lock_path)
3076
+ prev_cluster_status, lock_path, config_hash)
2970
3077
  return handle
2971
3078
 
2972
3079
  def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -2984,8 +3091,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2984
3091
  prev_handle: Optional[CloudVmRayResourceHandle],
2985
3092
  task: task_lib.Task,
2986
3093
  prev_cluster_status: Optional[status_lib.ClusterStatus],
2987
- ip_list: List[str], ssh_port_list: List[int],
2988
- lock_path: str) -> None:
3094
+ lock_path: str, config_hash: str) -> None:
2989
3095
  usage_lib.messages.usage.update_cluster_resources(
2990
3096
  handle.launched_nodes, handle.launched_resources)
2991
3097
  usage_lib.messages.usage.update_final_cluster_status(
@@ -3000,7 +3106,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3000
3106
  cmd = job_lib.JobLibCodeGen.update_status()
3001
3107
  logger.debug('Update job queue on remote cluster.')
3002
3108
  with rich_utils.safe_status(
3003
- '[bold cyan]Preparing SkyPilot runtime'):
3109
+ ux_utils.spinner_message('Preparing SkyPilot runtime')):
3004
3110
  returncode, _, stderr = self.run_on_head(handle,
3005
3111
  cmd,
3006
3112
  require_outputs=True)
@@ -3031,9 +3137,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3031
3137
  resources_utils.port_ranges_to_set(current_ports) -
3032
3138
  resources_utils.port_ranges_to_set(prev_ports))
3033
3139
  if open_new_ports:
3034
- with rich_utils.safe_status(
3035
- '[bold cyan]Launching - Opening new ports'):
3036
- self._open_ports(handle)
3140
+ cloud = handle.launched_resources.cloud
3141
+ if not (cloud.OPEN_PORTS_VERSION <=
3142
+ clouds.OpenPortsVersion.LAUNCH_ONLY):
3143
+ with rich_utils.safe_status(
3144
+ ux_utils.spinner_message(
3145
+ 'Launching - Opening new ports')):
3146
+ self._open_ports(handle)
3037
3147
 
3038
3148
  with timeline.Event('backend.provision.post_process'):
3039
3149
  global_user_state.add_or_update_cluster(
@@ -3041,15 +3151,21 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3041
3151
  handle,
3042
3152
  set(task.resources),
3043
3153
  ready=True,
3154
+ config_hash=config_hash,
3044
3155
  )
3045
3156
  usage_lib.messages.usage.update_final_cluster_status(
3046
3157
  status_lib.ClusterStatus.UP)
3047
- auth_config = common_utils.read_yaml(handle.cluster_yaml)['auth']
3048
- backend_utils.SSHConfigHelper.add_cluster(handle.cluster_name,
3049
- ip_list, auth_config,
3050
- ssh_port_list,
3051
- handle.docker_user,
3052
- handle.ssh_user)
3158
+ # We still add the cluster to ssh config file on API server, this
3159
+ # is helpful for people trying to use `sky launch`'ed cluster for
3160
+ # ssh proxy jump.
3161
+ auth_config = backend_utils.ssh_credential_from_yaml(
3162
+ handle.cluster_yaml,
3163
+ ssh_user=handle.ssh_user,
3164
+ docker_user=handle.docker_user)
3165
+ cluster_utils.SSHConfigHelper.add_cluster(
3166
+ handle.cluster_name, handle.cached_external_ips, auth_config,
3167
+ handle.cached_external_ssh_ports, handle.docker_user,
3168
+ handle.ssh_user)
3053
3169
 
3054
3170
  common_utils.remove_file_if_exists(lock_path)
3055
3171
 
@@ -3078,9 +3194,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3078
3194
  dir_size = backend_utils.path_size_megabytes(full_workdir)
3079
3195
  if dir_size >= _PATH_SIZE_MEGABYTES_WARN_THRESHOLD:
3080
3196
  logger.warning(
3081
- f'{fore.YELLOW}The size of workdir {workdir!r} '
3197
+ f' {fore.YELLOW}The size of workdir {workdir!r} '
3082
3198
  f'is {dir_size} MB. Try to keep workdir small or use '
3083
- '.gitignore to exclude large files, as large sizes will slow '
3199
+ '.skyignore to exclude large files, as large sizes will slow '
3084
3200
  f'down rsync.{style.RESET_ALL}')
3085
3201
 
3086
3202
  log_path = os.path.join(self.log_dir, 'workdir_sync.log')
@@ -3100,17 +3216,17 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3100
3216
  num_nodes = handle.launched_nodes
3101
3217
  plural = 's' if num_nodes > 1 else ''
3102
3218
  logger.info(
3103
- f'{fore.CYAN}Syncing workdir (to {num_nodes} node{plural}): '
3104
- f'{style.BRIGHT}{workdir}{style.RESET_ALL}'
3105
- f' -> '
3106
- f'{style.BRIGHT}{SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
3219
+ f' {style.DIM}Syncing workdir (to {num_nodes} node{plural}): '
3220
+ f'{workdir} -> {SKY_REMOTE_WORKDIR}{style.RESET_ALL}')
3107
3221
  os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
3108
3222
  os.system(f'touch {log_path}')
3109
- tail_cmd = f'tail -n100 -f {log_path}'
3110
- logger.info('To view detailed progress: '
3111
- f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
3112
- with rich_utils.safe_status('[bold cyan]Syncing[/]'):
3113
- subprocess_utils.run_in_parallel(_sync_workdir_node, runners)
3223
+ num_threads = subprocess_utils.get_parallel_threads(
3224
+ str(handle.launched_resources.cloud))
3225
+ with rich_utils.safe_status(
3226
+ ux_utils.spinner_message('Syncing workdir', log_path)):
3227
+ subprocess_utils.run_in_parallel(_sync_workdir_node, runners,
3228
+ num_threads)
3229
+ logger.info(ux_utils.finishing_message('Synced workdir.', log_path))
3114
3230
 
3115
3231
  def _sync_file_mounts(
3116
3232
  self,
@@ -3118,18 +3234,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3118
3234
  all_file_mounts: Optional[Dict[Path, Path]],
3119
3235
  storage_mounts: Optional[Dict[Path, storage_lib.Storage]],
3120
3236
  ) -> None:
3121
- """Mounts all user files to the remote nodes."""
3122
- controller_utils.replace_skypilot_config_path_in_file_mounts(
3123
- handle.launched_resources.cloud, all_file_mounts)
3124
- self._execute_file_mounts(handle, all_file_mounts)
3125
- self._execute_storage_mounts(handle, storage_mounts)
3126
- self._set_storage_mounts_metadata(handle.cluster_name, storage_mounts)
3237
+ """Mounts all user files to the remote nodes.
3238
+
3239
+ Note: This does not handle COPY storage_mounts. These should have
3240
+ already been translated into file_mounts by task.sync_storage_mounts().
3241
+
3242
+ TODO: Delete COPY storage_mounts in task.sync_storage_mounts(), and
3243
+ assert here that all storage_mounts are MOUNT mode.
3244
+ """
3245
+ with rich_utils.safe_status(ux_utils.spinner_message('Syncing files')):
3246
+ controller_utils.replace_skypilot_config_path_in_file_mounts(
3247
+ handle.launched_resources.cloud, all_file_mounts)
3248
+ self._execute_file_mounts(handle, all_file_mounts)
3249
+ self._execute_storage_mounts(handle, storage_mounts)
3250
+ self._set_storage_mounts_metadata(handle.cluster_name,
3251
+ storage_mounts)
3127
3252
 
3128
3253
  def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
3129
3254
  detach_setup: bool) -> None:
3130
3255
  start = time.time()
3131
- style = colorama.Style
3132
- fore = colorama.Fore
3133
3256
 
3134
3257
  if task.setup is None:
3135
3258
  return
@@ -3150,8 +3273,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3150
3273
  setup_script = log_lib.make_task_bash_script(setup,
3151
3274
  env_vars=setup_envs)
3152
3275
  encoded_script = shlex.quote(setup_script)
3153
- if (detach_setup or
3154
- len(encoded_script) > _MAX_INLINE_SCRIPT_LENGTH):
3276
+
3277
+ def _dump_setup_script(setup_script: str) -> None:
3155
3278
  with tempfile.NamedTemporaryFile('w', prefix='sky_setup_') as f:
3156
3279
  f.write(setup_script)
3157
3280
  f.flush()
@@ -3160,6 +3283,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3160
3283
  target=remote_setup_file_name,
3161
3284
  up=True,
3162
3285
  stream_logs=False)
3286
+
3287
+ if detach_setup or _is_command_length_over_limit(encoded_script):
3288
+ _dump_setup_script(setup_script)
3163
3289
  create_script_code = 'true'
3164
3290
  else:
3165
3291
  create_script_code = (f'{{ echo {encoded_script} > '
@@ -3167,20 +3293,52 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3167
3293
 
3168
3294
  if detach_setup:
3169
3295
  return
3296
+
3170
3297
  setup_log_path = os.path.join(self.log_dir,
3171
3298
  f'setup-{runner.node_id}.log')
3172
- returncode = runner.run(
3173
- f'{create_script_code} && {setup_cmd}',
3174
- log_path=setup_log_path,
3175
- process_stream=False,
3176
- # We do not source bashrc for setup, since bashrc is sourced
3177
- # in the script already.
3178
- # Skip an empty line and two lines due to the /bin/bash -i and
3179
- # source ~/.bashrc in the setup_cmd.
3180
- # bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long
3181
- # bash: no job control in this shell
3182
- skip_lines=3,
3183
- )
3299
+
3300
+ def _run_setup(setup_cmd: str) -> int:
3301
+ returncode = runner.run(
3302
+ setup_cmd,
3303
+ log_path=setup_log_path,
3304
+ process_stream=False,
3305
+ # We do not source bashrc for setup, since bashrc is sourced
3306
+ # in the script already.
3307
+ # Skip an empty line and two lines due to the /bin/bash -i
3308
+ # and source ~/.bashrc in the setup_cmd.
3309
+ # bash: cannot set terminal process group (7398): Inappropriate ioctl for device # pylint: disable=line-too-long
3310
+ # bash: no job control in this shell
3311
+ skip_num_lines=3)
3312
+ return returncode
3313
+
3314
+ returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
3315
+ if returncode == 255:
3316
+ is_message_too_long = False
3317
+ try:
3318
+ with open(os.path.expanduser(setup_log_path),
3319
+ 'r',
3320
+ encoding='utf-8') as f:
3321
+ if 'too long' in f.read():
3322
+ is_message_too_long = True
3323
+ except Exception as e: # pylint: disable=broad-except
3324
+ # We don't crash the setup if we cannot read the log file.
3325
+ # Instead, we should retry the setup with dumping the script
3326
+ # to a file to be safe.
3327
+ logger.debug('Failed to read setup log file '
3328
+ f'{setup_log_path}: {e}')
3329
+ is_message_too_long = True
3330
+
3331
+ if is_message_too_long:
3332
+ # If the setup script is too long, we retry it with dumping
3333
+ # the script to a file and running it with SSH. We use a
3334
+ # general length limit check before but it could be
3335
+ # inaccurate on some systems.
3336
+ logger.debug(
3337
+ 'Failed to run setup command inline due to '
3338
+ 'command length limit. Dumping setup script to '
3339
+ 'file and running it with SSH.')
3340
+ _dump_setup_script(setup_script)
3341
+ returncode = _run_setup(setup_cmd)
3184
3342
 
3185
3343
  def error_message() -> str:
3186
3344
  # Use the function to avoid tailing the file in success case
@@ -3211,23 +3369,33 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3211
3369
 
3212
3370
  num_nodes = len(runners)
3213
3371
  plural = 's' if num_nodes > 1 else ''
3372
+ node_str = f'{num_nodes} VM{plural}'
3373
+ if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
3374
+ node_str = f'{num_nodes} pod{plural}'
3375
+ controller = controller_utils.Controllers.from_name(handle.cluster_name)
3376
+ if controller is not None:
3377
+ node_str = controller.value.name
3214
3378
  if not detach_setup:
3215
- logger.info(f'{fore.CYAN}Running setup on {num_nodes} node{plural}.'
3216
- f'{style.RESET_ALL}')
3379
+ logger.info(
3380
+ ux_utils.starting_message(f'Running setup on {node_str}.'))
3217
3381
  # TODO(zhwu): run_in_parallel uses multi-thread to run the commands,
3218
3382
  # which can cause the program waiting for all the threads to finish,
3219
3383
  # even if some of them raise exceptions. We should replace it with
3220
3384
  # multi-process.
3221
- subprocess_utils.run_in_parallel(_setup_node, range(num_nodes))
3385
+ rich_utils.stop_safe_status()
3386
+ subprocess_utils.run_in_parallel(_setup_node, list(range(num_nodes)))
3222
3387
 
3223
3388
  if detach_setup:
3224
3389
  # Only set this when setup needs to be run outside the self._setup()
3225
- # as part of a job (--detach-setup).
3390
+ # as part of a job (detach_setup, default).
3226
3391
  self._setup_cmd = setup_cmd
3392
+ logger.info(ux_utils.finishing_message('Setup detached.'))
3227
3393
  return
3228
- logger.info(f'{fore.GREEN}Setup completed.{style.RESET_ALL}')
3229
3394
  end = time.time()
3230
3395
  logger.debug(f'Setup took {end - start} seconds.')
3396
+ setup_log_path = os.path.join(self.log_dir, 'setup-*.log')
3397
+ logger.info(
3398
+ ux_utils.finishing_message('Setup completed.', setup_log_path))
3231
3399
 
3232
3400
  def _exec_code_on_head(
3233
3401
  self,
@@ -3238,9 +3406,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3238
3406
  managed_job_dag: Optional['dag.Dag'] = None,
3239
3407
  ) -> None:
3240
3408
  """Executes generated code on the head node."""
3241
- style = colorama.Style
3242
- fore = colorama.Fore
3243
-
3244
3409
  script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
3245
3410
  remote_log_dir = self.log_dir
3246
3411
  remote_log_path = os.path.join(remote_log_dir, 'run.log')
@@ -3252,17 +3417,18 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3252
3417
  encoded_script = shlex.quote(codegen)
3253
3418
  create_script_code = (f'{{ echo {encoded_script} > {script_path}; }}')
3254
3419
  job_submit_cmd = (
3255
- f'RAY_DASHBOARD_PORT=$({constants.SKY_PYTHON_CMD} -c "from sky.skylet import job_lib; print(job_lib.get_job_submission_port())" 2> /dev/null || echo 8265);' # pylint: disable=line-too-long
3256
- f'{cd} && {constants.SKY_RAY_CMD} job submit '
3257
- '--address=http://127.0.0.1:$RAY_DASHBOARD_PORT '
3258
- f'--submission-id {job_id}-$(whoami) --no-wait '
3259
- # Redirect stderr to /dev/null to avoid distracting error from ray.
3260
- f'"{constants.SKY_PYTHON_CMD} -u {script_path} > {remote_log_path} 2> /dev/null"'
3261
- )
3420
+ # JOB_CMD_IDENTIFIER is used to verify that the process retrieved
3421
+ # by pid is the same driver process.
3422
+ f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
3423
+ f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
3424
+ # Do not use &>, which is not POSIX and may not work.
3425
+ # Note that the order of ">filename 2>&1" matters.
3426
+ f'> {remote_log_path} 2>&1')
3262
3427
 
3263
3428
  code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
3264
3429
  job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
3265
- if len(job_submit_cmd) > _MAX_INLINE_SCRIPT_LENGTH:
3430
+
3431
+ def _dump_code_to_file(codegen: str) -> None:
3266
3432
  runners = handle.get_command_runners()
3267
3433
  head_runner = runners[0]
3268
3434
  with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
@@ -3277,6 +3443,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3277
3443
  target=script_path,
3278
3444
  up=True,
3279
3445
  stream_logs=False)
3446
+
3447
+ if _is_command_length_over_limit(job_submit_cmd):
3448
+ _dump_code_to_file(codegen)
3280
3449
  job_submit_cmd = f'{mkdir_code} && {code}'
3281
3450
 
3282
3451
  if managed_job_dag is not None:
@@ -3285,90 +3454,72 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3285
3454
  managed_job_code = managed_job_codegen.set_pending(
3286
3455
  job_id, managed_job_dag)
3287
3456
  # Set the managed job to PENDING state to make sure that this
3288
- # managed job appears in the `sky jobs queue`, when there are
3289
- # already 2x vCPU controller processes running on the controller VM,
3290
- # e.g., 16 controller processes running on a controller with 8
3291
- # vCPUs.
3292
- # The managed job should be set to PENDING state *after* the
3293
- # controller process job has been queued, as our skylet on spot
3294
- # controller will set the managed job in FAILED state if the
3295
- # controller process job does not exist.
3296
- # We cannot set the managed job to PENDING state in the codegen for
3297
- # the controller process job, as it will stay in the job pending
3298
- # table and not be executed until there is an empty slot.
3457
+ # managed job appears in the `sky jobs queue`, even if it needs to
3458
+ # wait to be submitted.
3459
+ # We cannot set the managed job to PENDING state in the job template
3460
+ # (jobs-controller.yaml.j2), as it may need to wait for the run
3461
+ # commands to be scheduled on the job controller in high-load cases.
3299
3462
  job_submit_cmd = job_submit_cmd + ' && ' + managed_job_code
3300
3463
 
3301
3464
  returncode, stdout, stderr = self.run_on_head(handle,
3302
3465
  job_submit_cmd,
3303
3466
  stream_logs=False,
3304
3467
  require_outputs=True)
3305
-
3306
- # Happens when someone calls `sky exec` but remote is outdated
3307
- # necessitating calling `sky launch`.
3308
- backend_utils.check_stale_runtime_on_remote(returncode, stdout,
3468
+ # Happens when someone calls `sky exec` but remote is outdated for
3469
+ # running a job, necessitating a call to `sky launch`.
3470
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3309
3471
  handle.cluster_name)
3472
+ if returncode == 255 and 'too long' in stdout + stderr:
3473
+ # If the generated script is too long, we retry it with dumping
3474
+ # the script to a file and running it with SSH. We use a general
3475
+ # length limit check before but it could be inaccurate on some
3476
+ # systems.
3477
+ logger.debug('Failed to submit job due to command length limit. '
3478
+ 'Dumping job to file and running it with SSH.')
3479
+ _dump_code_to_file(codegen)
3480
+ job_submit_cmd = f'{mkdir_code} && {code}'
3481
+ returncode, stdout, stderr = self.run_on_head(handle,
3482
+ job_submit_cmd,
3483
+ stream_logs=False,
3484
+ require_outputs=True)
3485
+
3310
3486
  subprocess_utils.handle_returncode(returncode,
3311
3487
  job_submit_cmd,
3312
3488
  f'Failed to submit job {job_id}.',
3313
3489
  stderr=stdout + stderr)
3314
3490
 
3315
- logger.info('Job submitted with Job ID: '
3316
- f'{style.BRIGHT}{job_id}{style.RESET_ALL}')
3317
-
3318
- try:
3319
- if not detach_run:
3320
- if (handle.cluster_name in controller_utils.Controllers.
3321
- JOBS_CONTROLLER.value.candidate_cluster_names):
3322
- self.tail_managed_job_logs(handle, job_id)
3323
- else:
3324
- # Sky logs. Not using subprocess.run since it will make the
3325
- # ssh keep connected after ctrl-c.
3326
- self.tail_logs(handle, job_id)
3327
- finally:
3328
- name = handle.cluster_name
3329
- controller = controller_utils.Controllers.from_name(name)
3330
- if controller == controller_utils.Controllers.JOBS_CONTROLLER:
3331
- logger.info(
3332
- f'{fore.CYAN}Managed Job ID: '
3333
- f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
3334
- '\nTo cancel the job:\t\t'
3335
- f'{backend_utils.BOLD}sky jobs cancel {job_id}'
3336
- f'{backend_utils.RESET_BOLD}'
3337
- '\nTo stream job logs:\t\t'
3338
- f'{backend_utils.BOLD}sky jobs logs {job_id}'
3339
- f'{backend_utils.RESET_BOLD}'
3340
- f'\nTo stream controller logs:\t'
3341
- f'{backend_utils.BOLD}sky jobs logs --controller {job_id}'
3342
- f'{backend_utils.RESET_BOLD}'
3343
- '\nTo view all managed jobs:\t'
3344
- f'{backend_utils.BOLD}sky jobs queue'
3345
- f'{backend_utils.RESET_BOLD}'
3346
- '\nTo view managed job dashboard:\t'
3347
- f'{backend_utils.BOLD}sky jobs dashboard'
3348
- f'{backend_utils.RESET_BOLD}')
3349
- elif controller is None:
3350
- logger.info(f'{fore.CYAN}Job ID: '
3351
- f'{style.BRIGHT}{job_id}{style.RESET_ALL}'
3352
- '\nTo cancel the job:\t'
3353
- f'{backend_utils.BOLD}sky cancel {name} {job_id}'
3354
- f'{backend_utils.RESET_BOLD}'
3355
- '\nTo stream job logs:\t'
3356
- f'{backend_utils.BOLD}sky logs {name} {job_id}'
3357
- f'{backend_utils.RESET_BOLD}'
3358
- '\nTo view the job queue:\t'
3359
- f'{backend_utils.BOLD}sky queue {name}'
3360
- f'{backend_utils.RESET_BOLD}')
3491
+ controller = controller_utils.Controllers.from_name(handle.cluster_name)
3492
+ if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
3493
+ logger.info(ux_utils.starting_message('Service registered.'))
3494
+ else:
3495
+ logger.info(
3496
+ ux_utils.starting_message(f'Job submitted, ID: {job_id}'))
3497
+ rich_utils.stop_safe_status()
3498
+ if not detach_run:
3499
+ if (handle.cluster_name == controller_utils.Controllers.
3500
+ JOBS_CONTROLLER.value.cluster_name):
3501
+ self.tail_managed_job_logs(handle, job_id)
3502
+ else:
3503
+ # Sky logs. Not using subprocess.run since it will make the
3504
+ # ssh keep connected after ctrl-c.
3505
+ self.tail_logs(handle, job_id)
3361
3506
 
3362
3507
  def _add_job(self, handle: CloudVmRayResourceHandle,
3363
3508
  job_name: Optional[str], resources_str: str) -> int:
3364
- username = getpass.getuser()
3365
- code = job_lib.JobLibCodeGen.add_job(job_name, username,
3366
- self.run_timestamp, resources_str)
3509
+ code = job_lib.JobLibCodeGen.add_job(
3510
+ job_name=job_name,
3511
+ username=common_utils.get_user_hash(),
3512
+ run_timestamp=self.run_timestamp,
3513
+ resources_str=resources_str)
3367
3514
  returncode, job_id_str, stderr = self.run_on_head(handle,
3368
3515
  code,
3369
3516
  stream_logs=False,
3370
3517
  require_outputs=True,
3371
3518
  separate_stderr=True)
3519
+ # Happens when someone calls `sky exec` but remote is outdated for
3520
+ # adding a job, necessitating a call to `sky launch`.
3521
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3522
+ handle.cluster_name)
3372
3523
  # TODO(zhwu): this sometimes will unexpectedly fail, we can add
3373
3524
  # retry for this, after we figure out the reason.
3374
3525
  subprocess_utils.handle_returncode(returncode, code,
@@ -3398,15 +3549,31 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3398
3549
  Returns:
3399
3550
  Job id if the task is submitted to the cluster, None otherwise.
3400
3551
  """
3401
- if task.run is None:
3552
+ if task.run is None and self._setup_cmd is None:
3553
+ # This message is fine without mentioning setup, as there are two
3554
+ # cases when run section is empty:
3555
+ # 1. setup specified: setup is executed in detached mode and this
3556
+ # message will not be shown.
3557
+ # 2. no setup specified: this message is fine as a user is likely
3558
+ # creating a cluster only, and ok with the empty run command.
3402
3559
  logger.info('Run commands not specified or empty.')
3403
3560
  return None
3404
- # Check the task resources vs the cluster resources. Since `sky exec`
3405
- # will not run the provision and _check_existing_cluster
3406
- # We need to check ports here since sky.exec shouldn't change resources
3407
- valid_resource = self.check_resources_fit_cluster(handle,
3408
- task,
3409
- check_ports=True)
3561
+ if task.run is None:
3562
+ # If the task has no run command, we still need to execute the
3563
+ # generated ray driver program to run the setup command in detached
3564
+ # mode.
3565
+ # In this case, we reset the resources for the task, so that the
3566
+ # detached setup does not need to wait for the task resources to be
3567
+ # ready (which is not used for setup anyway).
3568
+ valid_resource = sky.Resources()
3569
+ else:
3570
+ # Check the task resources vs the cluster resources. Since
3571
+ # `sky exec` will not run the provision and _check_existing_cluster
3572
+ # We need to check ports here since sky.exec shouldn't change
3573
+ # resources.
3574
+ valid_resource = self.check_resources_fit_cluster(handle,
3575
+ task,
3576
+ check_ports=True)
3410
3577
  task_copy = copy.copy(task)
3411
3578
  # Handle multiple resources exec case.
3412
3579
  task_copy.set_resources(valid_resource)
@@ -3434,30 +3601,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3434
3601
 
3435
3602
  def _post_execute(self, handle: CloudVmRayResourceHandle,
3436
3603
  down: bool) -> None:
3437
- fore = colorama.Fore
3438
- style = colorama.Style
3439
- name = handle.cluster_name
3440
- controller = controller_utils.Controllers.from_name(name)
3441
- if controller is not None or down:
3442
- return
3443
- stop_str = ('\nTo stop the cluster:'
3444
- f'\t{backend_utils.BOLD}sky stop {name}'
3445
- f'{backend_utils.RESET_BOLD}')
3446
- logger.info(f'\n{fore.CYAN}Cluster name: '
3447
- f'{style.BRIGHT}{name}{style.RESET_ALL}'
3448
- '\nTo log into the head VM:\t'
3449
- f'{backend_utils.BOLD}ssh {name}'
3450
- f'{backend_utils.RESET_BOLD}'
3451
- '\nTo submit a job:'
3452
- f'\t\t{backend_utils.BOLD}sky exec {name} yaml_file'
3453
- f'{backend_utils.RESET_BOLD}'
3454
- f'{stop_str}'
3455
- '\nTo teardown the cluster:'
3456
- f'\t{backend_utils.BOLD}sky down {name}'
3457
- f'{backend_utils.RESET_BOLD}')
3458
- if (gcp_utils.is_tpu(handle.launched_resources) and
3459
- not gcp_utils.is_tpu_vm(handle.launched_resources)):
3460
- logger.info('Tip: `sky down` will delete launched TPU(s) too.')
3604
+ """Post-execute cleanup."""
3605
+ del handle, down # Unused.
3606
+ # All logic is handled in previous stages, no-op.
3461
3607
 
3462
3608
  def _teardown_ephemeral_storage(self, task: task_lib.Task) -> None:
3463
3609
  storage_mounts = task.storage_mounts
@@ -3505,33 +3651,47 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3505
3651
  is_identity_mismatch_and_purge = True
3506
3652
  else:
3507
3653
  raise
3508
-
3509
3654
  lock_path = os.path.expanduser(
3510
3655
  backend_utils.CLUSTER_STATUS_LOCK_PATH.format(cluster_name))
3511
-
3512
- try:
3513
- # TODO(mraheja): remove pylint disabling when filelock
3514
- # version updated
3515
- # pylint: disable=abstract-class-instantiated
3516
- with filelock.FileLock(
3517
- lock_path,
3518
- backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
3519
- self.teardown_no_lock(
3520
- handle,
3521
- terminate,
3522
- purge,
3523
- # When --purge is set and we already see an ID mismatch
3524
- # error, we skip the refresh codepath. This is because
3525
- # refresh checks current user identity can throw
3526
- # ClusterOwnerIdentityMismatchError. The argument/flag
3527
- # `purge` should bypass such ID mismatch errors.
3528
- refresh_cluster_status=not is_identity_mismatch_and_purge)
3529
- if terminate:
3530
- common_utils.remove_file_if_exists(lock_path)
3531
- except filelock.Timeout as e:
3532
- raise RuntimeError(
3533
- f'Cluster {cluster_name!r} is locked by {lock_path}. '
3534
- 'Check to see if it is still being launched') from e
3656
+ # Retry in case new cluster operation comes in and holds the lock
3657
+ # right after the lock is removed.
3658
+ n_attempts = 2
3659
+ while True:
3660
+ n_attempts -= 1
3661
+ # In case other running cluster operations are still holding the
3662
+ # lock.
3663
+ common_utils.remove_file_if_exists(lock_path)
3664
+ # We have to kill the cluster requests, because `down` and `stop`
3665
+ # should be higher priority than the cluster requests, and we should
3666
+ # release the lock from other requests.
3667
+ exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
3668
+ requests_lib.kill_cluster_requests(handle.cluster_name,
3669
+ exclude_request_to_kill)
3670
+ try:
3671
+ with filelock.FileLock(
3672
+ lock_path,
3673
+ backend_utils.CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS):
3674
+ self.teardown_no_lock(
3675
+ handle,
3676
+ terminate,
3677
+ purge,
3678
+ # When --purge is set and we already see an ID mismatch
3679
+ # error, we skip the refresh codepath. This is because
3680
+ # refresh checks current user identity can throw
3681
+ # ClusterOwnerIdentityMismatchError. The argument/flag
3682
+ # `purge` should bypass such ID mismatch errors.
3683
+ refresh_cluster_status=(
3684
+ not is_identity_mismatch_and_purge))
3685
+ if terminate:
3686
+ common_utils.remove_file_if_exists(lock_path)
3687
+ break
3688
+ except filelock.Timeout as e:
3689
+ logger.debug(f'Failed to acquire lock for {cluster_name}, '
3690
+ f'retrying...')
3691
+ if n_attempts <= 0:
3692
+ raise RuntimeError(
3693
+ f'Cluster {cluster_name!r} is locked by {lock_path}. '
3694
+ 'Check to see if it is still being launched') from e
3535
3695
 
3536
3696
  # --- CloudVMRayBackend Specific APIs ---
3537
3697
 
@@ -3555,24 +3715,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3555
3715
  def cancel_jobs(self,
3556
3716
  handle: CloudVmRayResourceHandle,
3557
3717
  jobs: Optional[List[int]],
3558
- cancel_all: bool = False) -> None:
3718
+ cancel_all: bool = False,
3719
+ user_hash: Optional[str] = None) -> None:
3559
3720
  """Cancels jobs.
3560
3721
 
3561
- CloudVMRayBackend specific method.
3562
-
3563
- Args:
3564
- handle: The cluster handle.
3565
- jobs: Job IDs to cancel. (See `cancel_all` for special semantics.)
3566
- cancel_all: Whether to cancel all jobs. If True, asserts `jobs` is
3567
- set to None. If False and `jobs` is None, cancel the latest
3568
- running job.
3722
+ See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
3569
3723
  """
3570
- if cancel_all:
3571
- assert jobs is None, (
3572
- 'If cancel_all=True, usage is to set jobs=None')
3573
- code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all)
3574
-
3575
- # All error messages should have been redirected to stdout.
3724
+ code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all, user_hash)
3576
3725
  returncode, stdout, _ = self.run_on_head(handle,
3577
3726
  code,
3578
3727
  stream_logs=False,
@@ -3581,13 +3730,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3581
3730
  returncode, code,
3582
3731
  f'Failed to cancel jobs on cluster {handle.cluster_name}.', stdout)
3583
3732
 
3584
- cancelled_ids = common_utils.decode_payload(stdout)
3733
+ cancelled_ids = message_utils.decode_payload(stdout)
3585
3734
  if cancelled_ids:
3586
3735
  logger.info(
3587
3736
  f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
3588
3737
  else:
3589
- logger.info(
3590
- 'No jobs cancelled. They may already be in terminal states.')
3738
+ logger.info('No jobs cancelled. They may be in terminal states.')
3591
3739
 
3592
3740
  def sync_down_logs(
3593
3741
  self,
@@ -3608,7 +3756,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3608
3756
  separate_stderr=True)
3609
3757
  subprocess_utils.handle_returncode(returncode, code,
3610
3758
  'Failed to sync logs.', stderr)
3611
- run_timestamps = common_utils.decode_payload(run_timestamps)
3759
+ run_timestamps = message_utils.decode_payload(run_timestamps)
3612
3760
  if not run_timestamps:
3613
3761
  logger.info(f'{colorama.Fore.YELLOW}'
3614
3762
  'No matching log directories found'
@@ -3622,16 +3770,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3622
3770
  for run_timestamp in run_timestamps
3623
3771
  ]
3624
3772
  local_log_dirs = [
3625
- os.path.expanduser(os.path.join(local_dir, run_timestamp))
3773
+ os.path.join(local_dir, run_timestamp)
3626
3774
  for run_timestamp in run_timestamps
3627
3775
  ]
3628
3776
 
3629
- style = colorama.Style
3630
- fore = colorama.Fore
3631
- for job_id, log_dir in zip(job_ids, local_log_dirs):
3632
- logger.info(f'{fore.CYAN}Job {job_id} logs: {log_dir}'
3633
- f'{style.RESET_ALL}')
3634
-
3635
3777
  runners = handle.get_command_runners()
3636
3778
 
3637
3779
  def _rsync_down(args) -> None:
@@ -3642,10 +3784,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3642
3784
  """
3643
3785
  (runner, local_log_dir, remote_log_dir) = args
3644
3786
  try:
3645
- os.makedirs(local_log_dir, exist_ok=True)
3787
+ os.makedirs(os.path.expanduser(local_log_dir), exist_ok=True)
3646
3788
  runner.rsync(
3647
- source=f'{remote_log_dir}/*',
3648
- target=local_log_dir,
3789
+ # Require a `/` at the end to make sure the parent dir
3790
+ # is not created locally. We do not add an additional '*' as
3791
+ # kubernetes's rsync does not work with an ending '*'.
3792
+ source=f'{remote_log_dir}/',
3793
+ target=os.path.expanduser(local_log_dir),
3649
3794
  up=False,
3650
3795
  stream_logs=False,
3651
3796
  )
@@ -3653,7 +3798,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3653
3798
  if e.returncode == exceptions.RSYNC_FILE_NOT_FOUND_CODE:
3654
3799
  # Raised by rsync_down. Remote log dir may not exist, since
3655
3800
  # the job can be run on some part of the nodes.
3656
- logger.debug(f'{runner.ip} does not have the tasks/*.')
3801
+ logger.debug(f'{runner.node_id} does not have the tasks/*.')
3657
3802
  else:
3658
3803
  raise
3659
3804
 
@@ -3667,7 +3812,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3667
3812
  handle: CloudVmRayResourceHandle,
3668
3813
  job_id: Optional[int],
3669
3814
  managed_job_id: Optional[int] = None,
3670
- follow: bool = True) -> int:
3815
+ follow: bool = True,
3816
+ tail: int = 0) -> int:
3671
3817
  """Tail the logs of a job.
3672
3818
 
3673
3819
  Args:
@@ -3675,10 +3821,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3675
3821
  job_id: The job ID to tail the logs of.
3676
3822
  managed_job_id: The managed job ID for display purpose only.
3677
3823
  follow: Whether to follow the logs.
3824
+ tail: The number of lines to display from the end of the
3825
+ log file. If 0, print all lines.
3678
3826
  """
3679
3827
  code = job_lib.JobLibCodeGen.tail_logs(job_id,
3680
3828
  managed_job_id=managed_job_id,
3681
- follow=follow)
3829
+ follow=follow,
3830
+ tail=tail)
3682
3831
  if job_id is None and managed_job_id is None:
3683
3832
  logger.info(
3684
3833
  'Job ID not provided. Streaming the logs of the latest job.')
@@ -3697,10 +3846,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3697
3846
  # Allocate a pseudo-terminal to disable output buffering.
3698
3847
  # Otherwise, there may be 5 minutes delay in logging.
3699
3848
  ssh_mode=command_runner.SshMode.INTERACTIVE,
3700
- # Disable stdin to avoid ray outputs mess up the terminal with
3701
- # misaligned output in multithreading/multiprocessing.
3702
- # Refer to: https://github.com/ray-project/ray/blob/d462172be7c5779abf37609aed08af112a533e1e/python/ray/autoscaler/_private/subprocess_output_util.py#L264 # pylint: disable=line-too-long
3703
- stdin=subprocess.DEVNULL,
3704
3849
  )
3705
3850
  except SystemExit as e:
3706
3851
  returncode = e.code
@@ -3730,52 +3875,169 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3730
3875
  stream_logs=True,
3731
3876
  process_stream=False,
3732
3877
  ssh_mode=command_runner.SshMode.INTERACTIVE,
3733
- stdin=subprocess.DEVNULL,
3734
3878
  )
3735
3879
 
3736
- def tail_serve_logs(self, handle: CloudVmRayResourceHandle,
3737
- service_name: str, target: serve_lib.ServiceComponent,
3738
- replica_id: Optional[int], follow: bool) -> None:
3739
- """Tail the logs of a service.
3880
+ def sync_down_managed_job_logs(
3881
+ self,
3882
+ handle: CloudVmRayResourceHandle,
3883
+ job_id: Optional[int] = None,
3884
+ job_name: Optional[str] = None,
3885
+ controller: bool = False,
3886
+ local_dir: str = constants.SKY_LOGS_DIRECTORY) -> Dict[str, str]:
3887
+ """Sync down logs for a managed job.
3740
3888
 
3741
3889
  Args:
3742
- handle: The handle to the sky serve controller.
3743
- service_name: The name of the service.
3744
- target: The component to tail the logs of. Could be controller,
3745
- load balancer, or replica.
3746
- replica_id: The replica ID to tail the logs of. Only used when
3747
- target is replica.
3748
- follow: Whether to follow the logs.
3749
- """
3750
- if target != serve_lib.ServiceComponent.REPLICA:
3751
- code = serve_lib.ServeCodeGen.stream_serve_process_logs(
3752
- service_name,
3753
- stream_controller=(
3754
- target == serve_lib.ServiceComponent.CONTROLLER),
3755
- follow=follow)
3756
- else:
3757
- assert replica_id is not None, service_name
3758
- code = serve_lib.ServeCodeGen.stream_replica_logs(
3759
- service_name, replica_id, follow)
3890
+ handle: The handle to the cluster.
3891
+ job_id: The job ID to sync down logs for.
3892
+ job_name: The job name to sync down logs for.
3893
+ controller: Whether to sync down logs for the controller.
3894
+ local_dir: The local directory to sync down logs to.
3760
3895
 
3761
- signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
3762
- signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
3896
+ Returns:
3897
+ A dictionary mapping job_id to log path.
3898
+ """
3899
+ # job_name and job_id should not both be specified
3900
+ assert job_name is None or job_id is None, (job_name, job_id)
3763
3901
 
3764
- self.run_on_head(
3902
+ if job_id is None:
3903
+ # generate code to get the job_id
3904
+ # if job_name is None, get all job_ids
3905
+ # TODO: Only get the latest job_id, since that's the only one we use
3906
+ code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
3907
+ job_name=job_name)
3908
+ returncode, job_ids, stderr = self.run_on_head(handle,
3909
+ code,
3910
+ stream_logs=False,
3911
+ require_outputs=True,
3912
+ separate_stderr=True)
3913
+ subprocess_utils.handle_returncode(returncode, code,
3914
+ 'Failed to sync down logs.',
3915
+ stderr)
3916
+ job_ids = message_utils.decode_payload(job_ids)
3917
+ if not job_ids:
3918
+ logger.info(f'{colorama.Fore.YELLOW}'
3919
+ 'No matching job found'
3920
+ f'{colorama.Style.RESET_ALL}')
3921
+ return {}
3922
+ elif len(job_ids) > 1:
3923
+ name_str = ''
3924
+ if job_name is not None:
3925
+ name_str = ('Multiple jobs IDs found under the name '
3926
+ f'{job_name}. ')
3927
+ controller_str = ' (controller)' if controller else ''
3928
+ logger.info(f'{colorama.Fore.YELLOW}'
3929
+ f'{name_str}'
3930
+ f'Downloading the latest job logs{controller_str}.'
3931
+ f'{colorama.Style.RESET_ALL}')
3932
+ # list should already be in descending order
3933
+ job_id = job_ids[0]
3934
+
3935
+ # get the run_timestamp
3936
+ # the function takes in [job_id]
3937
+ code = job_lib.JobLibCodeGen.get_run_timestamp_with_globbing(
3938
+ [str(job_id)])
3939
+ returncode, run_timestamps, stderr = self.run_on_head(
3765
3940
  handle,
3766
3941
  code,
3767
- stream_logs=True,
3768
- process_stream=False,
3769
- ssh_mode=command_runner.SshMode.INTERACTIVE,
3770
- stdin=subprocess.DEVNULL,
3771
- )
3942
+ stream_logs=False,
3943
+ require_outputs=True,
3944
+ separate_stderr=True)
3945
+ subprocess_utils.handle_returncode(returncode, code,
3946
+ 'Failed to sync logs.', stderr)
3947
+ # returns with a dict of {job_id: run_timestamp}
3948
+ run_timestamps = message_utils.decode_payload(run_timestamps)
3949
+ if not run_timestamps:
3950
+ logger.info(f'{colorama.Fore.YELLOW}'
3951
+ 'No matching log directories found'
3952
+ f'{colorama.Style.RESET_ALL}')
3953
+ return {}
3954
+
3955
+ run_timestamp = list(run_timestamps.values())[0]
3956
+ job_id = list(run_timestamps.keys())[0]
3957
+ local_log_dir = ''
3958
+ if controller: # download controller logs
3959
+ remote_log = os.path.join(managed_jobs.JOBS_CONTROLLER_LOGS_DIR,
3960
+ f'{job_id}.log')
3961
+ local_log_dir = os.path.join(local_dir, run_timestamp)
3962
+ os.makedirs(os.path.dirname(os.path.expanduser(local_log_dir)),
3963
+ exist_ok=True)
3964
+
3965
+ logger.debug(f'{colorama.Fore.CYAN}'
3966
+ f'Job {job_id} local logs: {local_log_dir}'
3967
+ f'{colorama.Style.RESET_ALL}')
3968
+
3969
+ runners = handle.get_command_runners()
3970
+
3971
+ def _rsync_down(args) -> None:
3972
+ """Rsync down logs from remote nodes.
3973
+
3974
+ Args:
3975
+ args: A tuple of (runner, local_log_dir, remote_log_dir)
3976
+ """
3977
+ (runner, local_log_dir, remote_log) = args
3978
+ try:
3979
+ os.makedirs(os.path.expanduser(local_log_dir),
3980
+ exist_ok=True)
3981
+ runner.rsync(
3982
+ source=remote_log,
3983
+ target=f'{local_log_dir}/controller.log',
3984
+ up=False,
3985
+ stream_logs=False,
3986
+ )
3987
+ except exceptions.CommandError as e:
3988
+ if e.returncode == exceptions.RSYNC_FILE_NOT_FOUND_CODE:
3989
+ # Raised by rsync_down. Remote log dir may not exist
3990
+ # since the job can be run on some part of the nodes.
3991
+ logger.debug(
3992
+ f'{runner.node_id} does not have the tasks/*.')
3993
+ else:
3994
+ raise
3995
+
3996
+ parallel_args = [
3997
+ (runner, local_log_dir, remote_log) for runner in runners
3998
+ ]
3999
+ subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
4000
+ else: # download job logs
4001
+ local_log_dir = os.path.join(local_dir, 'managed_jobs',
4002
+ run_timestamp)
4003
+ os.makedirs(os.path.dirname(os.path.expanduser(local_log_dir)),
4004
+ exist_ok=True)
4005
+ log_file = os.path.join(local_log_dir, 'run.log')
4006
+
4007
+ code = managed_jobs.ManagedJobCodeGen.stream_logs(job_name=None,
4008
+ job_id=job_id,
4009
+ follow=False,
4010
+ controller=False)
4011
+
4012
+ # With the stdin=subprocess.DEVNULL, the ctrl-c will not
4013
+ # kill the process, so we need to handle it manually here.
4014
+ if threading.current_thread() is threading.main_thread():
4015
+ signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
4016
+ signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
4017
+
4018
+ # We redirect the output to the log file
4019
+ # and disable the STDOUT and STDERR
4020
+ self.run_on_head(
4021
+ handle,
4022
+ code,
4023
+ log_path=os.path.expanduser(log_file),
4024
+ stream_logs=False,
4025
+ process_stream=False,
4026
+ ssh_mode=command_runner.SshMode.INTERACTIVE,
4027
+ )
4028
+
4029
+ logger.debug(f'{colorama.Fore.CYAN}'
4030
+ f'Job {job_id} logs: {local_log_dir}'
4031
+ f'{colorama.Style.RESET_ALL}')
4032
+ return {str(job_id): local_log_dir}
3772
4033
 
3773
4034
  def teardown_no_lock(self,
3774
4035
  handle: CloudVmRayResourceHandle,
3775
4036
  terminate: bool,
3776
4037
  purge: bool = False,
3777
4038
  post_teardown_cleanup: bool = True,
3778
- refresh_cluster_status: bool = True) -> None:
4039
+ refresh_cluster_status: bool = True,
4040
+ remove_from_db: bool = True) -> None:
3779
4041
  """Teardown the cluster without acquiring the cluster status lock.
3780
4042
 
3781
4043
  NOTE: This method should not be called without holding the cluster
@@ -3787,11 +4049,28 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3787
4049
  Raises:
3788
4050
  RuntimeError: If the cluster fails to be terminated/stopped.
3789
4051
  """
4052
+ exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
4053
+ # We have to kill the cluster requests again within the lock, because
4054
+ # any pending requests on the same cluster should be cancelled after
4055
+ # the cluster is terminated/stopped. Otherwise, it will be quite
4056
+ # confusing to see the cluster restarted immediately after it is
4057
+ # terminated/stopped, when there is a pending launch request.
4058
+ requests_lib.kill_cluster_requests(handle.cluster_name,
4059
+ exclude_request_to_kill)
4060
+ cluster_status_fetched = False
3790
4061
  if refresh_cluster_status:
3791
- prev_cluster_status, _ = (
3792
- backend_utils.refresh_cluster_status_handle(
3793
- handle.cluster_name, acquire_per_cluster_status_lock=False))
3794
- else:
4062
+ try:
4063
+ prev_cluster_status, _ = (
4064
+ backend_utils.refresh_cluster_status_handle(
4065
+ handle.cluster_name,
4066
+ acquire_per_cluster_status_lock=False))
4067
+ cluster_status_fetched = True
4068
+ except exceptions.ClusterStatusFetchingError:
4069
+ logger.warning(
4070
+ 'Failed to fetch cluster status for '
4071
+ f'{handle.cluster_name!r}. Assuming the cluster is still '
4072
+ 'up.')
4073
+ if not cluster_status_fetched:
3795
4074
  record = global_user_state.get_cluster_from_name(
3796
4075
  handle.cluster_name)
3797
4076
  prev_cluster_status = record[
@@ -3805,6 +4084,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3805
4084
  f'Cluster {handle.cluster_name!r} is already terminated. '
3806
4085
  'Skipped.')
3807
4086
  return
4087
+
4088
+ if handle.cluster_yaml is None:
4089
+ logger.warning(f'Cluster {handle.cluster_name!r} has no '
4090
+ f'provision yaml so it '
4091
+ 'has not been provisioned. Skipped.')
4092
+ global_user_state.remove_cluster(handle.cluster_name,
4093
+ terminate=terminate)
4094
+ return
3808
4095
  log_path = os.path.join(os.path.expanduser(self.log_dir),
3809
4096
  'teardown.log')
3810
4097
  log_abs_path = os.path.abspath(log_path)
@@ -3843,7 +4130,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3843
4130
 
3844
4131
  try:
3845
4132
  provisioner.teardown_cluster(repr(cloud),
3846
- provisioner.ClusterName(
4133
+ resources_utils.ClusterName(
3847
4134
  cluster_name,
3848
4135
  cluster_name_on_cloud),
3849
4136
  terminate=terminate,
@@ -3859,25 +4146,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3859
4146
  raise
3860
4147
 
3861
4148
  if post_teardown_cleanup:
3862
- self.post_teardown_cleanup(handle, terminate, purge)
4149
+ self.post_teardown_cleanup(handle, terminate, purge,
4150
+ remove_from_db)
3863
4151
  return
3864
4152
 
3865
- if terminate and isinstance(cloud, clouds.Azure):
3866
- # Here we handle termination of Azure by ourselves instead of Ray
3867
- # autoscaler.
3868
- resource_group = config['provider']['resource_group']
3869
- terminate_cmd = f'az group delete -y --name {resource_group}'
3870
- with rich_utils.safe_status(f'[bold cyan]Terminating '
3871
- f'[green]{cluster_name}'):
3872
- returncode, stdout, stderr = log_lib.run_with_log(
3873
- terminate_cmd,
3874
- log_abs_path,
3875
- shell=True,
3876
- stream_logs=False,
3877
- require_outputs=True)
3878
-
3879
- elif (isinstance(cloud, clouds.IBM) and terminate and
3880
- prev_cluster_status == status_lib.ClusterStatus.STOPPED):
4153
+ if (isinstance(cloud, clouds.IBM) and terminate and
4154
+ prev_cluster_status == status_lib.ClusterStatus.STOPPED):
3881
4155
  # pylint: disable= W0622 W0703 C0415
3882
4156
  from sky.adaptors import ibm
3883
4157
  from sky.skylet.providers.ibm.vpc_provider import IBMVPCProvider
@@ -3895,7 +4169,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3895
4169
  limit=1000).get_result()['items']
3896
4170
  vpc_id = None
3897
4171
  try:
3898
- # pylint: disable=line-too-long
3899
4172
  vpc_id = vpcs_filtered_by_tags_and_region[0]['crn'].rsplit(
3900
4173
  ':', 1)[-1]
3901
4174
  vpc_found = True
@@ -3904,7 +4177,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3904
4177
  returncode = -1
3905
4178
 
3906
4179
  if vpc_found:
3907
- # pylint: disable=line-too-long E1136
3908
4180
  # Delete VPC and it's associated resources
3909
4181
  vpc_provider = IBMVPCProvider(
3910
4182
  config_provider['resource_group_id'], region,
@@ -3936,25 +4208,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3936
4208
  stdout = ''
3937
4209
  stderr = str(e)
3938
4210
 
3939
- # Apr, 2023 by Hysun(hysun.he@oracle.com): Added support for OCI
3940
- # May, 2023 by Hysun: Allow terminate INIT cluster which may have
3941
- # some instances provisioning in background but not completed.
3942
- elif (isinstance(cloud, clouds.OCI) and terminate and
3943
- prev_cluster_status in (status_lib.ClusterStatus.STOPPED,
3944
- status_lib.ClusterStatus.INIT)):
3945
- region = config['provider']['region']
3946
-
3947
- # pylint: disable=import-outside-toplevel
3948
- from ray.autoscaler.tags import TAG_RAY_CLUSTER_NAME
3949
-
3950
- from sky.skylet.providers.oci.query_helper import oci_query_helper
3951
-
3952
- # 0: All terminated successfully, failed count otherwise
3953
- returncode = oci_query_helper.terminate_instances_by_tags(
3954
- {TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}, region)
3955
-
3956
- # To avoid undefined local variables error.
3957
- stdout = stderr = ''
3958
4211
  else:
3959
4212
  config['provider']['cache_stopped_nodes'] = not terminate
3960
4213
  with tempfile.NamedTemporaryFile('w',
@@ -3965,8 +4218,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3965
4218
  f.flush()
3966
4219
 
3967
4220
  teardown_verb = 'Terminating' if terminate else 'Stopping'
3968
- with rich_utils.safe_status(f'[bold cyan]{teardown_verb} '
3969
- f'[green]{cluster_name}'):
4221
+ with rich_utils.safe_status(
4222
+ ux_utils.spinner_message(
4223
+ f'{teardown_verb}: {cluster_name}', log_path)):
3970
4224
  # FIXME(zongheng): support retries. This call can fail for
3971
4225
  # example due to GCP returning list requests per limit
3972
4226
  # exceeded.
@@ -3995,14 +4249,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3995
4249
  # never launched and the errors are related to pre-launch
3996
4250
  # configurations (such as VPC not found). So it's safe & good UX
3997
4251
  # to not print a failure message.
3998
- #
3999
- # '(ResourceGroupNotFound)': this indicates the resource group on
4000
- # Azure is not found. That means the cluster is already deleted
4001
- # on the cloud. So it's safe & good UX to not print a failure
4002
- # message.
4003
4252
  elif ('TPU must be specified.' not in stderr and
4004
- 'SKYPILOT_ERROR_NO_NODES_LAUNCHED: ' not in stderr and
4005
- '(ResourceGroupNotFound)' not in stderr):
4253
+ 'SKYPILOT_ERROR_NO_NODES_LAUNCHED: ' not in stderr):
4006
4254
  raise RuntimeError(
4007
4255
  _TEARDOWN_FAILURE_MESSAGE.format(
4008
4256
  extra_reason='',
@@ -4020,7 +4268,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4020
4268
  def post_teardown_cleanup(self,
4021
4269
  handle: CloudVmRayResourceHandle,
4022
4270
  terminate: bool,
4023
- purge: bool = False) -> None:
4271
+ purge: bool = False,
4272
+ remove_from_db: bool = True) -> None:
4024
4273
  """Cleanup local configs/caches and delete TPUs after teardown.
4025
4274
 
4026
4275
  This method will handle the following cleanup steps:
@@ -4028,53 +4277,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4028
4277
  * Removing ssh configs for the cluster;
4029
4278
  * Updating the local state of the cluster;
4030
4279
  * Removing the terminated cluster's scripts and ray yaml files.
4031
-
4032
- Raises:
4033
- RuntimeError: If it fails to delete the TPU.
4034
4280
  """
4035
- log_path = os.path.join(os.path.expanduser(self.log_dir),
4036
- 'teardown.log')
4037
- log_abs_path = os.path.abspath(log_path)
4038
4281
  cluster_name_on_cloud = handle.cluster_name_on_cloud
4039
-
4040
- # Backward compatibility for TPU nodes created before #2943. Any TPU
4041
- # node launched before that PR have the delete script generated (and do
4042
- # not have the tpu_node config set in its cluster yaml), so we have to
4043
- # call the deletion script to clean up the TPU node.
4044
- # For TPU nodes launched after the PR, deletion is done in SkyPilot's
4045
- # new GCP provisioner API.
4046
- # TODO (zhwu): Remove this after 0.6.0.
4047
- if (handle.tpu_delete_script is not None and
4048
- os.path.exists(handle.tpu_delete_script)):
4049
- # Only call the deletion script if the cluster config does not
4050
- # contain TPU node config. Otherwise, the deletion should
4051
- # already be handled by the new provisioner.
4052
- config = common_utils.read_yaml(handle.cluster_yaml)
4053
- tpu_node_config = config['provider'].get('tpu_node')
4054
- if tpu_node_config is None:
4055
- with rich_utils.safe_status('[bold cyan]Terminating TPU...'):
4056
- tpu_rc, tpu_stdout, tpu_stderr = log_lib.run_with_log(
4057
- ['bash', handle.tpu_delete_script],
4058
- log_abs_path,
4059
- stream_logs=False,
4060
- require_outputs=True)
4061
- if tpu_rc != 0:
4062
- if _TPU_NOT_FOUND_ERROR in tpu_stderr:
4063
- logger.info('TPU not found. '
4064
- 'It should have been deleted already.')
4065
- elif purge:
4066
- logger.warning(
4067
- _TEARDOWN_PURGE_WARNING.format(
4068
- reason='stopping/terminating TPU',
4069
- details=tpu_stderr))
4070
- else:
4071
- raise RuntimeError(
4072
- _TEARDOWN_FAILURE_MESSAGE.format(
4073
- extra_reason='It is caused by TPU failure.',
4074
- cluster_name=common_utils.cluster_name_in_hint(
4075
- handle.cluster_name, cluster_name_on_cloud),
4076
- stdout=tpu_stdout,
4077
- stderr=tpu_stderr))
4282
+ cloud = handle.launched_resources.cloud
4078
4283
 
4079
4284
  if (terminate and handle.launched_resources.is_image_managed is True):
4080
4285
  # Delete the image when terminating a "cloned" cluster, i.e.,
@@ -4095,56 +4300,100 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4095
4300
  'remove it manually to avoid image leakage. Details: '
4096
4301
  f'{common_utils.format_exception(e, use_bracket=True)}')
4097
4302
  if terminate:
4098
- cloud = handle.launched_resources.cloud
4099
- config = common_utils.read_yaml(handle.cluster_yaml)
4100
- try:
4101
- cloud.check_features_are_supported(
4102
- handle.launched_resources,
4103
- {clouds.CloudImplementationFeatures.OPEN_PORTS})
4104
- provision_lib.cleanup_ports(repr(cloud), cluster_name_on_cloud,
4105
- handle.launched_resources.ports,
4106
- config['provider'])
4107
- except exceptions.NotSupportedError:
4108
- pass
4109
- except exceptions.PortDoesNotExistError:
4110
- logger.debug('Ports do not exist. Skipping cleanup.')
4111
- except Exception as e: # pylint: disable=broad-except
4112
- if purge:
4113
- logger.warning(
4114
- f'Failed to cleanup ports. Skipping since purge is '
4115
- f'set. Details: '
4116
- f'{common_utils.format_exception(e, use_bracket=True)}')
4303
+ # This function could be directly called from status refresh,
4304
+ # where we need to cleanup the cluster profile.
4305
+ metadata_utils.remove_cluster_metadata(handle.cluster_name)
4306
+ # The cluster yaml does not exist when skypilot has not found
4307
+ # the right resource to provision the cluster.
4308
+ if handle.cluster_yaml is not None:
4309
+ try:
4310
+ cloud = handle.launched_resources.cloud
4311
+ config = common_utils.read_yaml(handle.cluster_yaml)
4312
+ cloud.check_features_are_supported(
4313
+ handle.launched_resources,
4314
+ {clouds.CloudImplementationFeatures.OPEN_PORTS})
4315
+ provision_lib.cleanup_ports(repr(cloud),
4316
+ cluster_name_on_cloud,
4317
+ handle.launched_resources.ports,
4318
+ config['provider'])
4319
+ self.remove_cluster_config(handle)
4320
+ except exceptions.NotSupportedError:
4321
+ pass
4322
+ except exceptions.PortDoesNotExistError:
4323
+ logger.debug('Ports do not exist. Skipping cleanup.')
4324
+ except Exception as e: # pylint: disable=broad-except
4325
+ if purge:
4326
+ msg = common_utils.format_exception(e, use_bracket=True)
4327
+ logger.warning(
4328
+ f'Failed to cleanup ports. Skipping since purge is '
4329
+ f'set. Details: {msg}')
4330
+ else:
4331
+ raise
4332
+
4333
+ sky.utils.cluster_utils.SSHConfigHelper.remove_cluster(
4334
+ handle.cluster_name)
4335
+
4336
+ def _detect_abnormal_non_terminated_nodes(
4337
+ handle: CloudVmRayResourceHandle) -> None:
4338
+ # Confirm that instances have actually transitioned state before
4339
+ # updating the state database. We do this immediately before
4340
+ # removing the state from the database, so that we can guarantee
4341
+ # that this is always called before the state is removed. We
4342
+ # considered running this check as part of
4343
+ # provisioner.teardown_cluster or provision.terminate_instances, but
4344
+ # it would open the door to code paths that successfully call this
4345
+ # function but do not first call teardown_cluster or
4346
+ # terminate_instances. See
4347
+ # https://github.com/skypilot-org/skypilot/pull/4443#discussion_r1872798032
4348
+ attempts = 0
4349
+ while True:
4350
+ config = common_utils.read_yaml(handle.cluster_yaml)
4351
+
4352
+ logger.debug(f'instance statuses attempt {attempts + 1}')
4353
+ node_status_dict = provision_lib.query_instances(
4354
+ repr(cloud),
4355
+ cluster_name_on_cloud,
4356
+ config['provider'],
4357
+ non_terminated_only=False)
4358
+
4359
+ unexpected_node_state: Optional[Tuple[str, str]] = None
4360
+ for node_id, node_status in node_status_dict.items():
4361
+ logger.debug(f'{node_id} status: {node_status}')
4362
+ # FIXME(cooperc): Some clouds (e.g. GCP) do not distinguish
4363
+ # between "stopping/stopped" and "terminating/terminated",
4364
+ # so we allow for either status instead of casing on
4365
+ # `terminate`.
4366
+ if node_status not in [
4367
+ None, status_lib.ClusterStatus.STOPPED
4368
+ ]:
4369
+ unexpected_node_state = (node_id, node_status)
4370
+ break
4371
+
4372
+ if unexpected_node_state is None:
4373
+ break
4374
+
4375
+ attempts += 1
4376
+ if attempts < _TEARDOWN_WAIT_MAX_ATTEMPTS:
4377
+ time.sleep(_TEARDOWN_WAIT_BETWEEN_ATTEMPS_SECONDS)
4117
4378
  else:
4118
- raise
4379
+ (node_id, node_status) = unexpected_node_state
4380
+ raise RuntimeError(f'Instance {node_id} in unexpected '
4381
+ f'state {node_status}.')
4119
4382
 
4120
- # The cluster file must exist because the cluster_yaml will only
4121
- # be removed after the cluster entry in the database is removed.
4122
- config = common_utils.read_yaml(handle.cluster_yaml)
4123
- auth_config = config['auth']
4124
- backend_utils.SSHConfigHelper.remove_cluster(handle.cluster_name,
4125
- handle.head_ip,
4126
- auth_config,
4127
- handle.docker_user)
4383
+ # If cluster_yaml is None, the cluster should ensured to be terminated,
4384
+ # so we don't need to do the double check.
4385
+ if handle.cluster_yaml is not None:
4386
+ _detect_abnormal_non_terminated_nodes(handle)
4128
4387
 
4129
- global_user_state.remove_cluster(handle.cluster_name,
4130
- terminate=terminate)
4388
+ if not terminate or remove_from_db:
4389
+ global_user_state.remove_cluster(handle.cluster_name,
4390
+ terminate=terminate)
4131
4391
 
4132
- if terminate:
4133
- # This function could be directly called from status refresh,
4134
- # where we need to cleanup the cluster profile.
4135
- metadata_utils.remove_cluster_metadata(handle.cluster_name)
4136
- # Clean up TPU creation/deletion scripts
4137
- # Backward compatibility for TPU nodes created before #2943.
4138
- # TODO (zhwu): Remove this after 0.6.0.
4139
- if handle.tpu_delete_script is not None:
4140
- assert handle.tpu_create_script is not None
4141
- common_utils.remove_file_if_exists(handle.tpu_create_script)
4142
- common_utils.remove_file_if_exists(handle.tpu_delete_script)
4143
-
4144
- # Clean up generated config
4145
- # No try-except is needed since Ray will fail to teardown the
4146
- # cluster if the cluster_yaml is missing.
4147
- common_utils.remove_file_if_exists(handle.cluster_yaml)
4392
+ def remove_cluster_config(self, handle: CloudVmRayResourceHandle) -> None:
4393
+ """Remove the YAML config of a cluster."""
4394
+ handle.cluster_yaml = None
4395
+ global_user_state.update_cluster_handle(handle.cluster_name, handle)
4396
+ common_utils.remove_file_if_exists(handle.cluster_yaml)
4148
4397
 
4149
4398
  def set_autostop(self,
4150
4399
  handle: CloudVmRayResourceHandle,
@@ -4154,16 +4403,27 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4154
4403
  # The core.autostop() function should have already checked that the
4155
4404
  # cloud and resources support requested autostop.
4156
4405
  if idle_minutes_to_autostop is not None:
4157
- # Skip auto-stop for Kubernetes clusters.
4158
- if (isinstance(handle.launched_resources.cloud, clouds.Kubernetes)
4159
- and not down and idle_minutes_to_autostop >= 0):
4406
+ # Skip auto-stop for Kubernetes and RunPod clusters.
4407
+ if (isinstance(handle.launched_resources.cloud,
4408
+ (clouds.Kubernetes, clouds.RunPod)) and not down and
4409
+ idle_minutes_to_autostop >= 0):
4160
4410
  # We should hit this code path only for the controllers on
4161
- # Kubernetes clusters.
4162
- assert (controller_utils.Controllers.from_name(
4163
- handle.cluster_name) is not None), handle.cluster_name
4164
- logger.info('Auto-stop is not supported for Kubernetes '
4165
- 'clusters. Skipping.')
4166
- return
4411
+ # Kubernetes and RunPod clusters.
4412
+ controller = controller_utils.Controllers.from_name(
4413
+ handle.cluster_name)
4414
+ assert (controller is not None), handle.cluster_name
4415
+ if (controller
4416
+ == controller_utils.Controllers.SKY_SERVE_CONTROLLER and
4417
+ isinstance(handle.launched_resources.cloud,
4418
+ clouds.Kubernetes)):
4419
+ # For SkyServe controllers on Kubernetes: override autostop
4420
+ # behavior to force autodown (instead of no-op)
4421
+ # to avoid dangling controllers.
4422
+ down = True
4423
+ else:
4424
+ logger.info('Auto-stop is not supported for Kubernetes '
4425
+ 'and RunPod clusters. Skipping.')
4426
+ return
4167
4427
 
4168
4428
  # Check if we're stopping spot
4169
4429
  assert (handle.launched_resources is not None and
@@ -4182,6 +4442,13 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4182
4442
  global_user_state.set_cluster_autostop_value(
4183
4443
  handle.cluster_name, idle_minutes_to_autostop, down)
4184
4444
 
4445
+ # Add/Remove autodown annotations to/from Kubernetes pods.
4446
+ if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
4447
+ kubernetes_utils.set_autodown_annotations(
4448
+ handle=handle,
4449
+ idle_minutes_to_autostop=idle_minutes_to_autostop,
4450
+ down=down)
4451
+
4185
4452
  def is_definitely_autostopping(self,
4186
4453
  handle: CloudVmRayResourceHandle,
4187
4454
  stream_logs: bool = True) -> bool:
@@ -4203,7 +4470,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4203
4470
  stream_logs=stream_logs)
4204
4471
 
4205
4472
  if returncode == 0:
4206
- return common_utils.decode_payload(stdout)
4473
+ return message_utils.decode_payload(stdout)
4207
4474
  logger.debug('Failed to check if cluster is autostopping with '
4208
4475
  f'{returncode}: {stdout+stderr}\n'
4209
4476
  f'Command: {code}')
@@ -4333,6 +4600,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4333
4600
  # cluster is terminated (through console or auto-dwon), the record will
4334
4601
  # become None and the cluster_ever_up should be considered as False.
4335
4602
  cluster_ever_up = record is not None and record['cluster_ever_up']
4603
+ prev_config_hash = record['config_hash'] if record is not None else None
4336
4604
  logger.debug(f'cluster_ever_up: {cluster_ever_up}')
4337
4605
  logger.debug(f'record: {record}')
4338
4606
 
@@ -4345,12 +4613,24 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4345
4613
  # Assume resources share the same ports.
4346
4614
  for resource in task.resources:
4347
4615
  assert resource.ports == list(task.resources)[0].ports
4348
- all_ports = resources_utils.port_set_to_ranges(
4349
- resources_utils.port_ranges_to_set(
4350
- handle.launched_resources.ports) |
4351
- resources_utils.port_ranges_to_set(
4352
- list(task.resources)[0].ports))
4616
+ requested_ports_set = resources_utils.port_ranges_to_set(
4617
+ list(task.resources)[0].ports)
4618
+ current_ports_set = resources_utils.port_ranges_to_set(
4619
+ handle.launched_resources.ports)
4620
+ all_ports = resources_utils.port_set_to_ranges(current_ports_set |
4621
+ requested_ports_set)
4353
4622
  to_provision = handle.launched_resources
4623
+ if (to_provision.cloud.OPEN_PORTS_VERSION <=
4624
+ clouds.OpenPortsVersion.LAUNCH_ONLY):
4625
+ if not requested_ports_set <= current_ports_set:
4626
+ current_cloud = to_provision.cloud
4627
+ with ux_utils.print_exception_no_traceback():
4628
+ raise exceptions.NotSupportedError(
4629
+ 'Failed to open new ports on an existing cluster '
4630
+ f'with the current cloud {current_cloud} as it only'
4631
+ ' supports opening ports on launch of the cluster. '
4632
+ 'Please terminate the existing cluster and launch '
4633
+ 'a new cluster with the desired ports open.')
4354
4634
  if all_ports:
4355
4635
  to_provision = to_provision.copy(ports=all_ports)
4356
4636
  return RetryingVmProvisioner.ToProvisionConfig(
@@ -4359,7 +4639,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4359
4639
  handle.launched_nodes,
4360
4640
  prev_cluster_status=prev_cluster_status,
4361
4641
  prev_handle=handle,
4362
- prev_cluster_ever_up=cluster_ever_up)
4642
+ prev_cluster_ever_up=cluster_ever_up,
4643
+ prev_config_hash=prev_config_hash)
4363
4644
  usage_lib.messages.usage.set_new_cluster()
4364
4645
  # Use the task_cloud, because the cloud in `to_provision` can be changed
4365
4646
  # later during the retry.
@@ -4394,20 +4675,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4394
4675
  to_provision = handle_before_refresh.launched_resources
4395
4676
  self.check_resources_fit_cluster(handle_before_refresh, task)
4396
4677
 
4397
- logger.info(
4398
- f'{colorama.Fore.CYAN}Creating a new cluster: {cluster_name!r} '
4399
- f'[{task.num_nodes}x {to_provision}].'
4400
- f'{colorama.Style.RESET_ALL}\n'
4401
- 'Tip: to reuse an existing cluster, '
4402
- 'specify --cluster (-c). '
4403
- 'Run `sky status` to see existing clusters.')
4404
4678
  return RetryingVmProvisioner.ToProvisionConfig(
4405
4679
  cluster_name,
4406
4680
  to_provision,
4407
4681
  task.num_nodes,
4408
4682
  prev_cluster_status=None,
4409
4683
  prev_handle=None,
4410
- prev_cluster_ever_up=False)
4684
+ prev_cluster_ever_up=False,
4685
+ prev_config_hash=prev_config_hash)
4411
4686
 
4412
4687
  def _execute_file_mounts(self, handle: CloudVmRayResourceHandle,
4413
4688
  file_mounts: Optional[Dict[Path, Path]]):
@@ -4423,34 +4698,36 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4423
4698
  symlink_commands = []
4424
4699
  fore = colorama.Fore
4425
4700
  style = colorama.Style
4426
- logger.info(f'{fore.CYAN}Processing file mounts.{style.RESET_ALL}')
4427
4701
  start = time.time()
4428
4702
  runners = handle.get_command_runners()
4429
4703
  log_path = os.path.join(self.log_dir, 'file_mounts.log')
4704
+ num_threads = subprocess_utils.get_max_workers_for_file_mounts(
4705
+ file_mounts, str(handle.launched_resources.cloud))
4430
4706
 
4431
4707
  # Check the files and warn
4432
4708
  for dst, src in file_mounts.items():
4433
4709
  if not data_utils.is_cloud_store_url(src):
4434
4710
  full_src = os.path.abspath(os.path.expanduser(src))
4435
4711
  # Checked during Task.set_file_mounts().
4436
- assert os.path.exists(full_src), f'{full_src} does not exist.'
4712
+ assert os.path.exists(
4713
+ full_src), f'{full_src} does not exist. {file_mounts}'
4437
4714
  src_size = backend_utils.path_size_megabytes(full_src)
4438
4715
  if src_size >= _PATH_SIZE_MEGABYTES_WARN_THRESHOLD:
4439
4716
  logger.warning(
4440
- f'{fore.YELLOW}The size of file mount src {src!r} '
4717
+ f' {fore.YELLOW}The size of file mount src {src!r} '
4441
4718
  f'is {src_size} MB. Try to keep src small or use '
4442
- '.gitignore to exclude large files, as large sizes '
4719
+ '.skyignore to exclude large files, as large sizes '
4443
4720
  f'will slow down rsync. {style.RESET_ALL}')
4444
4721
  if os.path.islink(full_src):
4445
4722
  logger.warning(
4446
- f'{fore.YELLOW}Source path {src!r} is a symlink. '
4723
+ f' {fore.YELLOW}Source path {src!r} is a symlink. '
4447
4724
  f'Symlink contents are not uploaded.{style.RESET_ALL}')
4448
4725
 
4449
4726
  os.makedirs(os.path.expanduser(self.log_dir), exist_ok=True)
4450
4727
  os.system(f'touch {log_path}')
4451
- tail_cmd = f'tail -n100 -f {log_path}'
4452
- logger.info('To view detailed progress: '
4453
- f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
4728
+
4729
+ rich_utils.force_update_status(
4730
+ ux_utils.spinner_message('Syncing file mounts', log_path))
4454
4731
 
4455
4732
  for dst, src in file_mounts.items():
4456
4733
  # TODO: room for improvement. Here there are many moving parts
@@ -4488,18 +4765,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4488
4765
  action_message='Syncing',
4489
4766
  log_path=log_path,
4490
4767
  stream_logs=False,
4768
+ num_threads=num_threads,
4491
4769
  )
4492
4770
  continue
4493
4771
 
4494
4772
  storage = cloud_stores.get_storage_from_path(src)
4495
4773
  if storage.is_directory(src):
4496
- sync = storage.make_sync_dir_command(source=src,
4497
- destination=wrapped_dst)
4774
+ sync_cmd = (storage.make_sync_dir_command(
4775
+ source=src, destination=wrapped_dst))
4498
4776
  # It is a directory so make sure it exists.
4499
4777
  mkdir_for_wrapped_dst = f'mkdir -p {wrapped_dst}'
4500
4778
  else:
4501
- sync = storage.make_sync_file_command(source=src,
4502
- destination=wrapped_dst)
4779
+ sync_cmd = (storage.make_sync_file_command(
4780
+ source=src, destination=wrapped_dst))
4503
4781
  # It is a file so make sure *its parent dir* exists.
4504
4782
  mkdir_for_wrapped_dst = (
4505
4783
  f'mkdir -p {os.path.dirname(wrapped_dst)}')
@@ -4508,7 +4786,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4508
4786
  # Ensure sync can write to wrapped_dst (e.g., '/data/').
4509
4787
  mkdir_for_wrapped_dst,
4510
4788
  # Both the wrapped and the symlink dir exist; sync.
4511
- sync,
4789
+ sync_cmd,
4512
4790
  ]
4513
4791
  command = ' && '.join(download_target_commands)
4514
4792
  # dst is only used for message printing.
@@ -4524,6 +4802,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4524
4802
  # Need to source bashrc, as the cloud specific CLI or SDK may
4525
4803
  # require PATH in bashrc.
4526
4804
  source_bashrc=True,
4805
+ num_threads=num_threads,
4527
4806
  )
4528
4807
  # (2) Run the commands to create symlinks on all the nodes.
4529
4808
  symlink_command = ' && '.join(symlink_commands)
@@ -4542,9 +4821,11 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4542
4821
  'Failed to create symlinks. The target destination '
4543
4822
  f'may already exist. Log: {log_path}')
4544
4823
 
4545
- subprocess_utils.run_in_parallel(_symlink_node, runners)
4824
+ subprocess_utils.run_in_parallel(_symlink_node, runners,
4825
+ num_threads)
4546
4826
  end = time.time()
4547
4827
  logger.debug(f'File mount sync took {end - start} seconds.')
4828
+ logger.info(ux_utils.finishing_message('Synced file_mounts.', log_path))
4548
4829
 
4549
4830
  def _execute_storage_mounts(
4550
4831
  self, handle: CloudVmRayResourceHandle,
@@ -4568,17 +4849,19 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4568
4849
  # Handle cases when there aren't any Storages with MOUNT mode.
4569
4850
  if not storage_mounts:
4570
4851
  return
4571
-
4572
- fore = colorama.Fore
4573
- style = colorama.Style
4574
- plural = 's' if len(storage_mounts) > 1 else ''
4575
- logger.info(f'{fore.CYAN}Processing {len(storage_mounts)} '
4576
- f'storage mount{plural}.{style.RESET_ALL}')
4577
4852
  start = time.time()
4578
4853
  runners = handle.get_command_runners()
4854
+ num_threads = subprocess_utils.get_parallel_threads(
4855
+ str(handle.launched_resources.cloud))
4579
4856
  log_path = os.path.join(self.log_dir, 'storage_mounts.log')
4580
4857
 
4858
+ plural = 's' if len(storage_mounts) > 1 else ''
4859
+ rich_utils.force_update_status(
4860
+ ux_utils.spinner_message(
4861
+ f'Mounting {len(storage_mounts)} storage{plural}', log_path))
4862
+
4581
4863
  for dst, storage_obj in storage_mounts.items():
4864
+ storage_obj.construct()
4582
4865
  if not os.path.isabs(dst) and not dst.startswith('~/'):
4583
4866
  dst = f'{SKY_REMOTE_WORKDIR}/{dst}'
4584
4867
  # Raised when the bucket is externall removed before re-mounting
@@ -4592,6 +4875,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4592
4875
  'successfully without mounting the bucket.')
4593
4876
  # Get the first store and use it to mount
4594
4877
  store = list(storage_obj.stores.values())[0]
4878
+ assert store is not None, storage_obj
4595
4879
  mount_cmd = store.mount_command(dst)
4596
4880
  src_print = (storage_obj.source
4597
4881
  if storage_obj.source else storage_obj.name)
@@ -4609,6 +4893,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4609
4893
  # Need to source bashrc, as the cloud specific CLI or SDK
4610
4894
  # may require PATH in bashrc.
4611
4895
  source_bashrc=True,
4896
+ num_threads=num_threads,
4612
4897
  )
4613
4898
  except exceptions.CommandError as e:
4614
4899
  if e.returncode == exceptions.MOUNT_PATH_NON_EMPTY_CODE:
@@ -4631,6 +4916,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4631
4916
 
4632
4917
  end = time.time()
4633
4918
  logger.debug(f'Storage mount sync took {end - start} seconds.')
4919
+ logger.info(ux_utils.finishing_message('Storage mounted.', log_path))
4634
4920
 
4635
4921
  def _set_storage_mounts_metadata(
4636
4922
  self, cluster_name: str,
@@ -4644,6 +4930,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4644
4930
  return
4645
4931
  storage_mounts_metadata = {}
4646
4932
  for dst, storage_obj in storage_mounts.items():
4933
+ if storage_obj.mode != storage_lib.StorageMode.MOUNT:
4934
+ # Skip non-mount storage objects, as there is no need to
4935
+ # reconstruct them during cluster restart.
4936
+ continue
4647
4937
  storage_mounts_metadata[dst] = storage_obj.handle
4648
4938
  lock_path = (
4649
4939
  backend_utils.CLUSTER_FILE_MOUNTS_LOCK_PATH.format(cluster_name))
@@ -4746,9 +5036,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4746
5036
  1,
4747
5037
  resources_dict,
4748
5038
  stable_cluster_internal_ips=internal_ips,
5039
+ env_vars=task_env_vars,
4749
5040
  setup_cmd=self._setup_cmd,
4750
5041
  setup_log_path=os.path.join(log_dir, 'setup.log'),
4751
- env_vars=task_env_vars,
4752
5042
  )
4753
5043
 
4754
5044
  if callable(task.run):
@@ -4795,9 +5085,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4795
5085
  num_actual_nodes,
4796
5086
  resources_dict,
4797
5087
  stable_cluster_internal_ips=internal_ips,
5088
+ env_vars=task_env_vars,
4798
5089
  setup_cmd=self._setup_cmd,
4799
5090
  setup_log_path=os.path.join(log_dir, 'setup.log'),
4800
- env_vars=task_env_vars)
5091
+ )
4801
5092
 
4802
5093
  if callable(task.run):
4803
5094
  run_fn_code = textwrap.dedent(inspect.getsource(task.run))