skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl

This diff shows the changes between two publicly available versions of this package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (299)
  1. sky/__init__.py +64 -32
  2. sky/adaptors/aws.py +23 -6
  3. sky/adaptors/azure.py +432 -15
  4. sky/adaptors/cloudflare.py +5 -5
  5. sky/adaptors/common.py +19 -9
  6. sky/adaptors/do.py +20 -0
  7. sky/adaptors/gcp.py +3 -2
  8. sky/adaptors/kubernetes.py +122 -88
  9. sky/adaptors/nebius.py +100 -0
  10. sky/adaptors/oci.py +39 -1
  11. sky/adaptors/vast.py +29 -0
  12. sky/admin_policy.py +101 -0
  13. sky/authentication.py +117 -98
  14. sky/backends/backend.py +52 -20
  15. sky/backends/backend_utils.py +669 -557
  16. sky/backends/cloud_vm_ray_backend.py +1099 -808
  17. sky/backends/local_docker_backend.py +14 -8
  18. sky/backends/wheel_utils.py +38 -20
  19. sky/benchmark/benchmark_utils.py +22 -23
  20. sky/check.py +76 -27
  21. sky/cli.py +1586 -1139
  22. sky/client/__init__.py +1 -0
  23. sky/client/cli.py +5683 -0
  24. sky/client/common.py +345 -0
  25. sky/client/sdk.py +1765 -0
  26. sky/cloud_stores.py +283 -19
  27. sky/clouds/__init__.py +7 -2
  28. sky/clouds/aws.py +303 -112
  29. sky/clouds/azure.py +185 -179
  30. sky/clouds/cloud.py +115 -37
  31. sky/clouds/cudo.py +29 -22
  32. sky/clouds/do.py +313 -0
  33. sky/clouds/fluidstack.py +44 -54
  34. sky/clouds/gcp.py +206 -65
  35. sky/clouds/ibm.py +26 -21
  36. sky/clouds/kubernetes.py +345 -91
  37. sky/clouds/lambda_cloud.py +40 -29
  38. sky/clouds/nebius.py +297 -0
  39. sky/clouds/oci.py +129 -90
  40. sky/clouds/paperspace.py +22 -18
  41. sky/clouds/runpod.py +53 -34
  42. sky/clouds/scp.py +28 -24
  43. sky/clouds/service_catalog/__init__.py +19 -13
  44. sky/clouds/service_catalog/aws_catalog.py +29 -12
  45. sky/clouds/service_catalog/azure_catalog.py +33 -6
  46. sky/clouds/service_catalog/common.py +95 -75
  47. sky/clouds/service_catalog/constants.py +3 -3
  48. sky/clouds/service_catalog/cudo_catalog.py +13 -3
  49. sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
  50. sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
  51. sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
  52. sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
  53. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
  54. sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
  55. sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
  56. sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
  57. sky/clouds/service_catalog/do_catalog.py +111 -0
  58. sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
  59. sky/clouds/service_catalog/gcp_catalog.py +16 -2
  60. sky/clouds/service_catalog/ibm_catalog.py +2 -2
  61. sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
  62. sky/clouds/service_catalog/lambda_catalog.py +8 -3
  63. sky/clouds/service_catalog/nebius_catalog.py +116 -0
  64. sky/clouds/service_catalog/oci_catalog.py +31 -4
  65. sky/clouds/service_catalog/paperspace_catalog.py +2 -2
  66. sky/clouds/service_catalog/runpod_catalog.py +2 -2
  67. sky/clouds/service_catalog/scp_catalog.py +2 -2
  68. sky/clouds/service_catalog/vast_catalog.py +104 -0
  69. sky/clouds/service_catalog/vsphere_catalog.py +2 -2
  70. sky/clouds/utils/aws_utils.py +65 -0
  71. sky/clouds/utils/azure_utils.py +91 -0
  72. sky/clouds/utils/gcp_utils.py +5 -9
  73. sky/clouds/utils/oci_utils.py +47 -5
  74. sky/clouds/utils/scp_utils.py +4 -3
  75. sky/clouds/vast.py +280 -0
  76. sky/clouds/vsphere.py +22 -18
  77. sky/core.py +361 -107
  78. sky/dag.py +41 -28
  79. sky/data/data_transfer.py +37 -0
  80. sky/data/data_utils.py +211 -32
  81. sky/data/mounting_utils.py +182 -30
  82. sky/data/storage.py +2118 -270
  83. sky/data/storage_utils.py +126 -5
  84. sky/exceptions.py +179 -8
  85. sky/execution.py +158 -85
  86. sky/global_user_state.py +150 -34
  87. sky/jobs/__init__.py +12 -10
  88. sky/jobs/client/__init__.py +0 -0
  89. sky/jobs/client/sdk.py +302 -0
  90. sky/jobs/constants.py +49 -11
  91. sky/jobs/controller.py +161 -99
  92. sky/jobs/dashboard/dashboard.py +171 -25
  93. sky/jobs/dashboard/templates/index.html +572 -60
  94. sky/jobs/recovery_strategy.py +157 -156
  95. sky/jobs/scheduler.py +307 -0
  96. sky/jobs/server/__init__.py +1 -0
  97. sky/jobs/server/core.py +598 -0
  98. sky/jobs/server/dashboard_utils.py +69 -0
  99. sky/jobs/server/server.py +190 -0
  100. sky/jobs/state.py +627 -122
  101. sky/jobs/utils.py +615 -206
  102. sky/models.py +27 -0
  103. sky/optimizer.py +142 -83
  104. sky/provision/__init__.py +20 -5
  105. sky/provision/aws/config.py +124 -42
  106. sky/provision/aws/instance.py +130 -53
  107. sky/provision/azure/__init__.py +7 -0
  108. sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
  109. sky/provision/azure/config.py +220 -0
  110. sky/provision/azure/instance.py +1012 -37
  111. sky/provision/common.py +31 -3
  112. sky/provision/constants.py +25 -0
  113. sky/provision/cudo/__init__.py +2 -1
  114. sky/provision/cudo/cudo_utils.py +112 -0
  115. sky/provision/cudo/cudo_wrapper.py +37 -16
  116. sky/provision/cudo/instance.py +28 -12
  117. sky/provision/do/__init__.py +11 -0
  118. sky/provision/do/config.py +14 -0
  119. sky/provision/do/constants.py +10 -0
  120. sky/provision/do/instance.py +287 -0
  121. sky/provision/do/utils.py +301 -0
  122. sky/provision/docker_utils.py +82 -46
  123. sky/provision/fluidstack/fluidstack_utils.py +57 -125
  124. sky/provision/fluidstack/instance.py +15 -43
  125. sky/provision/gcp/config.py +19 -9
  126. sky/provision/gcp/constants.py +7 -1
  127. sky/provision/gcp/instance.py +55 -34
  128. sky/provision/gcp/instance_utils.py +339 -80
  129. sky/provision/gcp/mig_utils.py +210 -0
  130. sky/provision/instance_setup.py +172 -133
  131. sky/provision/kubernetes/__init__.py +1 -0
  132. sky/provision/kubernetes/config.py +104 -90
  133. sky/provision/kubernetes/constants.py +8 -0
  134. sky/provision/kubernetes/instance.py +680 -325
  135. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
  136. sky/provision/kubernetes/network.py +54 -20
  137. sky/provision/kubernetes/network_utils.py +70 -21
  138. sky/provision/kubernetes/utils.py +1370 -251
  139. sky/provision/lambda_cloud/__init__.py +11 -0
  140. sky/provision/lambda_cloud/config.py +10 -0
  141. sky/provision/lambda_cloud/instance.py +265 -0
  142. sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
  143. sky/provision/logging.py +1 -1
  144. sky/provision/nebius/__init__.py +11 -0
  145. sky/provision/nebius/config.py +11 -0
  146. sky/provision/nebius/instance.py +285 -0
  147. sky/provision/nebius/utils.py +318 -0
  148. sky/provision/oci/__init__.py +15 -0
  149. sky/provision/oci/config.py +51 -0
  150. sky/provision/oci/instance.py +436 -0
  151. sky/provision/oci/query_utils.py +681 -0
  152. sky/provision/paperspace/constants.py +6 -0
  153. sky/provision/paperspace/instance.py +4 -3
  154. sky/provision/paperspace/utils.py +2 -0
  155. sky/provision/provisioner.py +207 -130
  156. sky/provision/runpod/__init__.py +1 -0
  157. sky/provision/runpod/api/__init__.py +3 -0
  158. sky/provision/runpod/api/commands.py +119 -0
  159. sky/provision/runpod/api/pods.py +142 -0
  160. sky/provision/runpod/instance.py +64 -8
  161. sky/provision/runpod/utils.py +239 -23
  162. sky/provision/vast/__init__.py +10 -0
  163. sky/provision/vast/config.py +11 -0
  164. sky/provision/vast/instance.py +247 -0
  165. sky/provision/vast/utils.py +162 -0
  166. sky/provision/vsphere/common/vim_utils.py +1 -1
  167. sky/provision/vsphere/instance.py +8 -18
  168. sky/provision/vsphere/vsphere_utils.py +1 -1
  169. sky/resources.py +247 -102
  170. sky/serve/__init__.py +9 -9
  171. sky/serve/autoscalers.py +361 -299
  172. sky/serve/client/__init__.py +0 -0
  173. sky/serve/client/sdk.py +366 -0
  174. sky/serve/constants.py +12 -3
  175. sky/serve/controller.py +106 -36
  176. sky/serve/load_balancer.py +63 -12
  177. sky/serve/load_balancing_policies.py +84 -2
  178. sky/serve/replica_managers.py +42 -34
  179. sky/serve/serve_state.py +62 -32
  180. sky/serve/serve_utils.py +271 -160
  181. sky/serve/server/__init__.py +0 -0
  182. sky/serve/{core.py → server/core.py} +271 -90
  183. sky/serve/server/server.py +112 -0
  184. sky/serve/service.py +52 -16
  185. sky/serve/service_spec.py +95 -32
  186. sky/server/__init__.py +1 -0
  187. sky/server/common.py +430 -0
  188. sky/server/constants.py +21 -0
  189. sky/server/html/log.html +174 -0
  190. sky/server/requests/__init__.py +0 -0
  191. sky/server/requests/executor.py +472 -0
  192. sky/server/requests/payloads.py +487 -0
  193. sky/server/requests/queues/__init__.py +0 -0
  194. sky/server/requests/queues/mp_queue.py +76 -0
  195. sky/server/requests/requests.py +567 -0
  196. sky/server/requests/serializers/__init__.py +0 -0
  197. sky/server/requests/serializers/decoders.py +192 -0
  198. sky/server/requests/serializers/encoders.py +166 -0
  199. sky/server/server.py +1106 -0
  200. sky/server/stream_utils.py +141 -0
  201. sky/setup_files/MANIFEST.in +2 -5
  202. sky/setup_files/dependencies.py +159 -0
  203. sky/setup_files/setup.py +14 -125
  204. sky/sky_logging.py +59 -14
  205. sky/skylet/autostop_lib.py +2 -2
  206. sky/skylet/constants.py +183 -50
  207. sky/skylet/events.py +22 -10
  208. sky/skylet/job_lib.py +403 -258
  209. sky/skylet/log_lib.py +111 -71
  210. sky/skylet/log_lib.pyi +6 -0
  211. sky/skylet/providers/command_runner.py +6 -8
  212. sky/skylet/providers/ibm/node_provider.py +2 -2
  213. sky/skylet/providers/scp/config.py +11 -3
  214. sky/skylet/providers/scp/node_provider.py +8 -8
  215. sky/skylet/skylet.py +3 -1
  216. sky/skylet/subprocess_daemon.py +69 -17
  217. sky/skypilot_config.py +119 -57
  218. sky/task.py +205 -64
  219. sky/templates/aws-ray.yml.j2 +37 -7
  220. sky/templates/azure-ray.yml.j2 +27 -82
  221. sky/templates/cudo-ray.yml.j2 +7 -3
  222. sky/templates/do-ray.yml.j2 +98 -0
  223. sky/templates/fluidstack-ray.yml.j2 +7 -4
  224. sky/templates/gcp-ray.yml.j2 +26 -6
  225. sky/templates/ibm-ray.yml.j2 +3 -2
  226. sky/templates/jobs-controller.yaml.j2 +46 -11
  227. sky/templates/kubernetes-ingress.yml.j2 +7 -0
  228. sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
  229. sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
  230. sky/templates/kubernetes-ray.yml.j2 +292 -25
  231. sky/templates/lambda-ray.yml.j2 +30 -40
  232. sky/templates/nebius-ray.yml.j2 +79 -0
  233. sky/templates/oci-ray.yml.j2 +18 -57
  234. sky/templates/paperspace-ray.yml.j2 +10 -6
  235. sky/templates/runpod-ray.yml.j2 +26 -4
  236. sky/templates/scp-ray.yml.j2 +3 -2
  237. sky/templates/sky-serve-controller.yaml.j2 +12 -1
  238. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  239. sky/templates/vast-ray.yml.j2 +70 -0
  240. sky/templates/vsphere-ray.yml.j2 +8 -3
  241. sky/templates/websocket_proxy.py +64 -0
  242. sky/usage/constants.py +10 -1
  243. sky/usage/usage_lib.py +130 -37
  244. sky/utils/accelerator_registry.py +35 -51
  245. sky/utils/admin_policy_utils.py +147 -0
  246. sky/utils/annotations.py +51 -0
  247. sky/utils/cli_utils/status_utils.py +81 -23
  248. sky/utils/cluster_utils.py +356 -0
  249. sky/utils/command_runner.py +452 -89
  250. sky/utils/command_runner.pyi +77 -3
  251. sky/utils/common.py +54 -0
  252. sky/utils/common_utils.py +319 -108
  253. sky/utils/config_utils.py +204 -0
  254. sky/utils/control_master_utils.py +48 -0
  255. sky/utils/controller_utils.py +548 -266
  256. sky/utils/dag_utils.py +93 -32
  257. sky/utils/db_utils.py +18 -4
  258. sky/utils/env_options.py +29 -7
  259. sky/utils/kubernetes/create_cluster.sh +8 -60
  260. sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
  261. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  262. sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
  263. sky/utils/kubernetes/gpu_labeler.py +4 -4
  264. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
  265. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  266. sky/utils/kubernetes/rsync_helper.sh +24 -0
  267. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
  268. sky/utils/log_utils.py +240 -33
  269. sky/utils/message_utils.py +81 -0
  270. sky/utils/registry.py +127 -0
  271. sky/utils/resources_utils.py +94 -22
  272. sky/utils/rich_utils.py +247 -18
  273. sky/utils/schemas.py +284 -64
  274. sky/{status_lib.py → utils/status_lib.py} +12 -7
  275. sky/utils/subprocess_utils.py +212 -46
  276. sky/utils/timeline.py +12 -7
  277. sky/utils/ux_utils.py +168 -15
  278. skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
  279. skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
  280. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
  281. sky/clouds/cloud_registry.py +0 -31
  282. sky/jobs/core.py +0 -330
  283. sky/skylet/providers/azure/__init__.py +0 -2
  284. sky/skylet/providers/azure/azure-vm-template.json +0 -301
  285. sky/skylet/providers/azure/config.py +0 -170
  286. sky/skylet/providers/azure/node_provider.py +0 -466
  287. sky/skylet/providers/lambda_cloud/__init__.py +0 -2
  288. sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
  289. sky/skylet/providers/oci/__init__.py +0 -2
  290. sky/skylet/providers/oci/node_provider.py +0 -488
  291. sky/skylet/providers/oci/query_helper.py +0 -383
  292. sky/skylet/providers/oci/utils.py +0 -21
  293. sky/utils/cluster_yaml_utils.py +0 -24
  294. sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
  295. skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
  296. skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
  297. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
  298. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
  299. {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/cli.py CHANGED
@@ -26,59 +26,66 @@ each other.
26
26
  import copy
27
27
  import datetime
28
28
  import functools
29
- import multiprocessing
29
+ import getpass
30
30
  import os
31
31
  import shlex
32
- import signal
32
+ import shutil
33
33
  import subprocess
34
34
  import sys
35
35
  import textwrap
36
- import time
36
+ import traceback
37
37
  import typing
38
- from typing import Any, Dict, List, Optional, Tuple, Union
39
- import webbrowser
38
+ from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
40
39
 
41
40
  import click
42
41
  import colorama
43
42
  import dotenv
43
+ import requests as requests_lib
44
44
  from rich import progress as rich_progress
45
45
  import yaml
46
46
 
47
47
  import sky
48
48
  from sky import backends
49
- from sky import check as sky_check
50
- from sky import clouds as sky_clouds
51
- from sky import core
49
+ from sky import clouds
52
50
  from sky import exceptions
53
51
  from sky import global_user_state
54
52
  from sky import jobs as managed_jobs
53
+ from sky import models
55
54
  from sky import serve as serve_lib
56
55
  from sky import sky_logging
57
- from sky import status_lib
58
56
  from sky.adaptors import common as adaptors_common
59
- from sky.backends import backend_utils
60
57
  from sky.benchmark import benchmark_state
61
58
  from sky.benchmark import benchmark_utils
59
+ from sky.client import sdk
62
60
  from sky.clouds import service_catalog
63
61
  from sky.data import storage_utils
62
+ from sky.provision.kubernetes import constants as kubernetes_constants
64
63
  from sky.provision.kubernetes import utils as kubernetes_utils
64
+ from sky.server import common as server_common
65
+ from sky.server import constants as server_constants
66
+ from sky.server.requests import requests
65
67
  from sky.skylet import constants
66
68
  from sky.skylet import job_lib
67
- from sky.skylet import log_lib
68
69
  from sky.usage import usage_lib
70
+ from sky.utils import annotations
71
+ from sky.utils import cluster_utils
72
+ from sky.utils import common
69
73
  from sky.utils import common_utils
70
74
  from sky.utils import controller_utils
71
75
  from sky.utils import dag_utils
76
+ from sky.utils import env_options
72
77
  from sky.utils import log_utils
78
+ from sky.utils import registry
73
79
  from sky.utils import resources_utils
74
80
  from sky.utils import rich_utils
81
+ from sky.utils import status_lib
75
82
  from sky.utils import subprocess_utils
76
83
  from sky.utils import timeline
77
84
  from sky.utils import ux_utils
78
85
  from sky.utils.cli_utils import status_utils
79
86
 
80
87
  if typing.TYPE_CHECKING:
81
- from sky.backends import backend as backend_lib
88
+ import types
82
89
 
83
90
  pd = adaptors_common.LazyImport('pandas')
84
91
  logger = sky_logging.init_logger(__name__)
@@ -98,23 +105,96 @@ _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = (
98
105
  '{cluster_num} cluster{plural} {verb}. Please specify {cause} '
99
106
  'cluster to show its {property}.\nUsage: `sky status --{flag} <cluster>`')
100
107
 
101
- _ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, '
102
- 'please retry after a while.')
103
-
104
108
  _DAG_NOT_SUPPORTED_MESSAGE = ('YAML specifies a DAG which is only supported by '
105
109
  '`sky jobs launch`. `{command}` supports a '
106
110
  'single task only.')
107
111
 
108
112
 
109
- def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]:
110
- """Returns a list of clusters that match the glob pattern."""
111
- glob_clusters = []
112
- for cluster in clusters:
113
- glob_cluster = global_user_state.get_glob_cluster_names(cluster)
114
- if len(glob_cluster) == 0 and not silent:
115
- click.echo(f'Cluster {cluster} not found.')
116
- glob_clusters.extend(glob_cluster)
117
- return list(set(glob_clusters))
113
+ def _get_cluster_records_and_set_ssh_config(
114
+ clusters: Optional[List[str]],
115
+ refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
116
+ all_users: bool = False,
117
+ ) -> List[dict]:
118
+ """Returns a list of clusters that match the glob pattern.
119
+
120
+ Args:
121
+ clusters: A list of cluster names to query. If None, query all clusters.
122
+ refresh: The refresh mode for the status command.
123
+ all_users: Whether to query clusters from all users.
124
+ If clusters is not None, this field is ignored because cluster list
125
+ can include other users' clusters.
126
+ """
127
+ # TODO(zhwu): we should move this function into SDK.
128
+ # TODO(zhwu): this additional RTT makes CLIs slow. We should optimize this.
129
+ if clusters is not None:
130
+ all_users = True
131
+ request_id = sdk.status(clusters, refresh=refresh, all_users=all_users)
132
+ cluster_records = sdk.stream_and_get(request_id)
133
+ # Update the SSH config for all clusters
134
+ for record in cluster_records:
135
+ handle = record['handle']
136
+ # During the failover, even though a cluster does not exist, the handle
137
+ # can still exist in the record, and we check for credentials to avoid
138
+ # updating the SSH config for non-existent clusters.
139
+ if (handle is not None and handle.cached_external_ips is not None and
140
+ 'credentials' in record):
141
+ credentials = record['credentials']
142
+ if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
143
+ # Replace the proxy command to proxy through the SkyPilot API
144
+ # server with websocket.
145
+ key_path = (
146
+ cluster_utils.SSHConfigHelper.generate_local_key_file(
147
+ handle.cluster_name, credentials))
148
+ # Instead of directly use websocket_proxy.py, we add an
149
+ # additional proxy, so that ssh can use the head pod in the
150
+ # cluster to jump to worker pods.
151
+ proxy_command = (
152
+ f'ssh -tt -i {key_path} '
153
+ '-o StrictHostKeyChecking=no '
154
+ '-o UserKnownHostsFile=/dev/null '
155
+ '-o IdentitiesOnly=yes '
156
+ '-W %h:%p '
157
+ f'{handle.ssh_user}@127.0.0.1 '
158
+ '-o ProxyCommand='
159
+ # TODO(zhwu): write the template to a temp file, don't use
160
+ # the one in skypilot repo, to avoid changing the file when
161
+ # updating skypilot.
162
+ f'\'{sys.executable} {sky.__root_dir__}/templates/'
163
+ f'websocket_proxy.py '
164
+ f'{server_common.get_server_url().split("://")[1]} '
165
+ f'{handle.cluster_name}\'')
166
+ credentials['ssh_proxy_command'] = proxy_command
167
+ cluster_utils.SSHConfigHelper.add_cluster(
168
+ handle.cluster_name,
169
+ handle.cached_external_ips,
170
+ credentials,
171
+ handle.cached_external_ssh_ports,
172
+ handle.docker_user,
173
+ handle.ssh_user,
174
+ )
175
+ else:
176
+ # If the cluster is not UP or does not have credentials available,
177
+ # we need to remove the cluster from the SSH config.
178
+ cluster_utils.SSHConfigHelper.remove_cluster(record['name'])
179
+
180
+ # Clean up SSH configs for clusters that do not exist.
181
+ #
182
+ # We do this in a conservative way: only when a query is made for all users
183
+ # or specific clusters. Without those, the table returned only contains the
184
+ # current user's clusters, and the information is not enough for
185
+ # removing clusters, because SkyPilot has no idea whether to remove
186
+ # ssh config of a cluster from another user.
187
+ clusters_exists = set(record['name'] for record in cluster_records)
188
+ if clusters is not None:
189
+ for cluster in clusters:
190
+ if cluster not in clusters_exists:
191
+ cluster_utils.SSHConfigHelper.remove_cluster(cluster)
192
+ elif all_users:
193
+ for cluster_name in cluster_utils.SSHConfigHelper.list_cluster_names():
194
+ if cluster_name not in clusters_exists:
195
+ cluster_utils.SSHConfigHelper.remove_cluster(cluster_name)
196
+
197
+ return cluster_records
118
198
 
119
199
 
120
200
  def _get_glob_storages(storages: List[str]) -> List[str]:
@@ -122,7 +202,7 @@ def _get_glob_storages(storages: List[str]) -> List[str]:
122
202
  glob_storages = []
123
203
  for storage_object in storages:
124
204
  glob_storage = global_user_state.get_glob_storage_name(storage_object)
125
- if len(glob_storage) == 0:
205
+ if not glob_storage:
126
206
  click.echo(f'Storage {storage_object} not found.')
127
207
  glob_storages.extend(glob_storage)
128
208
  return list(set(glob_storages))
@@ -144,6 +224,44 @@ def _parse_env_var(env_var: str) -> Tuple[str, str]:
144
224
  return ret[0], ret[1]
145
225
 
146
226
 
227
+ def _async_call_or_wait(request_id: str, async_call: bool,
228
+ request_name: str) -> Any:
229
+ short_request_id = request_id[:8]
230
+ if not async_call:
231
+ try:
232
+ return sdk.stream_and_get(request_id)
233
+ except KeyboardInterrupt:
234
+ logger.info(
235
+ ux_utils.starting_message('Request will continue running '
236
+ 'asynchronously.') +
237
+ f'\n{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}View logs: '
238
+ f'{ux_utils.BOLD}sky api logs {short_request_id}'
239
+ f'{colorama.Style.RESET_ALL}'
240
+ f'\n{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}Or, '
241
+ 'visit: '
242
+ f'{server_common.get_server_url()}/api/stream?'
243
+ f'request_id={short_request_id}'
244
+ f'\n{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}To cancel '
245
+ 'the request, run: '
246
+ f'{ux_utils.BOLD}sky api cancel {short_request_id}'
247
+ f'{colorama.Style.RESET_ALL}'
248
+ f'\n{colorama.Style.RESET_ALL}')
249
+ raise
250
+ else:
251
+ click.secho(f'Submitted {request_name} request: {request_id}',
252
+ fg='green')
253
+ click.echo(
254
+ f'{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}Check logs with: '
255
+ f'sky api logs {short_request_id}{colorama.Style.RESET_ALL}\n'
256
+ f'{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}Or, visit: '
257
+ f'{server_common.get_server_url()}/api/stream?'
258
+ f'request_id={short_request_id}'
259
+ f'\n{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}To cancel '
260
+ 'the request, run: '
261
+ f'{ux_utils.BOLD}sky api cancel {short_request_id}'
262
+ f'{colorama.Style.RESET_ALL}\n')
263
+
264
+
147
265
  def _merge_env_vars(env_dict: Optional[Dict[str, str]],
148
266
  env_list: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
149
267
  """Merges all values from env_list into env_dict."""
@@ -154,6 +272,15 @@ def _merge_env_vars(env_dict: Optional[Dict[str, str]],
154
272
  return list(env_dict.items())
155
273
 
156
274
 
275
+ _COMMON_OPTIONS = [
276
+ click.option('--async/--no-async',
277
+ 'async_call',
278
+ required=False,
279
+ is_flag=True,
280
+ default=False,
281
+ help=('Run the command asynchronously.'))
282
+ ]
283
+
157
284
  _TASK_OPTIONS = [
158
285
  click.option(
159
286
  '--workdir',
@@ -305,14 +432,28 @@ def _complete_cluster_name(ctx: click.Context, param: click.Parameter,
305
432
  incomplete: str) -> List[str]:
306
433
  """Handle shell completion for cluster names."""
307
434
  del ctx, param # Unused.
308
- return global_user_state.get_cluster_names_start_with(incomplete)
435
+ # TODO(zhwu): we send requests to API server for completion, which can cause
436
+ # large latency. We should investigate caching mechanism if needed.
437
+ response = requests_lib.get(
438
+ f'{server_common.get_server_url()}'
439
+ f'/api/completion/cluster_name?incomplete={incomplete}',
440
+ timeout=2.0,
441
+ )
442
+ response.raise_for_status()
443
+ return response.json()
309
444
 
310
445
 
311
446
  def _complete_storage_name(ctx: click.Context, param: click.Parameter,
312
447
  incomplete: str) -> List[str]:
313
448
  """Handle shell completion for storage names."""
314
449
  del ctx, param # Unused.
315
- return global_user_state.get_storage_names_start_with(incomplete)
450
+ response = requests_lib.get(
451
+ f'{server_common.get_server_url()}'
452
+ f'/api/completion/storage_name?incomplete={incomplete}',
453
+ timeout=2.0,
454
+ )
455
+ response.raise_for_status()
456
+ return response.json()
316
457
 
317
458
 
318
459
  def _complete_file_name(ctx: click.Context, param: click.Parameter,
@@ -338,7 +479,6 @@ def _get_shell_complete_args(complete_fn):
338
479
 
339
480
 
340
481
  _RELOAD_ZSH_CMD = 'source ~/.zshrc'
341
- _RELOAD_FISH_CMD = 'source ~/.config/fish/config.fish'
342
482
  _RELOAD_BASH_CMD = 'source ~/.bashrc'
343
483
 
344
484
 
@@ -368,14 +508,18 @@ def _install_shell_completion(ctx: click.Context, param: click.Parameter,
368
508
  echo "{bashrc_diff}" >> ~/.bashrc'
369
509
 
370
510
  cmd = (f'(grep -q "SkyPilot" ~/.bashrc) || '
371
- f'[[ ${{BASH_VERSINFO[0]}} -ge 4 ]] && ({install_cmd})')
511
+ f'([[ ${{BASH_VERSINFO[0]}} -ge 4 ]] && ({install_cmd}) || '
512
+ f'(echo "Bash must be version 4 or above." && exit 1))')
513
+
372
514
  reload_cmd = _RELOAD_BASH_CMD
373
515
 
374
516
  elif value == 'fish':
375
517
  cmd = '_SKY_COMPLETE=fish_source sky > \
376
518
  ~/.config/fish/completions/sky.fish'
377
519
 
378
- reload_cmd = _RELOAD_FISH_CMD
520
+ # Fish does not need to be reloaded and will automatically pick up
521
+ # completions.
522
+ reload_cmd = None
379
523
 
380
524
  elif value == 'zsh':
381
525
  install_cmd = f'_SKY_COMPLETE=zsh_source sky > \
@@ -390,11 +534,15 @@ def _install_shell_completion(ctx: click.Context, param: click.Parameter,
390
534
  ctx.exit()
391
535
 
392
536
  try:
393
- subprocess.run(cmd, shell=True, check=True, executable='/bin/bash')
537
+ subprocess.run(cmd,
538
+ shell=True,
539
+ check=True,
540
+ executable=shutil.which('bash'))
394
541
  click.secho(f'Shell completion installed for {value}', fg='green')
395
- click.echo(
396
- 'Completion will take effect once you restart the terminal: ' +
397
- click.style(f'{reload_cmd}', bold=True))
542
+ if reload_cmd is not None:
543
+ click.echo(
544
+ 'Completion will take effect once you restart the terminal: ' +
545
+ click.style(f'{reload_cmd}', bold=True))
398
546
  except subprocess.CalledProcessError as e:
399
547
  click.secho(f'> Installation failed with code {e.returncode}', fg='red')
400
548
  ctx.exit()
@@ -425,7 +573,9 @@ def _uninstall_shell_completion(ctx: click.Context, param: click.Parameter,
425
573
 
426
574
  elif value == 'fish':
427
575
  cmd = 'rm -f ~/.config/fish/completions/sky.fish'
428
- reload_cmd = _RELOAD_FISH_CMD
576
+ # Fish does not need to be reloaded and will automatically pick up
577
+ # completions.
578
+ reload_cmd = None
429
579
 
430
580
  elif value == 'zsh':
431
581
  cmd = 'sed -i"" -e "/# For SkyPilot shell completion/d" ~/.zshrc && \
@@ -441,8 +591,10 @@ def _uninstall_shell_completion(ctx: click.Context, param: click.Parameter,
441
591
  try:
442
592
  subprocess.run(cmd, shell=True, check=True)
443
593
  click.secho(f'Shell completion uninstalled for {value}', fg='green')
444
- click.echo('Changes will take effect once you restart the terminal: ' +
445
- click.style(f'{reload_cmd}', bold=True))
594
+ if reload_cmd is not None:
595
+ click.echo(
596
+ 'Changes will take effect once you restart the terminal: ' +
597
+ click.style(f'{reload_cmd}', bold=True))
446
598
  except subprocess.CalledProcessError as e:
447
599
  click.secho(f'> Uninstallation failed with code {e.returncode}',
448
600
  fg='red')
@@ -472,14 +624,14 @@ def _parse_override_params(
472
624
  image_id: Optional[str] = None,
473
625
  disk_size: Optional[int] = None,
474
626
  disk_tier: Optional[str] = None,
475
- ports: Optional[Tuple[str]] = None) -> Dict[str, Any]:
627
+ ports: Optional[Tuple[str, ...]] = None) -> Dict[str, Any]:
476
628
  """Parses the override parameters into a dictionary."""
477
629
  override_params: Dict[str, Any] = {}
478
630
  if cloud is not None:
479
631
  if cloud.lower() == 'none':
480
632
  override_params['cloud'] = None
481
633
  else:
482
- override_params['cloud'] = sky_clouds.CLOUD_REGISTRY.from_str(cloud)
634
+ override_params['cloud'] = registry.CLOUD_REGISTRY.from_str(cloud)
483
635
  if region is not None:
484
636
  if region.lower() == 'none':
485
637
  override_params['region'] = None
@@ -525,91 +677,17 @@ def _parse_override_params(
525
677
  else:
526
678
  override_params['disk_tier'] = disk_tier
527
679
  if ports:
528
- override_params['ports'] = ports
680
+ if any(p.lower() == 'none' for p in ports):
681
+ if len(ports) > 1:
682
+ with ux_utils.print_exception_no_traceback():
683
+ raise ValueError('Cannot specify both "none" and other '
684
+ 'ports.')
685
+ override_params['ports'] = None
686
+ else:
687
+ override_params['ports'] = ports
529
688
  return override_params
530
689
 
531
690
 
532
- def _launch_with_confirm(
533
- task: sky.Task,
534
- backend: backends.Backend,
535
- cluster: Optional[str],
536
- *,
537
- dryrun: bool,
538
- detach_run: bool,
539
- detach_setup: bool = False,
540
- no_confirm: bool = False,
541
- idle_minutes_to_autostop: Optional[int] = None,
542
- down: bool = False, # pylint: disable=redefined-outer-name
543
- retry_until_up: bool = False,
544
- no_setup: bool = False,
545
- clone_disk_from: Optional[str] = None,
546
- ):
547
- """Launch a cluster with a Task."""
548
- if cluster is None:
549
- cluster = backend_utils.generate_cluster_name()
550
-
551
- clone_source_str = ''
552
- if clone_disk_from is not None:
553
- clone_source_str = f' from the disk of {clone_disk_from!r}'
554
- task, _ = backend_utils.check_can_clone_disk_and_override_task(
555
- clone_disk_from, cluster, task)
556
-
557
- with sky.Dag() as dag:
558
- dag.add(task)
559
-
560
- maybe_status, handle = backend_utils.refresh_cluster_status_handle(cluster)
561
- if maybe_status is None:
562
- # Show the optimize log before the prompt if the cluster does not exist.
563
- try:
564
- sky_check.get_cached_enabled_clouds_or_refresh(
565
- raise_if_no_cloud_access=True)
566
- except exceptions.NoCloudAccessError as e:
567
- # Catch the exception where the public cloud is not enabled, and
568
- # make it yellow for better visibility.
569
- with ux_utils.print_exception_no_traceback():
570
- raise RuntimeError(f'{colorama.Fore.YELLOW}{e}'
571
- f'{colorama.Style.RESET_ALL}') from e
572
- dag = sky.optimize(dag)
573
- task = dag.tasks[0]
574
-
575
- if handle is not None:
576
- backend.check_resources_fit_cluster(handle, task)
577
-
578
- confirm_shown = False
579
- if not no_confirm:
580
- # Prompt if (1) --cluster is None, or (2) cluster doesn't exist, or (3)
581
- # it exists but is STOPPED.
582
- prompt = None
583
- if maybe_status is None:
584
- cluster_str = '' if cluster is None else f' {cluster!r}'
585
- prompt = (
586
- f'Launching a new cluster{cluster_str}{clone_source_str}. '
587
- 'Proceed?')
588
- elif maybe_status == status_lib.ClusterStatus.STOPPED:
589
- prompt = f'Restarting the stopped cluster {cluster!r}. Proceed?'
590
- if prompt is not None:
591
- confirm_shown = True
592
- click.confirm(prompt, default=True, abort=True, show_default=True)
593
-
594
- if not confirm_shown:
595
- click.secho(f'Running task on cluster {cluster}...', fg='yellow')
596
-
597
- sky.launch(
598
- dag,
599
- dryrun=dryrun,
600
- stream_logs=True,
601
- cluster_name=cluster,
602
- detach_setup=detach_setup,
603
- detach_run=detach_run,
604
- backend=backend,
605
- idle_minutes_to_autostop=idle_minutes_to_autostop,
606
- down=down,
607
- retry_until_up=retry_until_up,
608
- no_setup=no_setup,
609
- clone_disk_from=clone_disk_from,
610
- )
611
-
612
-
613
691
  def _check_yaml(entrypoint: str) -> Tuple[bool, Optional[Dict[str, Any]]]:
614
692
  """Checks if entrypoint is a readable YAML file.
615
693
 
@@ -690,7 +768,6 @@ def _pop_and_ignore_fields_in_override_params(
690
768
  def _make_task_or_dag_from_entrypoint_with_overrides(
691
769
  entrypoint: Tuple[str, ...],
692
770
  *,
693
- entrypoint_name: str = 'Task',
694
771
  name: Optional[str] = None,
695
772
  workdir: Optional[str] = None,
696
773
  cloud: Optional[str] = None,
@@ -705,7 +782,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
705
782
  image_id: Optional[str] = None,
706
783
  disk_size: Optional[int] = None,
707
784
  disk_tier: Optional[str] = None,
708
- ports: Optional[Tuple[str]] = None,
785
+ ports: Optional[Tuple[str, ...]] = None,
709
786
  env: Optional[List[Tuple[str, str]]] = None,
710
787
  field_to_ignore: Optional[List[str]] = None,
711
788
  # job launch specific
@@ -722,19 +799,15 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
722
799
  entrypoint: Optional[str]
723
800
  if is_yaml:
724
801
  # Treat entrypoint as a yaml.
725
- click.secho(f'{entrypoint_name} from YAML spec: ',
726
- fg='yellow',
727
- nl=False)
728
- click.secho(entrypoint, bold=True)
802
+ click.secho('YAML to run: ', fg='cyan', nl=False)
803
+ click.secho(entrypoint)
729
804
  else:
730
805
  if not entrypoint:
731
806
  entrypoint = None
732
807
  else:
733
808
  # Treat entrypoint as a bash command.
734
- click.secho(f'{entrypoint_name} from command: ',
735
- fg='yellow',
736
- nl=False)
737
- click.secho(entrypoint, bold=True)
809
+ click.secho('Command to run: ', fg='cyan', nl=False)
810
+ click.secho(entrypoint)
738
811
 
739
812
  override_params = _parse_override_params(cloud=cloud,
740
813
  region=region,
@@ -798,7 +871,7 @@ class _NaturalOrderGroup(click.Group):
798
871
  Reference: https://github.com/pallets/click/issues/513
799
872
  """
800
873
 
801
- def list_commands(self, ctx):
874
+ def list_commands(self, ctx): # pylint: disable=unused-argument
802
875
  return self.commands.keys()
803
876
 
804
877
  @usage_lib.entrypoint('sky.cli', fallback=True)
@@ -925,6 +998,7 @@ def _deprecate_and_hide_command(group, command_to_deprecate,
925
998
  prog_name='skypilot',
926
999
  message='%(prog)s, commit %(version)s',
927
1000
  help='Show the commit hash and exit')
1001
+ @annotations.client_api
928
1002
  def cli():
929
1003
  pass
930
1004
 
@@ -945,20 +1019,9 @@ def cli():
945
1019
  default=False,
946
1020
  is_flag=True,
947
1021
  help='If True, do not actually run the job.')
948
- @click.option(
949
- '--detach-setup',
950
- '-s',
951
- default=False,
952
- is_flag=True,
953
- help=
954
- ('If True, run setup in non-interactive mode as part of the job itself. '
955
- 'You can safely ctrl-c to detach from logging, and it will not interrupt '
956
- 'the setup process. To see the logs again after detaching, use `sky logs`.'
957
- ' To cancel setup, cancel the job via `sky cancel`. Useful for long-'
958
- 'running setup commands.'))
959
1022
  @click.option(
960
1023
  '--detach-run',
961
- '-d',
1024
+ '-d/-no-d',
962
1025
  default=False,
963
1026
  is_flag=True,
964
1027
  help=('If True, as soon as a job is submitted, return from this call '
@@ -967,8 +1030,12 @@ def cli():
967
1030
  'backend_name',
968
1031
  flag_value=backends.LocalDockerBackend.NAME,
969
1032
  default=False,
970
- help='If used, runs locally inside a docker container.')
971
- @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS)
1033
+ hidden=True,
1034
+ help=('(Deprecated) Local docker support is deprecated. '
1035
+ 'To run locally, create a local Kubernetes cluster with '
1036
+ '``sky local up``.'))
1037
+ @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS +
1038
+ _COMMON_OPTIONS)
972
1039
  @click.option(
973
1040
  '--idle-minutes-to-autostop',
974
1041
  '-i',
@@ -1028,38 +1095,45 @@ def cli():
1028
1095
  help=('[Experimental] Clone disk from an existing cluster to launch '
1029
1096
  'a new one. This is useful when the new cluster needs to have '
1030
1097
  'the same data on the boot disk as an existing cluster.'))
1098
+ @click.option(
1099
+ '--fast',
1100
+ is_flag=True,
1101
+ default=False,
1102
+ required=False,
1103
+ help=('[Experimental] If the cluster is already up and available, skip '
1104
+ 'provisioning and setup steps.'))
1031
1105
  @usage_lib.entrypoint
1032
1106
  def launch(
1033
- entrypoint: Tuple[str, ...],
1034
- cluster: Optional[str],
1035
- dryrun: bool,
1036
- detach_setup: bool,
1037
- detach_run: bool,
1038
- backend_name: Optional[str],
1039
- name: Optional[str],
1040
- workdir: Optional[str],
1041
- cloud: Optional[str],
1042
- region: Optional[str],
1043
- zone: Optional[str],
1044
- gpus: Optional[str],
1045
- cpus: Optional[str],
1046
- memory: Optional[str],
1047
- instance_type: Optional[str],
1048
- num_nodes: Optional[int],
1049
- use_spot: Optional[bool],
1050
- image_id: Optional[str],
1051
- env_file: Optional[Dict[str, str]],
1052
- env: List[Tuple[str, str]],
1053
- disk_size: Optional[int],
1054
- disk_tier: Optional[str],
1055
- ports: Tuple[str],
1056
- idle_minutes_to_autostop: Optional[int],
1057
- down: bool, # pylint: disable=redefined-outer-name
1058
- retry_until_up: bool,
1059
- yes: bool,
1060
- no_setup: bool,
1061
- clone_disk_from: Optional[str],
1062
- ):
1107
+ entrypoint: Tuple[str, ...],
1108
+ cluster: Optional[str],
1109
+ dryrun: bool,
1110
+ detach_run: bool,
1111
+ backend_name: Optional[str],
1112
+ name: Optional[str],
1113
+ workdir: Optional[str],
1114
+ cloud: Optional[str],
1115
+ region: Optional[str],
1116
+ zone: Optional[str],
1117
+ gpus: Optional[str],
1118
+ cpus: Optional[str],
1119
+ memory: Optional[str],
1120
+ instance_type: Optional[str],
1121
+ num_nodes: Optional[int],
1122
+ use_spot: Optional[bool],
1123
+ image_id: Optional[str],
1124
+ env_file: Optional[Dict[str, str]],
1125
+ env: List[Tuple[str, str]],
1126
+ disk_size: Optional[int],
1127
+ disk_tier: Optional[str],
1128
+ ports: Tuple[str, ...],
1129
+ idle_minutes_to_autostop: Optional[int],
1130
+ down: bool, # pylint: disable=redefined-outer-name
1131
+ retry_until_up: bool,
1132
+ yes: bool,
1133
+ no_setup: bool,
1134
+ clone_disk_from: Optional[str],
1135
+ fast: bool,
1136
+ async_call: bool):
1063
1137
  """Launch a cluster or task.
1064
1138
 
1065
1139
  If ENTRYPOINT points to a valid YAML file, it is read in as the task
@@ -1069,6 +1143,14 @@ def launch(
1069
1143
  and they undergo job queue scheduling.
1070
1144
  """
1071
1145
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
1146
+ # TODO(zhwu): the current --async is a bit inconsistent with the direct
1147
+ # sky launch, as `sky api logs` does not contain the logs for the actual job
1148
+ # submitted, while the synchronous way of `sky launch` does. We should
1149
+ # consider having the job logs available in `sky api logs` as well.
1150
+ # Reason for not doing it right now: immediately tailing the logs for the
1151
+ # job can take up resources on the API server. When there are a lot of
1152
+ # `launch` submitted asynchronously, the log tailing may overwhelm the API
1153
+ # server, if the jobs are long running.
1072
1154
  env = _merge_env_vars(env_file, env)
1073
1155
  controller_utils.check_cluster_name_not_controller(
1074
1156
  cluster, operation_str='Launching tasks on it')
@@ -1102,6 +1184,11 @@ def launch(
1102
1184
  backend: backends.Backend
1103
1185
  if backend_name == backends.LocalDockerBackend.NAME:
1104
1186
  backend = backends.LocalDockerBackend()
1187
+ click.secho(
1188
+ 'WARNING: LocalDockerBackend is deprecated and will be '
1189
+ 'removed in a future release. To run locally, create a local '
1190
+ 'Kubernetes cluster with `sky local up`.',
1191
+ fg='yellow')
1105
1192
  elif backend_name == backends.CloudVmRayBackend.NAME:
1106
1193
  backend = backends.CloudVmRayBackend()
1107
1194
  else:
@@ -1116,18 +1203,35 @@ def launch(
1116
1203
  f'{colorama.Style.RESET_ALL}{colorama.Style.BRIGHT}sky serve up'
1117
1204
  f'{colorama.Style.RESET_ALL}')
1118
1205
 
1119
- _launch_with_confirm(task,
1120
- backend,
1121
- cluster,
1122
- dryrun=dryrun,
1123
- detach_setup=detach_setup,
1124
- detach_run=detach_run,
1125
- no_confirm=yes,
1126
- idle_minutes_to_autostop=idle_minutes_to_autostop,
1127
- down=down,
1128
- retry_until_up=retry_until_up,
1129
- no_setup=no_setup,
1130
- clone_disk_from=clone_disk_from)
1206
+ request_id = sdk.launch(
1207
+ task,
1208
+ dryrun=dryrun,
1209
+ cluster_name=cluster,
1210
+ backend=backend,
1211
+ idle_minutes_to_autostop=idle_minutes_to_autostop,
1212
+ down=down,
1213
+ retry_until_up=retry_until_up,
1214
+ no_setup=no_setup,
1215
+ clone_disk_from=clone_disk_from,
1216
+ fast=fast,
1217
+ _need_confirmation=not yes,
1218
+ )
1219
+ job_id_handle = _async_call_or_wait(request_id, async_call, 'sky.launch')
1220
+ if not async_call:
1221
+ job_id, handle = job_id_handle
1222
+ if not handle:
1223
+ assert dryrun, 'handle should only be None when dryrun is true'
1224
+ return
1225
+ # Add ssh config for the cluster
1226
+ _get_cluster_records_and_set_ssh_config(
1227
+ clusters=[handle.get_cluster_name()])
1228
+ # job_id will be None if no job was submitted (e.g. no entrypoint
1229
+ # provided)
1230
+ if not detach_run and job_id is not None:
1231
+ sdk.tail_logs(handle.get_cluster_name(), job_id, follow=True)
1232
+ click.secho(
1233
+ ux_utils.command_hint_messages(ux_utils.CommandHintType.CLUSTER_JOB,
1234
+ job_id, handle.get_cluster_name()))
1131
1235
 
1132
1236
 
1133
1237
  @cli.command(cls=_DocumentedCodeCommand)
@@ -1155,32 +1259,19 @@ def launch(
1155
1259
  is_flag=True,
1156
1260
  help=('If True, as soon as a job is submitted, return from this call '
1157
1261
  'and do not stream execution logs.'))
1158
- @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS)
1262
+ @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS +
1263
+ _COMMON_OPTIONS)
1159
1264
  @usage_lib.entrypoint
1160
1265
  # pylint: disable=redefined-builtin
1161
- def exec(
1162
- cluster: Optional[str],
1163
- cluster_option: Optional[str],
1164
- entrypoint: Tuple[str, ...],
1165
- detach_run: bool,
1166
- name: Optional[str],
1167
- cloud: Optional[str],
1168
- region: Optional[str],
1169
- zone: Optional[str],
1170
- workdir: Optional[str],
1171
- gpus: Optional[str],
1172
- ports: Tuple[str],
1173
- instance_type: Optional[str],
1174
- num_nodes: Optional[int],
1175
- use_spot: Optional[bool],
1176
- image_id: Optional[str],
1177
- env_file: Optional[Dict[str, str]],
1178
- env: List[Tuple[str, str]],
1179
- cpus: Optional[str],
1180
- memory: Optional[str],
1181
- disk_size: Optional[int],
1182
- disk_tier: Optional[str],
1183
- ):
1266
+ def exec(cluster: Optional[str], cluster_option: Optional[str],
1267
+ entrypoint: Tuple[str, ...], detach_run: bool, name: Optional[str],
1268
+ cloud: Optional[str], region: Optional[str], zone: Optional[str],
1269
+ workdir: Optional[str], gpus: Optional[str], ports: Tuple[str],
1270
+ instance_type: Optional[str], num_nodes: Optional[int],
1271
+ use_spot: Optional[bool], image_id: Optional[str],
1272
+ env_file: Optional[Dict[str, str]], env: List[Tuple[str, str]],
1273
+ cpus: Optional[str], memory: Optional[str], disk_size: Optional[int],
1274
+ disk_tier: Optional[str], async_call: bool):
1184
1275
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
1185
1276
  """Execute a task or command on an existing cluster.
1186
1277
 
@@ -1253,11 +1344,6 @@ def exec(
1253
1344
  env = _merge_env_vars(env_file, env)
1254
1345
  controller_utils.check_cluster_name_not_controller(
1255
1346
  cluster, operation_str='Executing task on it')
1256
- handle = global_user_state.get_handle_from_cluster_name(cluster)
1257
- if handle is None:
1258
- raise click.BadParameter(f'Cluster {cluster!r} not found. '
1259
- 'Use `sky launch` to provision first.')
1260
- backend = backend_utils.get_backend_from_handle(handle)
1261
1347
 
1262
1348
  task_or_dag = _make_task_or_dag_from_entrypoint_with_overrides(
1263
1349
  entrypoint=entrypoint,
@@ -1285,23 +1371,26 @@ def exec(
1285
1371
  'supports a single task only.')
1286
1372
  task = task_or_dag
1287
1373
 
1288
- click.secho(f'Executing task on cluster {cluster}...', fg='yellow')
1289
- sky.exec(task, backend=backend, cluster_name=cluster, detach_run=detach_run)
1374
+ click.secho('Submitting job to cluster: ', fg='cyan', nl=False)
1375
+ click.secho(cluster)
1376
+ request_id = sdk.exec(task, cluster_name=cluster)
1377
+ job_id_handle = _async_call_or_wait(request_id, async_call, 'sky.exec')
1378
+ if not async_call and not detach_run:
1379
+ job_id, _ = job_id_handle
1380
+ sdk.tail_logs(cluster, job_id, follow=True)
1290
1381
 
1291
1382
 
1292
- def _get_managed_jobs(
1293
- refresh: bool,
1294
- skip_finished: bool,
1383
+ def _handle_jobs_queue_request(
1384
+ request_id: str,
1295
1385
  show_all: bool,
1386
+ show_user: bool,
1296
1387
  limit_num_jobs_to_show: bool = False,
1297
1388
  is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
1298
1389
  """Get the in-progress managed jobs.
1299
1390
 
1300
1391
  Args:
1301
- refresh: Query the latest statuses, restarting the jobs controller if
1302
- stopped.
1303
- skip_finished: Show only in-progress jobs.
1304
1392
  show_all: Show all information of each job (e.g., region, price).
1393
+ show_user: Show the user who submitted the job.
1305
1394
  limit_num_jobs_to_show: If True, limit the number of jobs to show to
1306
1395
  _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS, which is mainly used by
1307
1396
  `sky status`.
@@ -1314,14 +1403,13 @@ def _get_managed_jobs(
1314
1403
  msg contains the error message. Otherwise, msg contains the formatted
1315
1404
  managed job table.
1316
1405
  """
1406
+ # TODO(SKY-980): remove unnecessary fallbacks on the client side.
1317
1407
  num_in_progress_jobs = None
1408
+ msg = ''
1318
1409
  try:
1319
1410
  if not is_called_by_user:
1320
1411
  usage_lib.messages.usage.set_internal()
1321
- with sky_logging.silent():
1322
- # Make the call silent
1323
- managed_jobs_ = managed_jobs.queue(refresh=refresh,
1324
- skip_finished=skip_finished)
1412
+ managed_jobs_ = sdk.get(request_id)
1325
1413
  num_in_progress_jobs = len(set(job['job_id'] for job in managed_jobs_))
1326
1414
  except exceptions.ClusterNotUpError as e:
1327
1415
  controller_status = e.cluster_status
@@ -1334,17 +1422,19 @@ def _get_managed_jobs(
1334
1422
  msg += (f' (See finished managed jobs: {colorama.Style.BRIGHT}'
1335
1423
  f'sky jobs queue --refresh{colorama.Style.RESET_ALL})')
1336
1424
  except RuntimeError as e:
1337
- msg = ''
1338
1425
  try:
1339
1426
  # Check the controller status again, as the RuntimeError is likely
1340
1427
  # due to the controller being autostopped when querying the jobs.
1341
- controller_type = controller_utils.Controllers.JOBS_CONTROLLER
1342
- record = backend_utils.refresh_cluster_record(
1343
- controller_type.value.cluster_name,
1344
- cluster_status_lock_timeout=0)
1345
- if (record is None or
1346
- record['status'] == status_lib.ClusterStatus.STOPPED):
1347
- msg = controller_type.value.default_hint_if_non_existent
1428
+ # Since we are client-side, we may not know the exact name of the
1429
+ # controller, so use the prefix with a wildcard.
1430
+ # Query status of the controller cluster.
1431
+ records = sdk.get(
1432
+ sdk.status(cluster_names=[common.JOB_CONTROLLER_PREFIX + '*'],
1433
+ all_users=True))
1434
+ if (not records or
1435
+ records[0]['status'] == status_lib.ClusterStatus.STOPPED):
1436
+ controller = controller_utils.Controllers.JOBS_CONTROLLER.value
1437
+ msg = controller.default_hint_if_non_existent
1348
1438
  except Exception: # pylint: disable=broad-except
1349
1439
  # This is to an best effort to find the latest controller status to
1350
1440
  # print more helpful message, so we can ignore any exception to
@@ -1357,21 +1447,28 @@ def _get_managed_jobs(
1357
1447
  f'Details: {common_utils.format_exception(e, use_bracket=True)}'
1358
1448
  )
1359
1449
  except Exception as e: # pylint: disable=broad-except
1360
- msg = ('Failed to query managed jobs: '
1361
- f'{common_utils.format_exception(e, use_bracket=True)}')
1450
+ msg = ''
1451
+ if env_options.Options.SHOW_DEBUG_INFO.get():
1452
+ msg += traceback.format_exc()
1453
+ msg += '\n'
1454
+ msg += ('Failed to query managed jobs: '
1455
+ f'{common_utils.format_exception(e, use_bracket=True)}')
1362
1456
  else:
1363
1457
  max_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS
1364
1458
  if limit_num_jobs_to_show else None)
1365
1459
  msg = managed_jobs.format_job_table(managed_jobs_,
1366
1460
  show_all=show_all,
1461
+ show_user=show_user,
1367
1462
  max_jobs=max_jobs_to_show)
1368
1463
  return num_in_progress_jobs, msg
1369
1464
 
1370
1465
 
1371
- def _get_services(service_names: Optional[List[str]],
1372
- show_all: bool,
1373
- show_endpoint: bool,
1374
- is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
1466
+ def _handle_services_request(
1467
+ request_id: str,
1468
+ service_names: Optional[List[str]],
1469
+ show_all: bool,
1470
+ show_endpoint: bool,
1471
+ is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
1375
1472
  """Get service statuses.
1376
1473
 
1377
1474
  Args:
@@ -1390,12 +1487,8 @@ def _get_services(service_names: Optional[List[str]],
1390
1487
  try:
1391
1488
  if not is_called_by_user:
1392
1489
  usage_lib.messages.usage.set_internal()
1393
- with sky_logging.silent():
1394
- if not service_names:
1395
- # Change empty list to None
1396
- service_names = None
1397
- service_records = serve_lib.status(service_names)
1398
- num_services = len(service_records)
1490
+ service_records = sdk.get(request_id)
1491
+ num_services = len(service_records)
1399
1492
  except exceptions.ClusterNotUpError as e:
1400
1493
  controller_status = e.cluster_status
1401
1494
  msg = str(e)
@@ -1408,13 +1501,18 @@ def _get_services(service_names: Optional[List[str]],
1408
1501
  # Check the controller status again, as the RuntimeError is likely
1409
1502
  # due to the controller being autostopped when querying the
1410
1503
  # services.
1411
- controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
1412
- record = backend_utils.refresh_cluster_record(
1413
- controller_type.value.cluster_name,
1414
- cluster_status_lock_timeout=0)
1415
- if (record is None or
1416
- record['status'] == status_lib.ClusterStatus.STOPPED):
1417
- msg = controller_type.value.default_hint_if_non_existent
1504
+ # Since we are client-side, we may not know the exact name of the
1505
+ # controller, so use the prefix with a wildcard.
1506
+ # Query status of the controller cluster.
1507
+ records = sdk.get(
1508
+ sdk.status(
1509
+ cluster_names=[common.SKY_SERVE_CONTROLLER_PREFIX + '*'],
1510
+ all_users=True))
1511
+ if (not records or
1512
+ records[0]['status'] == status_lib.ClusterStatus.STOPPED):
1513
+ controller = (
1514
+ controller_utils.Controllers.SKY_SERVE_CONTROLLER.value)
1515
+ msg = controller.default_hint_if_non_existent
1418
1516
  except Exception: # pylint: disable=broad-except
1419
1517
  # This is to an best effort to find the latest controller status to
1420
1518
  # print more helpful message, so we can ignore any exception to
@@ -1432,12 +1530,13 @@ def _get_services(service_names: Optional[List[str]],
1432
1530
  if len(service_records) != 1:
1433
1531
  plural = 's' if len(service_records) > 1 else ''
1434
1532
  service_num = (str(len(service_records))
1435
- if len(service_records) > 0 else 'No')
1533
+ if service_records else 'No')
1436
1534
  raise click.UsageError(
1437
1535
  f'{service_num} service{plural} found. Please specify '
1438
1536
  'an existing service to show its endpoint. Usage: '
1439
1537
  'sky serve status --endpoint <service-name>')
1440
- msg = serve_lib.get_endpoint(service_records[0])
1538
+ endpoint = service_records[0]['endpoint']
1539
+ msg = '-' if endpoint is None else endpoint
1441
1540
  else:
1442
1541
  msg = serve_lib.format_service_table(service_records, show_all)
1443
1542
  service_not_found_msg = ''
@@ -1452,9 +1551,105 @@ def _get_services(service_names: Optional[List[str]],
1452
1551
  return num_services, msg
1453
1552
 
1454
1553
 
1554
def _status_kubernetes(show_all: bool):
    """Show all SkyPilot resources in the current Kubernetes context.

    Fetches the Kubernetes status through the SDK, prints a header with the
    context name, hands the unmanaged clusters to
    ``status_utils.show_kubernetes_cluster_status_table``, prints the managed
    jobs table when any jobs were returned, and finally prints a hint if a
    SkyServe controller is among the returned clusters.

    Args:
        show_all: Show all job information (e.g., start time, failures).
    """
    all_clusters, unmanaged_clusters, all_jobs, context = (sdk.stream_and_get(
        sdk.status_kubernetes()))
    click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
               f'Kubernetes cluster state (context: {context})'
               f'{colorama.Style.RESET_ALL}')
    status_utils.show_kubernetes_cluster_status_table(unmanaged_clusters,
                                                      show_all)
    if all_jobs:
        click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                   f'Managed jobs'
                   f'{colorama.Style.RESET_ALL}')
        msg = managed_jobs.format_job_table(all_jobs,
                                            show_all=show_all,
                                            show_user=False)
        click.echo(msg)
    # A generator lets any() stop at the first controller it finds instead of
    # materializing a list of booleans for every cluster first.
    if any('sky-serve-controller' in c.cluster_name for c in all_clusters):
        # TODO: Parse serve controllers and show services separately.
        # Currently we show a hint that services are shown as clusters.
        click.echo(f'\n{colorama.Style.DIM}Hint: SkyServe replica pods are '
                   'shown in the "SkyPilot clusters" section.'
                   f'{colorama.Style.RESET_ALL}')
1581
+
1582
+
1583
def _show_endpoint(query_clusters: Optional[List[str]],
                   cluster_records: List[Dict[str, Any]], ip: bool,
                   endpoints: bool, endpoint: Optional[int]) -> None:
    """Print the head IP or the endpoint(s) of exactly one UP cluster.

    Args:
        query_clusters: Cluster names the user asked for; only used to build
            the error message when the number of records is not exactly one.
        cluster_records: Cluster records matching the query. Each record is
            read for its 'status', 'name' and 'handle' keys.
        ip: Whether the IP address was requested (only affects the wording of
            the error message).
        endpoints: Whether all endpoints of the cluster should be printed.
        endpoint: A single port whose endpoint should be printed, or None.

    Raises:
        ValueError: If ``cluster_records`` does not contain exactly one
            record, or the record's handle is not a
            ``backends.CloudVmRayResourceHandle``.
        RuntimeError: If the cluster is not in UP status.
        click.Abort: If the requested endpoint (or any endpoint) cannot be
            found for the cluster.
    """
    show_single_endpoint = endpoint is not None
    show_endpoints = endpoints or show_single_endpoint
    if len(cluster_records) != 1:
        with ux_utils.print_exception_no_traceback():
            plural = 's' if len(cluster_records) > 1 else ''
            if cluster_records:
                cluster_num = str(len(cluster_records))
            else:
                cluster_num = (f'{query_clusters[0]!r}'
                               if query_clusters else 'No')
            verb = 'found' if cluster_records else 'not found'
            cause = 'a single'
            if query_clusters and len(query_clusters) > 1:
                cause = 'an existing'
            raise ValueError(
                _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format(
                    cluster_num=cluster_num,
                    plural=plural,
                    verb=verb,
                    cause=cause,
                    property='IP address' if ip else 'endpoint(s)',
                    flag='ip' if ip else
                    ('endpoint port' if show_single_endpoint else 'endpoints')))

    cluster_record = cluster_records[0]
    if cluster_record['status'] != status_lib.ClusterStatus.UP:
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError(f'Cluster {cluster_record["name"]!r} '
                               'is not in UP status.')
    handle = cluster_record['handle']
    if not isinstance(handle, backends.CloudVmRayResourceHandle):
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Querying IP address is not supported '
                             'for local clusters.')

    # NOTE(review): the head IP is resolved before the endpoint branch even
    # though only the IP path prints it; kept in place to preserve the
    # original evaluation order -- confirm whether it can be deferred.
    head_ip = handle.external_ips()[0]
    # The endpoint request is relatively fast, so we don't add special handling
    # for keyboard interrupt and abort the request to avoid additional latency.
    if show_endpoints:
        # Use the same `is not None` test as the error message above, so that
        # a falsy port (e.g. 0) is still treated as a single-endpoint query
        # instead of silently listing every endpoint.
        if show_single_endpoint:
            request_id = sdk.endpoints(cluster_record['name'], endpoint)
            cluster_endpoints = sdk.stream_and_get(request_id)
            cluster_endpoint = cluster_endpoints.get(str(endpoint), None)
            if not cluster_endpoint:
                message = (f'Endpoint {endpoint} not found for cluster '
                           f'{cluster_record["name"]!r}.')
                # Click reports Abort as a bare "Aborted!" and drops the
                # exception message, so print the reason explicitly first.
                click.secho(message, fg='red', err=True)
                raise click.Abort(message)
            click.echo(cluster_endpoint)
        else:
            request_id = sdk.endpoints(cluster_record['name'])
            cluster_endpoints = sdk.stream_and_get(request_id)
            assert isinstance(cluster_endpoints, dict)
            if not cluster_endpoints:
                message = (f'No endpoint found for cluster '
                           f'{cluster_record["name"]!r}.')
                # See above: Abort's message is not shown by Click.
                click.secho(message, fg='red', err=True)
                raise click.Abort(message)
            for port, port_endpoint in cluster_endpoints.items():
                click.echo(f'{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}'
                           f'{colorama.Style.RESET_ALL}: '
                           f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                           f'{port_endpoint}{colorama.Style.RESET_ALL}')
        return
    click.echo(head_ip)
1648
+
1649
+
1455
1650
  @cli.command()
1456
- @click.option('--all',
1457
- '-a',
1651
+ @click.option('--verbose',
1652
+ '-v',
1458
1653
  default=False,
1459
1654
  is_flag=True,
1460
1655
  required=False,
@@ -1497,16 +1692,32 @@ def _get_services(service_names: Optional[List[str]],
1497
1692
  is_flag=True,
1498
1693
  required=False,
1499
1694
  help='Also show sky serve services, if any.')
1695
+ @click.option(
1696
+ '--kubernetes',
1697
+ '--k8s',
1698
+ default=False,
1699
+ is_flag=True,
1700
+ required=False,
1701
+ help='[Experimental] Show all SkyPilot resources (including from other '
1702
+ 'users) in the current Kubernetes context.')
1500
1703
  @click.argument('clusters',
1501
1704
  required=False,
1502
1705
  type=str,
1503
1706
  nargs=-1,
1504
1707
  **_get_shell_complete_args(_complete_cluster_name))
1708
+ @click.option('--all-users',
1709
+ '-u',
1710
+ default=False,
1711
+ is_flag=True,
1712
+ required=False,
1713
+ help='Show all clusters, including those not owned by the '
1714
+ 'current user.')
1505
1715
  @usage_lib.entrypoint
1506
1716
  # pylint: disable=redefined-builtin
1507
- def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
1717
+ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1508
1718
  endpoint: Optional[int], show_managed_jobs: bool,
1509
- show_services: bool, clusters: List[str]):
1719
+ show_services: bool, kubernetes: bool, clusters: List[str],
1720
+ all_users: bool):
1510
1721
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
1511
1722
  """Show clusters.
1512
1723
 
@@ -1521,11 +1732,15 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
1521
1732
  ``sky status --endpoints mycluster``. To query a single endpoint, you
1522
1733
  can use ``sky status mycluster --endpoint 8888``.
1523
1734
 
1735
+ Running `sky status` will update the ssh config for the clusters locally, so
1736
+ that you can directly ssh into the clusters or connect to the clusters with
1737
+ vscode.
1738
+
1524
1739
  The following fields for each cluster are recorded: cluster name, time
1525
1740
  since last launch, resources, region, zone, hourly price, status, autostop,
1526
1741
  command.
1527
1742
 
1528
- Display all fields using ``sky status -a``.
1743
+ Display all fields using ``sky status -v``.
1529
1744
 
1530
1745
  Each cluster can have one of the following statuses:
1531
1746
 
@@ -1565,243 +1780,163 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
1565
1780
  or for autostop-enabled clusters, use ``--refresh`` to query the latest
1566
1781
  cluster statuses from the cloud providers.
1567
1782
  """
1568
- # Using a pool with 2 worker to run the managed job query and sky serve
1569
- # service query in parallel to speed up. The pool provides a AsyncResult
1570
- # object that can be used as a future.
1571
- with multiprocessing.Pool(2) as pool:
1572
- # Do not show job queue if user specifies clusters, and if user
1573
- # specifies --ip or --endpoint(s).
1574
- show_managed_jobs = show_managed_jobs and not any(
1575
- [clusters, ip, endpoints])
1576
- show_endpoints = endpoints or endpoint is not None
1577
- show_single_endpoint = endpoint is not None
1578
- if show_managed_jobs:
1579
- # Run managed job query in parallel to speed up the status query.
1580
- managed_jobs_future = pool.apply_async(
1581
- _get_managed_jobs,
1582
- kwds=dict(refresh=False,
1583
- skip_finished=True,
1584
- show_all=False,
1585
- limit_num_jobs_to_show=not all,
1586
- is_called_by_user=False))
1587
-
1588
- show_services = show_services and not clusters and not ip
1589
- if show_services:
1590
- # Run the sky serve service query in parallel to speed up the
1591
- # status query.
1592
- services_future = pool.apply_async(_get_services,
1593
- kwds=dict(
1594
- service_names=None,
1595
- show_all=False,
1596
- show_endpoint=False,
1597
- is_called_by_user=False))
1598
- if ip or show_endpoints:
1599
- if refresh:
1600
- raise click.UsageError(
1601
- 'Using --ip or --endpoint(s) with --refresh is not'
1602
- 'supported for now. To fix, refresh first, '
1603
- 'then query the IP or endpoint.')
1783
+ if kubernetes:
1784
+ _status_kubernetes(verbose)
1785
+ return
1786
+ # Do not show job queue if user specifies clusters, and if user
1787
+ # specifies --ip or --endpoint(s).
1788
+ show_managed_jobs = show_managed_jobs and not any([clusters, ip, endpoints])
1789
+ if show_managed_jobs:
1790
+ managed_jobs_queue_request_id = managed_jobs.queue(refresh=False,
1791
+ skip_finished=True,
1792
+ all_users=all_users)
1793
+ show_endpoints = endpoints or endpoint is not None
1794
+ show_single_endpoint = endpoint is not None
1795
+ show_services = show_services and not any([clusters, ip, endpoints])
1796
+ if show_services:
1797
+ # Run the sky serve service query in parallel to speed up the
1798
+ # status query.
1799
+ service_status_request_id = serve_lib.status(service_names=None)
1800
+
1801
+ if ip or show_endpoints:
1802
+ if refresh:
1803
+ raise click.UsageError(
1804
+ 'Using --ip or --endpoint(s) with --refresh is not'
1805
+ 'supported for now. To fix, refresh first, '
1806
+ 'then query the IP or endpoint.')
1604
1807
 
1605
- if ip and show_endpoints:
1606
- with ux_utils.print_exception_no_traceback():
1607
- raise ValueError(
1608
- 'Cannot specify both --ip and --endpoint(s) '
1609
- 'at the same time.')
1808
+ if ip and show_endpoints:
1809
+ with ux_utils.print_exception_no_traceback():
1810
+ raise ValueError('Cannot specify both --ip and --endpoint(s) '
1811
+ 'at the same time.')
1610
1812
 
1611
- if endpoint is not None and endpoints:
1612
- with ux_utils.print_exception_no_traceback():
1613
- raise ValueError(
1614
- 'Cannot specify both --endpoint and --endpoints '
1615
- 'at the same time.')
1813
+ if endpoint is not None and endpoints:
1814
+ with ux_utils.print_exception_no_traceback():
1815
+ raise ValueError(
1816
+ 'Cannot specify both --endpoint and --endpoints '
1817
+ 'at the same time.')
1616
1818
 
1617
- if len(clusters) != 1:
1618
- with ux_utils.print_exception_no_traceback():
1619
- plural = 's' if len(clusters) > 1 else ''
1620
- cluster_num = (str(len(clusters))
1621
- if len(clusters) > 0 else 'No')
1622
- cause = 'a single' if len(clusters) > 1 else 'an existing'
1623
- raise ValueError(
1624
- _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format(
1625
- cluster_num=cluster_num,
1626
- plural=plural,
1627
- verb='specified',
1628
- cause=cause,
1629
- property='IP address' if ip else 'endpoint(s)',
1630
- flag='ip' if ip else
1631
- ('endpoint port'
1632
- if show_single_endpoint else 'endpoints')))
1819
+ if len(clusters) != 1:
1820
+ with ux_utils.print_exception_no_traceback():
1821
+ plural = 's' if len(clusters) > 1 else ''
1822
+ cluster_num = (str(len(clusters)) if clusters else 'No')
1823
+ cause = 'a single' if len(clusters) > 1 else 'an existing'
1824
+ raise ValueError(
1825
+ _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format(
1826
+ cluster_num=cluster_num,
1827
+ plural=plural,
1828
+ verb='specified',
1829
+ cause=cause,
1830
+ property='IP address' if ip else 'endpoint(s)',
1831
+ flag='ip' if ip else
1832
+ ('endpoint port'
1833
+ if show_single_endpoint else 'endpoints')))
1834
+ else:
1835
+ click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters'
1836
+ f'{colorama.Style.RESET_ALL}')
1837
+ query_clusters: Optional[List[str]] = None if not clusters else clusters
1838
+ refresh_mode = common.StatusRefreshMode.NONE
1839
+ if refresh:
1840
+ refresh_mode = common.StatusRefreshMode.FORCE
1841
+ cluster_records = _get_cluster_records_and_set_ssh_config(
1842
+ query_clusters, refresh_mode, all_users)
1843
+
1844
+ # TOOD(zhwu): setup the ssh config for status
1845
+ if ip or show_endpoints:
1846
+ _show_endpoint(query_clusters, cluster_records, ip, endpoints, endpoint)
1847
+ return
1848
+ hints = []
1849
+ normal_clusters = []
1850
+ controllers = []
1851
+ for cluster_record in cluster_records:
1852
+ cluster_name = cluster_record['name']
1853
+ controller = controller_utils.Controllers.from_name(cluster_name)
1854
+ if controller is not None:
1855
+ controllers.append(cluster_record)
1633
1856
  else:
1634
- click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters'
1635
- f'{colorama.Style.RESET_ALL}')
1636
- query_clusters: Optional[List[str]] = None
1637
- if clusters:
1638
- query_clusters = _get_glob_clusters(clusters, silent=ip)
1639
- cluster_records = core.status(cluster_names=query_clusters,
1640
- refresh=refresh)
1641
- if ip or show_endpoints:
1642
- if len(cluster_records) != 1:
1643
- with ux_utils.print_exception_no_traceback():
1644
- plural = 's' if len(cluster_records) > 1 else ''
1645
- cluster_num = (str(len(cluster_records))
1646
- if len(cluster_records) > 0 else
1647
- f'{clusters[0]!r}')
1648
- verb = 'found' if len(cluster_records) > 0 else 'not found'
1649
- cause = 'a single' if len(clusters) > 1 else 'an existing'
1650
- raise ValueError(
1651
- _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format(
1652
- cluster_num=cluster_num,
1653
- plural=plural,
1654
- verb=verb,
1655
- cause=cause,
1656
- property='IP address' if ip else 'endpoint(s)',
1657
- flag='ip' if ip else
1658
- ('endpoint port'
1659
- if show_single_endpoint else 'endpoints')))
1660
-
1661
- cluster_record = cluster_records[0]
1662
- if cluster_record['status'] != status_lib.ClusterStatus.UP:
1663
- with ux_utils.print_exception_no_traceback():
1664
- raise RuntimeError(f'Cluster {cluster_record["name"]!r} '
1665
- 'is not in UP status.')
1666
- handle = cluster_record['handle']
1667
- if not isinstance(handle, backends.CloudVmRayResourceHandle):
1668
- with ux_utils.print_exception_no_traceback():
1669
- raise ValueError('Querying IP address is not supported '
1670
- 'for local clusters.')
1671
-
1672
- head_ip = handle.external_ips()[0]
1673
- if show_endpoints:
1674
- if endpoint:
1675
- cluster_endpoint = core.endpoints(cluster_record['name'],
1676
- endpoint).get(
1677
- endpoint, None)
1678
- if not cluster_endpoint:
1679
- raise click.Abort(
1680
- f'Endpoint {endpoint} not found for cluster '
1681
- f'{cluster_record["name"]!r}.')
1682
- click.echo(cluster_endpoint)
1683
- else:
1684
- cluster_endpoints = core.endpoints(cluster_record['name'])
1685
- assert isinstance(cluster_endpoints, dict)
1686
- if not cluster_endpoints:
1687
- raise click.Abort(f'No endpoint found for cluster '
1688
- f'{cluster_record["name"]!r}.')
1689
- for port, port_endpoint in cluster_endpoints.items():
1690
- click.echo(
1691
- f'{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}'
1692
- f'{colorama.Style.RESET_ALL}: '
1693
- f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1694
- f'{port_endpoint}{colorama.Style.RESET_ALL}')
1695
- return
1696
- click.echo(head_ip)
1697
- return
1698
- hints = []
1699
- normal_clusters = []
1700
- controllers = []
1701
- for cluster_record in cluster_records:
1702
- cluster_name = cluster_record['name']
1703
- controller = controller_utils.Controllers.from_name(cluster_name)
1704
- if controller is not None:
1705
- controllers.append(cluster_record)
1706
- else:
1707
- normal_clusters.append(cluster_record)
1857
+ normal_clusters.append(cluster_record)
1708
1858
 
1709
- num_pending_autostop = 0
1710
- num_pending_autostop += status_utils.show_status_table(
1711
- normal_clusters + controllers, all)
1859
+ num_pending_autostop = 0
1860
+ num_pending_autostop += status_utils.show_status_table(
1861
+ normal_clusters + controllers, verbose, all_users, query_clusters)
1712
1862
 
1713
- def _try_get_future_result(future) -> Tuple[bool, Any]:
1714
- result = None
1715
- interrupted = False
1863
+ managed_jobs_query_interrupted = False
1864
+ if show_managed_jobs:
1865
+ click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1866
+ f'Managed jobs{colorama.Style.RESET_ALL}')
1867
+ with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
1716
1868
  try:
1717
- result = future.get()
1869
+ num_in_progress_jobs, msg = _handle_jobs_queue_request(
1870
+ managed_jobs_queue_request_id,
1871
+ show_all=False,
1872
+ show_user=False,
1873
+ limit_num_jobs_to_show=not all,
1874
+ is_called_by_user=False)
1718
1875
  except KeyboardInterrupt:
1719
- pool.terminate()
1720
- interrupted = True
1721
- return interrupted, result
1722
-
1723
- managed_jobs_query_interrupted = False
1724
- if show_managed_jobs:
1725
- click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1726
- f'Managed jobs{colorama.Style.RESET_ALL}')
1727
- with rich_utils.safe_status('[cyan]Checking managed jobs[/]'):
1728
- managed_jobs_query_interrupted, result = _try_get_future_result(
1729
- managed_jobs_future)
1730
- if managed_jobs_query_interrupted:
1731
- # Set to -1, so that the controller is not considered
1732
- # down, and the hint for showing sky jobs queue
1733
- # will still be shown.
1734
- num_in_progress_jobs = -1
1735
- msg = 'KeyboardInterrupt'
1736
- else:
1737
- num_in_progress_jobs, msg = result
1738
-
1739
- click.echo(msg)
1740
- if num_in_progress_jobs is not None:
1741
- # jobs controller is UP.
1742
- job_info = ''
1743
- if num_in_progress_jobs > 0:
1744
- plural_and_verb = ' is'
1745
- if num_in_progress_jobs > 1:
1746
- plural_and_verb = 's are'
1747
- job_info = (
1748
- f'{num_in_progress_jobs} managed job{plural_and_verb} '
1749
- 'in progress')
1750
- if (num_in_progress_jobs >
1751
- _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS):
1752
- job_info += (
1753
- f' ({_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS} latest '
1754
- 'ones shown)')
1755
- job_info += '. '
1756
- hints.append(
1757
- controller_utils.Controllers.JOBS_CONTROLLER.value.
1758
- in_progress_hint.format(job_info=job_info))
1759
-
1760
- if show_services:
1761
- click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1762
- f'Services{colorama.Style.RESET_ALL}')
1763
- num_services = None
1764
- if managed_jobs_query_interrupted:
1765
- # The pool is terminated, so we cannot run the service query.
1876
+ sdk.api_cancel(managed_jobs_queue_request_id, silent=True)
1877
+ managed_jobs_query_interrupted = True
1878
+ # Set to -1, so that the controller is not considered
1879
+ # down, and the hint for showing sky jobs queue
1880
+ # will still be shown.
1881
+ num_in_progress_jobs = -1
1766
1882
  msg = 'KeyboardInterrupt'
1767
- else:
1768
- with rich_utils.safe_status('[cyan]Checking services[/]'):
1769
- interrupted, result = _try_get_future_result(
1770
- services_future)
1771
- if interrupted:
1772
- num_services = -1
1773
- msg = 'KeyboardInterrupt'
1774
- else:
1775
- num_services, msg = result
1776
- click.echo(msg)
1777
- if num_services is not None:
1778
- hints.append(controller_utils.Controllers.SKY_SERVE_CONTROLLER.
1779
- value.in_progress_hint)
1780
1883
 
1781
- if show_managed_jobs or show_services:
1782
- try:
1783
- pool.close()
1784
- pool.join()
1785
- except SystemExit as e:
1786
- # This is to avoid a "Exception ignored" problem caused by
1787
- # ray worker setting the sigterm handler to sys.exit(15)
1788
- # (see ray/_private/worker.py).
1789
- # TODO (zhwu): Remove any importing of ray in SkyPilot.
1790
- if e.code != 15:
1791
- raise
1792
-
1793
- if num_pending_autostop > 0 and not refresh:
1794
- # Don't print this hint if there's no pending autostop or user has
1795
- # already passed --refresh.
1796
- plural_and_verb = ' has'
1797
- if num_pending_autostop > 1:
1798
- plural_and_verb = 's have'
1799
- hints.append(f'* {num_pending_autostop} cluster{plural_and_verb} '
1800
- 'auto{stop,down} scheduled. Refresh statuses with: '
1801
- f'{colorama.Style.BRIGHT}sky status --refresh'
1802
- f'{colorama.Style.RESET_ALL}')
1803
- if hints:
1804
- click.echo('\n' + '\n'.join(hints))
1884
+ click.echo(msg)
1885
+ if num_in_progress_jobs is not None:
1886
+ # jobs controller is UP.
1887
+ job_info = ''
1888
+ if num_in_progress_jobs > 0:
1889
+ plural_and_verb = ' is'
1890
+ if num_in_progress_jobs > 1:
1891
+ plural_and_verb = 's are'
1892
+ job_info = (
1893
+ f'{num_in_progress_jobs} managed job{plural_and_verb} '
1894
+ 'in progress')
1895
+ if num_in_progress_jobs > _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS:
1896
+ job_info += (
1897
+ f' ({_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS} latest '
1898
+ 'ones shown)')
1899
+ job_info += '. '
1900
+ hints.append(
1901
+ controller_utils.Controllers.JOBS_CONTROLLER.value.
1902
+ in_progress_hint.format(job_info=job_info))
1903
+
1904
+ if show_services:
1905
+ click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1906
+ f'Services{colorama.Style.RESET_ALL}')
1907
+ num_services = None
1908
+ if managed_jobs_query_interrupted:
1909
+ msg = 'KeyboardInterrupt'
1910
+ else:
1911
+ with rich_utils.client_status('[cyan]Checking services[/]'):
1912
+ try:
1913
+ num_services, msg = _handle_services_request(
1914
+ service_status_request_id,
1915
+ service_names=None,
1916
+ show_all=False,
1917
+ show_endpoint=False,
1918
+ is_called_by_user=False)
1919
+ except KeyboardInterrupt:
1920
+ sdk.api_cancel(service_status_request_id, silent=True)
1921
+ num_services = -1
1922
+ msg = 'KeyboardInterrupt'
1923
+ click.echo(msg)
1924
+ if num_services is not None:
1925
+ hints.append(controller_utils.Controllers.SKY_SERVE_CONTROLLER.
1926
+ value.in_progress_hint)
1927
+
1928
+ if num_pending_autostop > 0 and not refresh:
1929
+ # Don't print this hint if there's no pending autostop or user has
1930
+ # already passed --refresh.
1931
+ plural_and_verb = ' has'
1932
+ if num_pending_autostop > 1:
1933
+ plural_and_verb = 's have'
1934
+ hints.append(f'* {num_pending_autostop} cluster{plural_and_verb} '
1935
+ 'auto{stop,down} scheduled. Refresh statuses with: '
1936
+ f'{colorama.Style.BRIGHT}sky status --refresh'
1937
+ f'{colorama.Style.RESET_ALL}')
1938
+ if hints:
1939
+ click.echo('\n' + '\n'.join(hints))
1805
1940
 
1806
1941
 
1807
1942
  @cli.command()
@@ -1810,7 +1945,7 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
1810
1945
  default=False,
1811
1946
  is_flag=True,
1812
1947
  required=False,
1813
- help='Show all information in full.')
1948
+ help='Show all cluster information.')
1814
1949
  @usage_lib.entrypoint
1815
1950
  def cost_report(all: bool): # pylint: disable=redefined-builtin
1816
1951
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
@@ -1831,7 +1966,7 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin
1831
1966
 
1832
1967
  - Clusters that were terminated/stopped on the cloud console.
1833
1968
  """
1834
- cluster_records = core.cost_report()
1969
+ cluster_records = sdk.get(sdk.cost_report())
1835
1970
 
1836
1971
  normal_cluster_records = []
1837
1972
  controllers = dict()
@@ -1876,7 +2011,7 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin
1876
2011
 
1877
2012
  @cli.command()
1878
2013
  @click.option('--all-users',
1879
- '-a',
2014
+ '-u',
1880
2015
  default=False,
1881
2016
  is_flag=True,
1882
2017
  required=False,
@@ -1896,18 +2031,21 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin
1896
2031
  def queue(clusters: List[str], skip_finished: bool, all_users: bool):
1897
2032
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
1898
2033
  """Show the job queue for cluster(s)."""
1899
- click.secho('Fetching and parsing job queue...', fg='yellow')
1900
- if clusters:
1901
- clusters = _get_glob_clusters(clusters)
1902
- else:
1903
- cluster_infos = global_user_state.get_clusters()
1904
- clusters = [c['name'] for c in cluster_infos]
2034
+ click.secho('Fetching and parsing job queue...', fg='cyan')
2035
+ if not clusters:
2036
+ cluster_records = _get_cluster_records_and_set_ssh_config(
2037
+ None, all_users=all_users)
2038
+ clusters = [cluster['name'] for cluster in cluster_records]
1905
2039
 
1906
2040
  unsupported_clusters = []
1907
- for cluster in clusters:
2041
+ logger.info(f'Fetching job queue for: {", ".join(clusters)}')
2042
+ job_tables = {}
2043
+
2044
+ def _get_job_queue(cluster):
1908
2045
  try:
1909
- job_table = core.queue(cluster, skip_finished, all_users)
1910
- except (exceptions.CommandError, ValueError,
2046
+ job_table = sdk.stream_and_get(
2047
+ sdk.queue(cluster, skip_finished, all_users))
2048
+ except (RuntimeError, exceptions.CommandError, ValueError,
1911
2049
  exceptions.NotSupportedError, exceptions.ClusterNotUpError,
1912
2050
  exceptions.CloudUserIdentityError,
1913
2051
  exceptions.ClusterOwnerIdentityMismatchError) as e:
@@ -1916,9 +2054,14 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
1916
2054
  click.echo(f'{colorama.Fore.YELLOW}Failed to get the job queue for '
1917
2055
  f'cluster {cluster!r}.{colorama.Style.RESET_ALL}\n'
1918
2056
  f' {common_utils.format_exception(e)}')
1919
- continue
1920
- job_table = job_lib.format_job_queue(job_table)
1921
- click.echo(f'\nJob queue of cluster {cluster}\n{job_table}')
2057
+ return
2058
+ job_tables[cluster] = job_lib.format_job_queue(job_table)
2059
+
2060
+ subprocess_utils.run_in_parallel(_get_job_queue, clusters)
2061
+ user_str = 'all users' if all_users else 'current user'
2062
+ for cluster, job_table in job_tables.items():
2063
+ click.echo(f'\nJob queue of {user_str} on cluster {cluster}\n'
2064
+ f'{job_table}')
1922
2065
 
1923
2066
  if unsupported_clusters:
1924
2067
  click.secho(
@@ -1948,6 +2091,12 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
1948
2091
  help=('Follow the logs of a job. '
1949
2092
  'If --no-follow is specified, print the log so far and exit. '
1950
2093
  '[default: --follow]'))
2094
+ @click.option(
2095
+ '--tail',
2096
+ default=0,
2097
+ type=int,
2098
+ help=('The number of lines to display from the end of the log file. '
2099
+ 'Default is 0, which means print all lines.'))
1951
2100
  @click.argument('cluster',
1952
2101
  required=True,
1953
2102
  type=str,
@@ -1961,6 +2110,7 @@ def logs(
1961
2110
  sync_down: bool,
1962
2111
  status: bool, # pylint: disable=redefined-outer-name
1963
2112
  follow: bool,
2113
+ tail: int,
1964
2114
  ):
1965
2115
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
1966
2116
  """Tail the log of a job.
@@ -1991,25 +2141,34 @@ def logs(
1991
2141
  job_ids = None if not job_ids else job_ids
1992
2142
 
1993
2143
  if sync_down:
1994
- core.download_logs(cluster, job_ids)
2144
+ with rich_utils.client_status(
2145
+ ux_utils.spinner_message('Downloading logs')):
2146
+ log_local_path_dict = sdk.download_logs(cluster, job_ids)
2147
+ style = colorama.Style
2148
+ fore = colorama.Fore
2149
+ for job, log_local_path in log_local_path_dict.items():
2150
+ logger.info(f'{fore.CYAN}Job {job} logs: {log_local_path}'
2151
+ f'{style.RESET_ALL}')
1995
2152
  return
1996
2153
 
1997
2154
  assert job_ids is None or len(job_ids) <= 1, job_ids
1998
- job_id = None
2155
+ job_id: Optional[int] = None
1999
2156
  job_ids_to_query: Optional[List[int]] = None
2000
2157
  if job_ids:
2001
2158
  # Already check that len(job_ids) <= 1. This variable is used later
2002
- # in core.tail_logs.
2003
- job_id = job_ids[0]
2004
- if not job_id.isdigit():
2005
- raise click.UsageError(f'Invalid job ID {job_id}. '
2159
+ # in sdk.tail_logs.
2160
+ cur_job_id = job_ids[0]
2161
+ if not cur_job_id.isdigit():
2162
+ raise click.UsageError(f'Invalid job ID {cur_job_id}. '
2006
2163
  'Job ID must be integers.')
2007
- job_ids_to_query = [int(job_id)]
2164
+ job_id = int(cur_job_id)
2165
+ job_ids_to_query = [int(job_ids[0])]
2008
2166
  else:
2009
2167
  # job_ids is either None or empty list, so it is safe to cast it here.
2010
2168
  job_ids_to_query = typing.cast(Optional[List[int]], job_ids)
2011
2169
  if status:
2012
- job_statuses = core.job_status(cluster, job_ids_to_query)
2170
+ job_statuses = sdk.stream_and_get(
2171
+ sdk.job_status(cluster, job_ids_to_query))
2013
2172
  job_id = list(job_statuses.keys())[0]
2014
2173
  # If job_ids is None and no job has been submitted to the cluster,
2015
2174
  # it will return {None: None}.
@@ -2027,7 +2186,15 @@ def logs(
2027
2186
  click.secho(f'Job {id_str}not found', fg='red')
2028
2187
  sys.exit(1)
2029
2188
 
2030
- core.tail_logs(cluster, job_id, follow)
2189
+ job_str = f'job {job_id}'
2190
+ if job_id is None:
2191
+ job_str = 'the last job'
2192
+ logger.info(f'{colorama.Fore.YELLOW}'
2193
+ f'Tailing logs of {job_str} on cluster {cluster!r}...'
2194
+ f'{colorama.Style.RESET_ALL}')
2195
+
2196
+ # Stream logs from the server.
2197
+ sdk.tail_logs(cluster, job_id, follow, tail=tail)
2031
2198
 
2032
2199
 
2033
2200
  @cli.command()
@@ -2040,16 +2207,31 @@ def logs(
2040
2207
  default=False,
2041
2208
  is_flag=True,
2042
2209
  required=False,
2043
- help='Cancel all jobs on the specified cluster.')
2210
+ help='Cancel all jobs from current user on the specified cluster.'
2211
+ )
2212
+ @click.option('--all-users',
2213
+ '-u',
2214
+ default=False,
2215
+ is_flag=True,
2216
+ required=False,
2217
+ help='Cancel all jobs on the specified cluster for all users.')
2044
2218
  @click.option('--yes',
2045
2219
  '-y',
2046
2220
  is_flag=True,
2047
2221
  default=False,
2048
2222
  required=False,
2049
2223
  help='Skip confirmation prompt.')
2224
+ @_add_click_options(_COMMON_OPTIONS)
2050
2225
  @click.argument('jobs', required=False, type=int, nargs=-1)
2051
2226
  @usage_lib.entrypoint
2052
- def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disable=redefined-builtin, redefined-outer-name
2227
+ def cancel(
2228
+ cluster: str,
2229
+ all: bool, # pylint: disable=redefined-builtin
2230
+ all_users: bool,
2231
+ jobs: List[int], # pylint: disable=redefined-outer-name
2232
+ yes: bool,
2233
+ async_call: bool,
2234
+ ): # pylint: disable=redefined-builtin
2053
2235
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
2054
2236
  """Cancel job(s).
2055
2237
 
@@ -2062,30 +2244,36 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa
2062
2244
  sky cancel cluster_name 1
2063
2245
  sky cancel cluster_name 1 2 3
2064
2246
  \b
2065
- # Cancel all jobs on a cluster.
2247
+ # Cancel all your jobs on a cluster.
2066
2248
  sky cancel cluster_name -a
2067
2249
  \b
2250
+ # Cancel all users' jobs on a cluster.
2251
+ sky cancel cluster_name -u
2252
+ \b
2068
2253
  # Cancel the latest running job on a cluster.
2069
2254
  sky cancel cluster_name
2070
2255
 
2071
2256
  Job IDs can be looked up by ``sky queue cluster_name``.
2072
2257
  """
2073
- job_identity_str = None
2258
+ job_identity_str = ''
2074
2259
  job_ids_to_cancel = None
2075
- if not jobs and not all:
2076
- click.echo(f'{colorama.Fore.YELLOW}No job IDs or --all provided; '
2077
- 'cancelling the latest running job.'
2078
- f'{colorama.Style.RESET_ALL}')
2260
+ if not jobs and not all and not all_users:
2261
+ click.echo(
2262
+ f'{colorama.Fore.YELLOW}No job IDs or --all/--all-users provided; '
2263
+ 'cancelling the latest running job.'
2264
+ f'{colorama.Style.RESET_ALL}')
2079
2265
  job_identity_str = 'the latest running job'
2266
+ elif all_users:
2267
+ job_identity_str = 'all users\' jobs'
2080
2268
  else:
2081
- # Cancelling specific jobs or --all.
2082
- job_ids = ' '.join(map(str, jobs))
2083
- plural = 's' if len(job_ids) > 1 else ''
2084
- job_identity_str = f'job{plural} {job_ids}'
2085
- job_ids_to_cancel = jobs
2086
2269
  if all:
2087
- job_identity_str = 'all jobs'
2088
- job_ids_to_cancel = None
2270
+ job_identity_str = 'all your jobs'
2271
+ if jobs:
2272
+ jobs_str = ' '.join(map(str, jobs))
2273
+ plural = 's' if len(jobs) > 1 else ''
2274
+ connector = ' and ' if job_identity_str else ''
2275
+ job_identity_str += f'{connector}job{plural} {jobs_str}'
2276
+ job_ids_to_cancel = jobs
2089
2277
  job_identity_str += f' on cluster {cluster!r}'
2090
2278
 
2091
2279
  if not yes:
@@ -2095,7 +2283,11 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa
2095
2283
  show_default=True)
2096
2284
 
2097
2285
  try:
2098
- core.cancel(cluster, all=all, job_ids=job_ids_to_cancel)
2286
+ request_id = sdk.cancel(cluster,
2287
+ all=all,
2288
+ all_users=all_users,
2289
+ job_ids=job_ids_to_cancel)
2290
+ _async_call_or_wait(request_id, async_call, 'sky.cancel')
2099
2291
  except exceptions.NotSupportedError as e:
2100
2292
  controller = controller_utils.Controllers.from_name(cluster)
2101
2293
  assert controller is not None, cluster
@@ -2115,20 +2307,28 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa
2115
2307
  **_get_shell_complete_args(_complete_cluster_name))
2116
2308
  @click.option('--all',
2117
2309
  '-a',
2118
- default=None,
2310
+ default=False,
2119
2311
  is_flag=True,
2120
2312
  help='Stop all existing clusters.')
2313
+ @click.option('--all-users',
2314
+ '-u',
2315
+ default=False,
2316
+ is_flag=True,
2317
+ help='Stop all existing clusters for all users.')
2121
2318
  @click.option('--yes',
2122
2319
  '-y',
2123
2320
  is_flag=True,
2124
2321
  default=False,
2125
2322
  required=False,
2126
2323
  help='Skip confirmation prompt.')
2324
+ @_add_click_options(_COMMON_OPTIONS)
2127
2325
  @usage_lib.entrypoint
2128
2326
  def stop(
2129
2327
  clusters: List[str],
2130
- all: Optional[bool], # pylint: disable=redefined-builtin
2328
+ all: bool, # pylint: disable=redefined-builtin
2329
+ all_users: bool,
2131
2330
  yes: bool,
2331
+ async_call: bool,
2132
2332
  ):
2133
2333
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
2134
2334
  """Stop cluster(s).
@@ -2161,8 +2361,10 @@ def stop(
2161
2361
  """
2162
2362
  _down_or_stop_clusters(clusters,
2163
2363
  apply_to_all=all,
2364
+ all_users=all_users,
2164
2365
  down=False,
2165
- no_confirm=yes)
2366
+ no_confirm=yes,
2367
+ async_call=async_call)
2166
2368
 
2167
2369
 
2168
2370
  @cli.command(cls=_DocumentedCodeCommand)
@@ -2172,9 +2374,14 @@ def stop(
2172
2374
  **_get_shell_complete_args(_complete_cluster_name))
2173
2375
  @click.option('--all',
2174
2376
  '-a',
2175
- default=None,
2377
+ default=False,
2378
+ is_flag=True,
2379
+ help='Autostop all existing clusters.')
2380
+ @click.option('--all-users',
2381
+ '-u',
2382
+ default=False,
2176
2383
  is_flag=True,
2177
- help='Apply this command to all existing clusters.')
2384
+ help='Autostop all existing clusters for all users.')
2178
2385
  @click.option('--idle-minutes',
2179
2386
  '-i',
2180
2387
  type=int,
@@ -2202,14 +2409,17 @@ def stop(
2202
2409
  default=False,
2203
2410
  required=False,
2204
2411
  help='Skip confirmation prompt.')
2412
+ @_add_click_options(_COMMON_OPTIONS)
2205
2413
  @usage_lib.entrypoint
2206
2414
  def autostop(
2207
2415
  clusters: List[str],
2208
- all: Optional[bool], # pylint: disable=redefined-builtin
2416
+ all: bool, # pylint: disable=redefined-builtin
2417
+ all_users: bool,
2209
2418
  idle_minutes: Optional[int],
2210
2419
  cancel: bool, # pylint: disable=redefined-outer-name
2211
2420
  down: bool, # pylint: disable=redefined-outer-name
2212
2421
  yes: bool,
2422
+ async_call: bool,
2213
2423
  ):
2214
2424
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
2215
2425
  """Schedule an autostop or autodown for cluster(s).
@@ -2262,9 +2472,11 @@ def autostop(
2262
2472
  idle_minutes = 5
2263
2473
  _down_or_stop_clusters(clusters,
2264
2474
  apply_to_all=all,
2475
+ all_users=all_users,
2265
2476
  down=down,
2266
2477
  no_confirm=yes,
2267
- idle_minutes_to_autostop=idle_minutes)
2478
+ idle_minutes_to_autostop=idle_minutes,
2479
+ async_call=async_call)
2268
2480
 
2269
2481
 
2270
2482
  @cli.command(cls=_DocumentedCodeCommand)
@@ -2327,16 +2539,19 @@ def autostop(
2327
2539
  required=False,
2328
2540
  help=('Force start the cluster even if it is already UP. Useful for '
2329
2541
  'upgrading the SkyPilot runtime on the cluster.'))
2542
+ @_add_click_options(_COMMON_OPTIONS)
2330
2543
  @usage_lib.entrypoint
2331
2544
  # pylint: disable=redefined-builtin
2332
2545
  def start(
2333
- clusters: List[str],
2334
- all: bool,
2335
- yes: bool,
2336
- idle_minutes_to_autostop: Optional[int],
2337
- down: bool, # pylint: disable=redefined-outer-name
2338
- retry_until_up: bool,
2339
- force: bool):
2546
+ clusters: List[str],
2547
+ all: bool,
2548
+ yes: bool,
2549
+ idle_minutes_to_autostop: Optional[int],
2550
+ down: bool, # pylint: disable=redefined-outer-name
2551
+ retry_until_up: bool,
2552
+ force: bool,
2553
+ async_call: bool,
2554
+ ):
2340
2555
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
2341
2556
  """Restart cluster(s).
2342
2557
 
@@ -2370,40 +2585,45 @@ def start(
2370
2585
  '--idle-minutes-to-autostop must be set if --down is set.')
2371
2586
  to_start = []
2372
2587
 
2588
+ cluster_records = None
2373
2589
  if not clusters and not all:
2374
2590
  # UX: frequently users may have only 1 cluster. In this case, be smart
2375
2591
  # and default to that unique choice.
2376
- all_cluster_names = global_user_state.get_cluster_names_start_with('')
2377
- if len(all_cluster_names) <= 1:
2378
- clusters = all_cluster_names
2592
+ all_clusters = _get_cluster_records_and_set_ssh_config(
2593
+ clusters=None, refresh=common.StatusRefreshMode.AUTO)
2594
+ if len(all_clusters) <= 1:
2595
+ cluster_records = all_clusters
2379
2596
  else:
2380
2597
  raise click.UsageError(
2381
2598
  '`sky start` requires either a cluster name or glob '
2382
2599
  '(see `sky status`), or the -a/--all flag.')
2383
2600
 
2384
2601
  if all:
2385
- if len(clusters) > 0:
2602
+ if clusters:
2386
2603
  click.echo('Both --all and cluster(s) specified for sky start. '
2387
2604
  'Letting --all take effect.')
2388
2605
 
2606
+ all_clusters = _get_cluster_records_and_set_ssh_config(
2607
+ clusters=None, refresh=common.StatusRefreshMode.AUTO)
2608
+
2389
2609
  # Get all clusters that are not controllers.
2390
- clusters = [
2391
- cluster['name']
2392
- for cluster in global_user_state.get_clusters()
2610
+ cluster_records = [
2611
+ cluster for cluster in all_clusters
2393
2612
  if controller_utils.Controllers.from_name(cluster['name']) is None
2394
2613
  ]
2614
+ if cluster_records is None:
2615
+ # Get GLOB cluster names
2616
+ cluster_records = _get_cluster_records_and_set_ssh_config(
2617
+ clusters, refresh=common.StatusRefreshMode.AUTO)
2395
2618
 
2396
- if not clusters:
2619
+ if not cluster_records:
2397
2620
  click.echo('Cluster(s) not found (tip: see `sky status`). Do you '
2398
2621
  'mean to use `sky launch` to provision a new cluster?')
2399
2622
  return
2400
2623
  else:
2401
- # Get GLOB cluster names
2402
- clusters = _get_glob_clusters(clusters)
2403
-
2404
- for name in clusters:
2405
- cluster_status, _ = backend_utils.refresh_cluster_status_handle(
2406
- name)
2624
+ for cluster in cluster_records:
2625
+ name = cluster['name']
2626
+ cluster_status = cluster['status']
2407
2627
  # A cluster may have one of the following states:
2408
2628
  #
2409
2629
  # STOPPED - ok to restart
@@ -2461,8 +2681,8 @@ def start(
2461
2681
  'is currently not supported.\n'
2462
2682
  'Please start the former independently.')
2463
2683
  if controllers:
2464
- bold = backend_utils.BOLD
2465
- reset_bold = backend_utils.RESET_BOLD
2684
+ bold = ux_utils.BOLD
2685
+ reset_bold = ux_utils.RESET_BOLD
2466
2686
  if len(controllers) != 1:
2467
2687
  raise click.UsageError(
2468
2688
  'Starting multiple controllers is currently not supported.\n'
@@ -2483,18 +2703,25 @@ def start(
2483
2703
  abort=True,
2484
2704
  show_default=True)
2485
2705
 
2486
- for name in to_start:
2706
+ request_ids = subprocess_utils.run_in_parallel(
2707
+ lambda name: sdk.start(name,
2708
+ idle_minutes_to_autostop,
2709
+ retry_until_up,
2710
+ down=down,
2711
+ force=force), to_start)
2712
+
2713
+ for name, request_id in zip(to_start, request_ids):
2487
2714
  try:
2488
- core.start(name,
2489
- idle_minutes_to_autostop,
2490
- retry_until_up,
2491
- down=down,
2492
- force=force)
2715
+ _async_call_or_wait(request_id, async_call, 'sky.start')
2716
+ if not async_call:
2717
+ # Add ssh config for the cluster
2718
+ _get_cluster_records_and_set_ssh_config(clusters=[name])
2493
2719
  except (exceptions.NotSupportedError,
2494
2720
  exceptions.ClusterOwnerIdentityMismatchError) as e:
2495
2721
  click.echo(str(e))
2496
2722
  else:
2497
- click.secho(f'Cluster {name} started.', fg='green')
2723
+ if not async_call:
2724
+ click.secho(f'Cluster {name} started.', fg='green')
2498
2725
 
2499
2726
 
2500
2727
  @cli.command(cls=_DocumentedCodeCommand)
@@ -2504,10 +2731,15 @@ def start(
2504
2731
  **_get_shell_complete_args(_complete_cluster_name))
2505
2732
  @click.option('--all',
2506
2733
  '-a',
2507
- default=None,
2734
+ default=False,
2508
2735
  is_flag=True,
2509
2736
  help='Tear down all existing clusters.')
2510
- @click.option('--yes',
2737
+ @click.option('--all-users',
2738
+ '-u',
2739
+ default=False,
2740
+ is_flag=True,
2741
+ help='Tear down all existing clusters for all users.')
2742
+ @click.option('--yes',
2511
2743
  '-y',
2512
2744
  is_flag=True,
2513
2745
  default=False,
@@ -2525,12 +2757,15 @@ def start(
2525
2757
  ' in certain manual troubleshooting scenarios; with it set, it is the'
2526
2758
  ' user\'s responsibility to ensure there are no leaked instances and '
2527
2759
  'related resources.'))
2760
+ @_add_click_options(_COMMON_OPTIONS)
2528
2761
  @usage_lib.entrypoint
2529
2762
  def down(
2530
2763
  clusters: List[str],
2531
- all: Optional[bool], # pylint: disable=redefined-builtin
2764
+ all: bool, # pylint: disable=redefined-builtin
2765
+ all_users: bool,
2532
2766
  yes: bool,
2533
2767
  purge: bool,
2768
+ async_call: bool,
2534
2769
  ):
2535
2770
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
2536
2771
  """Tear down cluster(s).
@@ -2562,12 +2797,15 @@ def down(
2562
2797
  """
2563
2798
  _down_or_stop_clusters(clusters,
2564
2799
  apply_to_all=all,
2800
+ all_users=all_users,
2565
2801
  down=True,
2566
2802
  no_confirm=yes,
2567
- purge=purge)
2803
+ purge=purge,
2804
+ async_call=async_call)
2568
2805
 
2569
2806
 
2570
- def _hint_or_raise_for_down_jobs_controller(controller_name: str):
2807
+ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
2808
+ purge: bool) -> None:
2571
2809
  """Helper function to check job controller status before tearing it down.
2572
2810
 
2573
2811
  Raises helpful exceptions and errors if the controller is not in a safe
@@ -2582,11 +2820,13 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str):
2582
2820
  controller = controller_utils.Controllers.from_name(controller_name)
2583
2821
  assert controller is not None, controller_name
2584
2822
 
2585
- with rich_utils.safe_status(
2823
+ with rich_utils.client_status(
2586
2824
  '[bold cyan]Checking for in-progress managed jobs[/]'):
2587
2825
  try:
2588
- managed_jobs_ = managed_jobs.queue(refresh=False,
2589
- skip_finished=True)
2826
+ request_id = managed_jobs.queue(refresh=False,
2827
+ skip_finished=True,
2828
+ all_users=True)
2829
+ managed_jobs_ = sdk.stream_and_get(request_id)
2590
2830
  except exceptions.ClusterNotUpError as e:
2591
2831
  if controller.value.connection_error_hint in str(e):
2592
2832
  with ux_utils.print_exception_no_traceback():
@@ -2609,19 +2849,26 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str):
2609
2849
  'jobs (output of `sky jobs queue`) will be lost.')
2610
2850
  click.echo(msg)
2611
2851
  if managed_jobs_:
2612
- job_table = managed_jobs.format_job_table(managed_jobs_, show_all=False)
2852
+ job_table = managed_jobs.format_job_table(managed_jobs_,
2853
+ show_all=False,
2854
+ show_user=True)
2613
2855
  msg = controller.value.decline_down_for_dirty_controller_hint
2614
2856
  # Add prefix to each line to align with the bullet point.
2615
2857
  msg += '\n'.join(
2616
2858
  [' ' + line for line in job_table.split('\n') if line != ''])
2617
- with ux_utils.print_exception_no_traceback():
2618
- raise exceptions.NotSupportedError(msg)
2859
+ if purge:
2860
+ logger.warning('--purge is set, ignoring the in-progress managed '
2861
+ 'jobs. This could cause leaked clusters!')
2862
+ else:
2863
+ with ux_utils.print_exception_no_traceback():
2864
+ raise exceptions.NotSupportedError(msg)
2619
2865
  else:
2620
2866
  click.echo(' * No in-progress managed jobs found. It should be safe to '
2621
2867
  'terminate (see caveats above).')
2622
2868
 
2623
2869
 
2624
- def _hint_or_raise_for_down_sky_serve_controller(controller_name: str):
2870
+ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
2871
+ purge: bool) -> None:
2625
2872
  """Helper function to check serve controller status before tearing it down.
2626
2873
 
2627
2874
  Raises helpful exceptions and errors if the controller is not in a safe
@@ -2635,9 +2882,10 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str):
2635
2882
  """
2636
2883
  controller = controller_utils.Controllers.from_name(controller_name)
2637
2884
  assert controller is not None, controller_name
2638
- with rich_utils.safe_status('[bold cyan]Checking for live services[/]'):
2885
+ with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
2639
2886
  try:
2640
- services = serve_lib.status()
2887
+ request_id = serve_lib.status(service_names=None)
2888
+ services = sdk.stream_and_get(request_id)
2641
2889
  except exceptions.ClusterNotUpError as e:
2642
2890
  if controller.value.connection_error_hint in str(e):
2643
2891
  with ux_utils.print_exception_no_traceback():
@@ -2654,35 +2902,52 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str):
2654
2902
 
2655
2903
  if services:
2656
2904
  service_names = [service['name'] for service in services]
2657
- with ux_utils.print_exception_no_traceback():
2658
- msg = (
2659
- controller.value.decline_down_for_dirty_controller_hint.format(
2660
- service_names=', '.join(service_names)))
2661
- raise exceptions.NotSupportedError(msg)
2905
+ if purge:
2906
+ logger.warning('--purge is set, ignoring the in-progress services. '
2907
+ 'This could cause leaked clusters!')
2908
+ else:
2909
+ with ux_utils.print_exception_no_traceback():
2910
+ msg = (controller.value.decline_down_for_dirty_controller_hint.
2911
+ format(service_names=', '.join(service_names)))
2912
+ raise exceptions.NotSupportedError(msg)
2662
2913
  # Do nothing for STOPPED state, as it is safe to terminate the cluster.
2663
2914
  click.echo(f'Terminate sky serve controller: {controller_name}.')
2664
2915
 
2665
2916
 
2666
- _CONTROLLER_TO_HINT_OR_RAISE = {
2667
- controller_utils.Controllers.JOBS_CONTROLLER:
2668
- (_hint_or_raise_for_down_jobs_controller),
2669
- controller_utils.Controllers.SKY_SERVE_CONTROLLER:
2670
- (_hint_or_raise_for_down_sky_serve_controller),
2671
- }
2917
+ def _controller_to_hint_or_raise(
2918
+ controller: controller_utils.Controllers
2919
+ ) -> Callable[[str, bool], None]:
2920
+ if controller == controller_utils.Controllers.JOBS_CONTROLLER:
2921
+ return _hint_or_raise_for_down_jobs_controller
2922
+ return _hint_or_raise_for_down_sky_serve_controller
2672
2923
 
2673
2924
 
2674
2925
  def _down_or_stop_clusters(
2675
2926
  names: List[str],
2676
- apply_to_all: Optional[bool],
2677
- down: bool, # pylint: disable=redefined-outer-name
2678
- no_confirm: bool,
2927
+ apply_to_all: bool = False,
2928
+ all_users: bool = False,
2929
+ down: bool = False, # pylint: disable=redefined-outer-name
2930
+ no_confirm: bool = True,
2679
2931
  purge: bool = False,
2680
- idle_minutes_to_autostop: Optional[int] = None) -> None:
2932
+ idle_minutes_to_autostop: Optional[int] = None,
2933
+ async_call: bool = False) -> None:
2681
2934
  """Tears down or (auto-)stops a cluster (or all clusters).
2682
2935
 
2683
2936
  Controllers (jobs controller and sky serve controller) can only be
2684
2937
  terminated if the cluster name is explicitly and uniquely specified (not
2685
2938
  via glob).
2939
+
2940
+ Args:
2941
+ names: The names of the clusters to tear down or stop. If empty,
2942
+ apply_to_all or all_users must be set.
2943
+ apply_to_all: If True, apply the operation to all clusters.
2944
+ all_users: If True, apply the operation to all clusters for all users.
2945
+ down: If True, tear down the clusters.
2946
+ no_confirm: If True, skip the confirmation prompt.
2947
+ purge: If True, forcefully remove the clusters from the cluster table.
2948
+ idle_minutes_to_autostop: The number of minutes to wait before
2949
+ automatically stopping the cluster.
2950
+ async_call: If True, send the request asynchronously.
2686
2951
  """
2687
2952
  if down:
2688
2953
  command = 'down'
@@ -2690,17 +2955,12 @@ def _down_or_stop_clusters(
2690
2955
  command = 'autostop'
2691
2956
  else:
2692
2957
  command = 'stop'
2693
- if not names and apply_to_all is None:
2694
- # UX: frequently users may have only 1 cluster. In this case, 'sky
2695
- # stop/down' without args should be smart and default to that unique
2696
- # choice.
2697
- all_cluster_names = global_user_state.get_cluster_names_start_with('')
2698
- if len(all_cluster_names) <= 1:
2699
- names = all_cluster_names
2700
- else:
2701
- raise click.UsageError(
2702
- f'`sky {command}` requires either a cluster name or glob '
2703
- '(see `sky status`), or the -a/--all flag.')
2958
+ if not names and not apply_to_all and not all_users:
2959
+ raise click.UsageError(
2960
+ f'`sky {command}` requires either a cluster name or glob '
2961
+ '(see `sky status`), or the -a/--all flag for all your '
2962
+ 'clusters, or the -u/--all-users flag for all clusters in '
2963
+ 'your team.')
2704
2964
 
2705
2965
  operation = 'Terminating' if down else 'Stopping'
2706
2966
  if idle_minutes_to_autostop is not None:
@@ -2711,21 +2971,23 @@ def _down_or_stop_clusters(
2711
2971
  option_str = '{stop,down}'
2712
2972
  operation = f'{verb} auto{option_str} on'
2713
2973
 
2714
- if len(names) > 0:
2974
+ names = list(names)
2975
+ if names:
2715
2976
  controllers = [
2716
2977
  name for name in names
2717
2978
  if controller_utils.Controllers.from_name(name) is not None
2718
2979
  ]
2719
2980
  controllers_str = ', '.join(map(repr, controllers))
2720
2981
  names = [
2721
- name for name in _get_glob_clusters(names)
2722
- if controller_utils.Controllers.from_name(name) is None
2982
+ cluster['name']
2983
+ for cluster in _get_cluster_records_and_set_ssh_config(names)
2984
+ if controller_utils.Controllers.from_name(cluster['name']) is None
2723
2985
  ]
2724
2986
 
2725
2987
  # Make sure the controllers are explicitly specified without other
2726
2988
  # normal clusters.
2727
2989
  if controllers:
2728
- if len(names) != 0:
2990
+ if names:
2729
2991
  names_str = ', '.join(map(repr, names))
2730
2992
  raise click.UsageError(
2731
2993
  f'{operation} controller(s) '
@@ -2746,7 +3008,7 @@ def _down_or_stop_clusters(
2746
3008
  controller = controller_utils.Controllers.from_name(
2747
3009
  controller_name)
2748
3010
  assert controller is not None
2749
- hint_or_raise = _CONTROLLER_TO_HINT_OR_RAISE[controller]
3011
+ hint_or_raise = _controller_to_hint_or_raise(controller)
2750
3012
  try:
2751
3013
  # TODO(zhwu): This hint or raise is not transactional, which
2752
3014
  # means even if it passed the check with no in-progress spot
@@ -2755,7 +3017,7 @@ def _down_or_stop_clusters(
2755
3017
  # `sky serve up` before typing the delete, causing a leaked
2756
3018
  # managed job or service. We should make this check atomic
2757
3019
  # with the termination.
2758
- hint_or_raise(controller_name)
3020
+ hint_or_raise(controller_name, purge)
2759
3021
  except (exceptions.ClusterOwnerIdentityMismatchError,
2760
3022
  RuntimeError) as e:
2761
3023
  if purge:
@@ -2776,9 +3038,10 @@ def _down_or_stop_clusters(
2776
3038
  no_confirm = True
2777
3039
  names += controllers
2778
3040
 
2779
- if apply_to_all:
2780
- all_clusters = global_user_state.get_clusters()
2781
- if len(names) > 0:
3041
+ if apply_to_all or all_users:
3042
+ all_clusters = _get_cluster_records_and_set_ssh_config(
3043
+ clusters=None, all_users=all_users)
3044
+ if names:
2782
3045
  click.echo(
2783
3046
  f'Both --all and cluster(s) specified for `sky {command}`. '
2784
3047
  'Letting --all take effect.')
@@ -2790,22 +3053,14 @@ def _down_or_stop_clusters(
2790
3053
  if controller_utils.Controllers.from_name(record['name']) is None
2791
3054
  ]
2792
3055
 
2793
- clusters = []
2794
- for name in names:
2795
- handle = global_user_state.get_handle_from_cluster_name(name)
2796
- if handle is None:
2797
- # This codepath is used for 'sky down -p <controller>' when the
2798
- # controller is not in 'sky status'. Cluster-not-found message
2799
- # should've been printed by _get_glob_clusters() above.
2800
- continue
2801
- clusters.append(name)
3056
+ clusters = names
2802
3057
  usage_lib.record_cluster_name_for_current_operation(clusters)
2803
3058
 
2804
3059
  if not clusters:
2805
3060
  click.echo('Cluster(s) not found (tip: see `sky status`).')
2806
3061
  return
2807
3062
 
2808
- if not no_confirm and len(clusters) > 0:
3063
+ if not no_confirm and clusters:
2809
3064
  cluster_str = 'clusters' if len(clusters) > 1 else 'cluster'
2810
3065
  cluster_list = ', '.join(clusters)
2811
3066
  click.confirm(
@@ -2823,11 +3078,17 @@ def _down_or_stop_clusters(
2823
3078
  f'[bold cyan]{operation} {len(clusters)} cluster{plural}[/]',
2824
3079
  total=len(clusters))
2825
3080
 
3081
+ request_ids = []
3082
+
2826
3083
  def _down_or_stop(name: str):
2827
3084
  success_progress = False
2828
3085
  if idle_minutes_to_autostop is not None:
2829
3086
  try:
2830
- core.autostop(name, idle_minutes_to_autostop, down)
3087
+ request_id = sdk.autostop(name, idle_minutes_to_autostop, down)
3088
+ request_ids.append(request_id)
3089
+ _async_call_or_wait(
3090
+ request_id, async_call,
3091
+ server_constants.REQUEST_NAME_PREFIX + operation)
2831
3092
  except (exceptions.NotSupportedError,
2832
3093
  exceptions.ClusterNotUpError) as e:
2833
3094
  message = str(e)
@@ -2850,9 +3111,17 @@ def _down_or_stop_clusters(
2850
3111
  else:
2851
3112
  try:
2852
3113
  if down:
2853
- core.down(name, purge=purge)
3114
+ request_id = sdk.down(name, purge=purge)
2854
3115
  else:
2855
- core.stop(name, purge=purge)
3116
+ request_id = sdk.stop(name, purge=purge)
3117
+ request_ids.append(request_id)
3118
+ _async_call_or_wait(
3119
+ request_id, async_call,
3120
+ server_constants.REQUEST_NAME_PREFIX + operation)
3121
+ if not async_call:
3122
+ # Remove the cluster from the SSH config file as soon as it
3123
+ # is stopped or downed.
3124
+ cluster_utils.SSHConfigHelper.remove_cluster(name)
2856
3125
  except RuntimeError as e:
2857
3126
  message = (
2858
3127
  f'{colorama.Fore.RED}{operation} cluster {name}...failed. '
@@ -2883,6 +3152,10 @@ def _down_or_stop_clusters(
2883
3152
  # Make sure the progress bar not mess up the terminal.
2884
3153
  progress.refresh()
2885
3154
 
3155
+ if async_call:
3156
+ click.secho(f'{operation} requests are sent. Check the requests\' '
3157
+ 'status with `sky request get <request_id>`.')
3158
+
2886
3159
 
2887
3160
  @cli.command(cls=_DocumentedCodeCommand)
2888
3161
  @click.argument('clouds', required=False, type=str, nargs=-1)
@@ -2892,6 +3165,7 @@ def _down_or_stop_clusters(
2892
3165
  default=False,
2893
3166
  help='Show the activated account for each cloud.')
2894
3167
  @usage_lib.entrypoint
3168
+ # pylint: disable=redefined-outer-name
2895
3169
  def check(clouds: Tuple[str], verbose: bool):
2896
3170
  """Check which clouds are available to use.
2897
3171
 
@@ -2915,7 +3189,12 @@ def check(clouds: Tuple[str], verbose: bool):
2915
3189
  sky check aws gcp
2916
3190
  """
2917
3191
  clouds_arg = clouds if len(clouds) > 0 else None
2918
- sky_check.check(verbose=verbose, clouds=clouds_arg)
3192
+ request_id = sdk.check(clouds=clouds_arg, verbose=verbose)
3193
+ sdk.stream_and_get(request_id)
3194
+ api_server_url = server_common.get_server_url()
3195
+ click.echo()
3196
+ click.echo(
3197
+ click.style(f'Using SkyPilot API server: {api_server_url}', fg='green'))
2919
3198
 
2920
3199
 
2921
3200
  @cli.command()
@@ -2972,9 +3251,9 @@ def show_gpus(
2972
3251
  and spot instances. There may be multiple regions with the same lowest
2973
3252
  price.
2974
3253
 
2975
- If ``--cloud kubernetes`` is specified, it will show the maximum quantities
2976
- of the GPU available on a single node and the real-time availability of
2977
- the GPU across all nodes in the Kubernetes cluster.
3254
+ If ``--cloud kubernetes`` or ``--cloud k8s`` is specified, it will show the
3255
+ maximum quantities of the GPU available on a single node and the real-time
3256
+ availability of the GPU across all nodes in the Kubernetes cluster.
2978
3257
 
2979
3258
  Definitions of certain fields:
2980
3259
 
@@ -3008,49 +3287,45 @@ def show_gpus(
3008
3287
  '--all-regions and --region flags cannot be used simultaneously.')
3009
3288
 
3010
3289
  # This will validate 'cloud' and raise if not found.
3011
- cloud_obj = sky_clouds.CLOUD_REGISTRY.from_str(cloud)
3012
- service_catalog.validate_region_zone(region, None, clouds=cloud)
3290
+ cloud_obj = registry.CLOUD_REGISTRY.from_str(cloud)
3291
+ cloud_name = str(cloud_obj).lower() if cloud is not None else None
3013
3292
  show_all = all
3014
3293
  if show_all and accelerator_str is not None:
3015
3294
  raise click.UsageError('--all is only allowed without a GPU name.')
3016
3295
 
3017
3296
  # Kubernetes specific bools
3018
- cloud_is_kubernetes = isinstance(cloud_obj, sky_clouds.Kubernetes)
3297
+ enabled_clouds = sdk.get(sdk.enabled_clouds())
3298
+ cloud_is_kubernetes = isinstance(cloud_obj, clouds.Kubernetes)
3299
+ # TODO(romilb): We should move this to the backend.
3019
3300
  kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None
3020
- kubernetes_is_enabled = sky_clouds.cloud_in_iterable(
3021
- sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds())
3022
-
3023
- if cloud_is_kubernetes and region is not None:
3024
- raise click.UsageError(
3025
- 'The --region flag cannot be set with --cloud kubernetes.')
3301
+ kubernetes_is_enabled = clouds.cloud_in_iterable(
3302
+ clouds.Kubernetes(),
3303
+ enabled_clouds,
3304
+ )
3026
3305
 
3027
3306
  def _list_to_str(lst):
3028
3307
  return ', '.join([str(e) for e in lst])
3029
3308
 
3309
+ # TODO(zhwu,romilb): We should move most of these kubernetes related
3310
+ # queries into the backend, especially behind the server.
3030
3311
  def _get_kubernetes_realtime_gpu_table(
3312
+ context: Optional[str] = None,
3031
3313
  name_filter: Optional[str] = None,
3032
3314
  quantity_filter: Optional[int] = None):
3033
3315
  if quantity_filter:
3034
3316
  qty_header = 'QTY_FILTER'
3035
3317
  free_header = 'FILTERED_FREE_GPUS'
3036
3318
  else:
3037
- qty_header = 'QTY_PER_NODE'
3319
+ qty_header = 'REQUESTABLE_QTY_PER_NODE'
3038
3320
  free_header = 'TOTAL_FREE_GPUS'
3039
3321
  realtime_gpu_table = log_utils.create_table(
3040
3322
  ['GPU', qty_header, 'TOTAL_GPUS', free_header])
3041
- counts, capacity, available = service_catalog.list_accelerator_realtime(
3042
- gpus_only=True,
3043
- clouds='kubernetes',
3044
- name_filter=name_filter,
3045
- region_filter=region,
3046
- quantity_filter=quantity_filter,
3047
- case_sensitive=False)
3048
- assert (set(counts.keys()) == set(capacity.keys()) == set(
3049
- available.keys())), (f'Keys of counts ({list(counts.keys())}), '
3050
- f'capacity ({list(capacity.keys())}), '
3051
- f'and available ({list(available.keys())}) '
3052
- 'must be same.')
3053
- if len(counts) == 0:
3323
+ realtime_gpu_availability_list = sdk.stream_and_get(
3324
+ sdk.realtime_kubernetes_gpu_availability(
3325
+ context=context,
3326
+ name_filter=name_filter,
3327
+ quantity_filter=quantity_filter))
3328
+ if not realtime_gpu_availability_list:
3054
3329
  err_msg = 'No GPUs found in Kubernetes cluster. '
3055
3330
  debug_msg = 'To further debug, run: sky check '
3056
3331
  if name_filter is not None:
@@ -3062,17 +3337,43 @@ def show_gpus(
3062
3337
  'in Kubernetes cluster. ')
3063
3338
  debug_msg = ('To show available accelerators on kubernetes,'
3064
3339
  ' run: sky show-gpus --cloud kubernetes ')
3065
- full_err_msg = (err_msg + kubernetes_utils.NO_GPU_HELP_MESSAGE +
3340
+ full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
3066
3341
  debug_msg)
3067
3342
  raise ValueError(full_err_msg)
3068
- for gpu, _ in sorted(counts.items()):
3343
+ no_permissions_str = '<no permissions>'
3344
+ for realtime_gpu_availability in sorted(realtime_gpu_availability_list):
3345
+ gpu_availability = models.RealtimeGpuAvailability(
3346
+ *realtime_gpu_availability)
3347
+ available_qty = (gpu_availability.available
3348
+ if gpu_availability.available != -1 else
3349
+ no_permissions_str)
3069
3350
  realtime_gpu_table.add_row([
3070
- gpu,
3071
- _list_to_str(counts.pop(gpu)), capacity[gpu], available[gpu]
3351
+ gpu_availability.gpu,
3352
+ _list_to_str(gpu_availability.counts),
3353
+ gpu_availability.capacity,
3354
+ available_qty,
3072
3355
  ])
3073
3356
  return realtime_gpu_table
3074
3357
 
3075
- def _output():
3358
+ # TODO(zhwu): this needs to run on remote server.
3359
+ def _get_kubernetes_node_info_table(context: Optional[str]):
3360
+ node_table = log_utils.create_table(
3361
+ ['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS'])
3362
+
3363
+ no_permissions_str = '<no permissions>'
3364
+ node_info_dict = sdk.stream_and_get(
3365
+ sdk.kubernetes_node_info(context=context))
3366
+ for node_name, node_info in node_info_dict.items():
3367
+ available = node_info.free[
3368
+ 'accelerators_available'] if node_info.free[
3369
+ 'accelerators_available'] != -1 else no_permissions_str
3370
+ node_table.add_row([
3371
+ node_name, node_info.accelerator_type,
3372
+ node_info.total['accelerator_count'], available
3373
+ ])
3374
+ return node_table
3375
+
3376
+ def _output() -> Generator[str, None, None]:
3076
3377
  gpu_table = log_utils.create_table(
3077
3378
  ['COMMON_GPU', 'AVAILABLE_QUANTITIES'])
3078
3379
  tpu_table = log_utils.create_table(
@@ -3085,8 +3386,8 @@ def show_gpus(
3085
3386
  # Optimization - do not poll for Kubernetes API for fetching
3086
3387
  # common GPUs because that will be fetched later for the table after
3087
3388
  # common GPUs.
3088
- clouds_to_list = cloud
3089
- if cloud is None:
3389
+ clouds_to_list: Union[Optional[str], List[str]] = cloud_name
3390
+ if cloud_name is None:
3090
3391
  clouds_to_list = [
3091
3392
  c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes'
3092
3393
  ]
@@ -3096,12 +3397,16 @@ def show_gpus(
3096
3397
  # Collect k8s related messages in k8s_messages and print them at end
3097
3398
  print_section_titles = False
3098
3399
  # If cloud is kubernetes, we want to show real-time capacity
3099
- if kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes):
3400
+ if kubernetes_is_enabled and (cloud_name is None or
3401
+ cloud_is_kubernetes):
3402
+ context = region
3403
+
3100
3404
  try:
3101
3405
  # If --cloud kubernetes is not specified, we want to catch
3102
3406
  # the case where no GPUs are available on the cluster and
3103
3407
  # print the warning at the end.
3104
- k8s_realtime_table = _get_kubernetes_realtime_gpu_table()
3408
+ k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
3409
+ context)
3105
3410
  except ValueError as e:
3106
3411
  if not cloud_is_kubernetes:
3107
3412
  # Make it a note if cloud is not kubernetes
@@ -3109,9 +3414,27 @@ def show_gpus(
3109
3414
  k8s_messages += str(e)
3110
3415
  else:
3111
3416
  print_section_titles = True
3417
+ context_str = f'(Context: {context})' if context else ''
3112
3418
  yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3113
- f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
3419
+ f'Kubernetes GPUs {context_str}'
3420
+ f'{colorama.Style.RESET_ALL}\n')
3114
3421
  yield from k8s_realtime_table.get_string()
3422
+ k8s_node_table = _get_kubernetes_node_info_table(context)
3423
+ yield '\n\n'
3424
+ # TODO(Doyoung): Update the message with the multi-host TPU
3425
+ # support.
3426
+ k8s_per_node_acc_message = (
3427
+ 'Kubernetes per node accelerator availability ')
3428
+ if kubernetes_utils.multi_host_tpu_exists_in_cluster(
3429
+ context):
3430
+ k8s_per_node_acc_message += (
3431
+ '(Note: Multi-host TPUs are detected and excluded '
3432
+ 'from the display as multi-host TPUs are not '
3433
+ 'supported.)')
3434
+ yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3435
+ f'{k8s_per_node_acc_message}'
3436
+ f'{colorama.Style.RESET_ALL}\n')
3437
+ yield from k8s_node_table.get_string()
3115
3438
  if kubernetes_autoscaling:
3116
3439
  k8s_messages += (
3117
3440
  '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
@@ -3129,11 +3452,14 @@ def show_gpus(
3129
3452
  yield k8s_messages
3130
3453
  yield '\n\n'
3131
3454
 
3132
- result = service_catalog.list_accelerator_counts(
3133
- gpus_only=True,
3134
- clouds=clouds_to_list,
3135
- region_filter=region,
3136
- )
3455
+ result = sdk.stream_and_get(
3456
+ sdk.list_accelerator_counts(
3457
+ gpus_only=True,
3458
+ clouds=clouds_to_list,
3459
+ region_filter=region,
3460
+ ))
3461
+ # TODO(zhwu): handle the case where no accelerators are found,
3462
+ # especially when --region specified a non-existent region.
3137
3463
 
3138
3464
  if print_section_titles:
3139
3465
  # If section titles were printed above, print again here
@@ -3151,7 +3477,7 @@ def show_gpus(
3151
3477
  for tpu in service_catalog.get_tpus():
3152
3478
  if tpu in result:
3153
3479
  tpu_table.add_row([tpu, _list_to_str(result.pop(tpu))])
3154
- if len(tpu_table.get_string()) > 0:
3480
+ if tpu_table.get_string():
3155
3481
  yield '\n\n'
3156
3482
  yield from tpu_table.get_string()
3157
3483
 
@@ -3192,13 +3518,14 @@ def show_gpus(
3192
3518
  name, quantity = accelerator_str, None
3193
3519
 
3194
3520
  print_section_titles = False
3195
- if (kubernetes_is_enabled and (cloud is None or cloud_is_kubernetes) and
3196
- not show_all):
3521
+ if (kubernetes_is_enabled and
3522
+ (cloud_name is None or cloud_is_kubernetes) and not show_all):
3197
3523
  # Print section title if not showing all and instead a specific
3198
3524
  # accelerator is requested
3199
3525
  print_section_titles = True
3200
3526
  yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3201
3527
  f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
3528
+ # TODO(romilb): Show filtered per node GPU availability here as well
3202
3529
  try:
3203
3530
  k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
3204
3531
  name_filter=name, quantity_filter=quantity)
@@ -3220,16 +3547,17 @@ def show_gpus(
3220
3547
 
3221
3548
  # For clouds other than Kubernetes, get the accelerator details
3222
3549
  # Case-sensitive
3223
- result = service_catalog.list_accelerators(gpus_only=True,
3224
- name_filter=name,
3225
- quantity_filter=quantity,
3226
- region_filter=region,
3227
- clouds=clouds_to_list,
3228
- case_sensitive=False,
3229
- all_regions=all_regions)
3550
+ result = sdk.stream_and_get(
3551
+ sdk.list_accelerators(gpus_only=True,
3552
+ name_filter=name,
3553
+ quantity_filter=quantity,
3554
+ region_filter=region,
3555
+ clouds=clouds_to_list,
3556
+ case_sensitive=False,
3557
+ all_regions=all_regions))
3230
3558
  # Import here to save module load speed.
3231
3559
  # pylint: disable=import-outside-toplevel,line-too-long
3232
- from sky.clouds.service_catalog import common
3560
+ from sky.clouds.service_catalog import common as catalog_common
3233
3561
 
3234
3562
  # For each gpu name (count not included):
3235
3563
  # - Group by cloud
@@ -3250,7 +3578,7 @@ def show_gpus(
3250
3578
  df = df.sort_values(by=['min_price', 'min_spot_price'])
3251
3579
  df = df.drop(columns=['min_price', 'min_spot_price'])
3252
3580
  sorted_dataclasses = [
3253
- common.InstanceTypeInfo(*row)
3581
+ catalog_common.InstanceTypeInfo(*row)
3254
3582
  for row in df.to_records(index=False)
3255
3583
  ]
3256
3584
  new_result[gpu] = sorted_dataclasses
@@ -3261,10 +3589,10 @@ def show_gpus(
3261
3589
  yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3262
3590
  f'Cloud GPUs{colorama.Style.RESET_ALL}\n')
3263
3591
 
3264
- if len(result) == 0:
3592
+ if not result:
3265
3593
  quantity_str = (f' with requested quantity {quantity}'
3266
3594
  if quantity else '')
3267
- cloud_str = f' on {cloud_obj}.' if cloud else ' in cloud catalogs.'
3595
+ cloud_str = f' on {cloud_obj}.' if cloud_name else ' in cloud catalogs.'
3268
3596
  yield f'Resources \'{name}\'{quantity_str} not found{cloud_str} '
3269
3597
  yield 'To show available accelerators, run: sky show-gpus --all'
3270
3598
  return
@@ -3325,10 +3653,11 @@ def show_gpus(
3325
3653
  yield '\n\n'
3326
3654
  yield from accelerator_table.get_string()
3327
3655
 
3656
+ outputs = _output()
3328
3657
  if show_all:
3329
- click.echo_via_pager(_output())
3658
+ click.echo_via_pager(outputs)
3330
3659
  else:
3331
- for out in _output():
3660
+ for out in outputs:
3332
3661
  click.echo(out, nl=False)
3333
3662
  click.echo()
3334
3663
 
@@ -3340,18 +3669,20 @@ def storage():
3340
3669
 
3341
3670
 
3342
3671
  @storage.command('ls', cls=_DocumentedCodeCommand)
3343
- @click.option('--all',
3344
- '-a',
3672
+ @click.option('--verbose',
3673
+ '-v',
3345
3674
  default=False,
3346
3675
  is_flag=True,
3347
3676
  required=False,
3348
3677
  help='Show all information in full.')
3349
3678
  @usage_lib.entrypoint
3350
3679
  # pylint: disable=redefined-builtin
3351
- def storage_ls(all: bool):
3680
+ def storage_ls(verbose: bool):
3352
3681
  """List storage objects managed by SkyPilot."""
3353
- storages = sky.storage_ls()
3354
- storage_table = storage_utils.format_storage_table(storages, show_all=all)
3682
+ request_id = sdk.storage_ls()
3683
+ storages = sdk.stream_and_get(request_id)
3684
+ storage_table = storage_utils.format_storage_table(storages,
3685
+ show_all=verbose)
3355
3686
  click.echo(storage_table)
3356
3687
 
3357
3688
 
@@ -3373,8 +3704,9 @@ def storage_ls(all: bool):
3373
3704
  is_flag=True,
3374
3705
  required=False,
3375
3706
  help='Skip confirmation prompt.')
3707
+ @_add_click_options(_COMMON_OPTIONS)
3376
3708
  @usage_lib.entrypoint
3377
- def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=redefined-builtin
3709
+ def storage_delete(names: List[str], all: bool, yes: bool, async_call: bool): # pylint: disable=redefined-builtin
3378
3710
  """Delete storage objects.
3379
3711
 
3380
3712
  Examples:
@@ -3390,14 +3722,13 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
3390
3722
  # Delete all storage objects.
3391
3723
  sky storage delete -a
3392
3724
  """
3393
- if sum([len(names) > 0, all]) != 1:
3725
+ if sum([bool(names), all]) != 1:
3394
3726
  raise click.UsageError('Either --all or a name must be specified.')
3395
3727
  if all:
3396
- storages = sky.storage_ls()
3728
+ storages = sdk.get(sdk.storage_ls())
3397
3729
  if not storages:
3398
3730
  click.echo('No storage(s) to delete.')
3399
3731
  return
3400
- names = [s['name'] for s in storages]
3401
3732
  else:
3402
3733
  names = _get_glob_storages(names)
3403
3734
  if names:
@@ -3411,13 +3742,25 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
3411
3742
  abort=True,
3412
3743
  show_default=True)
3413
3744
 
3414
- subprocess_utils.run_in_parallel(sky.storage_delete, names)
3745
+ request_ids = {}
3746
+ # TODO(zhwu): Support all flag for the underlying SDK and API server to
3747
+ # avoid multiple requests.
3748
+ for name in names:
3749
+ request_ids[name] = sdk.storage_delete(name)
3415
3750
 
3751
+ for name, request_id in request_ids.items():
3752
+ try:
3753
+ _async_call_or_wait(request_id, async_call, 'sky.storage')
3754
+ except Exception as e: # pylint: disable=broad-except
3755
+ logger.error(f'{colorama.Fore.RED}Error deleting storage {name}: '
3756
+ f'{common_utils.format_exception(e, use_bracket=True)}'
3757
+ f'{colorama.Style.RESET_ALL}')
3416
3758
 
3417
- @cli.group(cls=_NaturalOrderGroup)
3759
+
3760
+ @cli.group(cls=_NaturalOrderGroup, hidden=True)
3418
3761
  def bench():
3419
3762
  """SkyPilot Benchmark CLI."""
3420
- pass
3763
+ raise click.UsageError('The benchmark CLI is currently disabled.')
3421
3764
 
3422
3765
 
3423
3766
  @cli.group(cls=_NaturalOrderGroup)
@@ -3433,13 +3776,14 @@ def jobs():
3433
3776
  nargs=-1,
3434
3777
  **_get_shell_complete_args(_complete_file_name))
3435
3778
  # TODO(zhwu): Add --dryrun option to test the launch command.
3436
- @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS)
3779
+ @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS +
3780
+ _COMMON_OPTIONS)
3437
3781
  @click.option('--cluster',
3438
3782
  '-c',
3439
3783
  default=None,
3440
3784
  type=str,
3441
3785
  hidden=True,
3442
- help=('Alias for --name, the name of the spot job.'))
3786
+ help=('Alias for --name, the name of the managed job.'))
3443
3787
  @click.option('--job-recovery',
3444
3788
  default=None,
3445
3789
  type=str,
@@ -3451,18 +3795,6 @@ def jobs():
3451
3795
  is_flag=True,
3452
3796
  help=('If True, as soon as a job is submitted, return from this call '
3453
3797
  'and do not stream execution logs.'))
3454
- @click.option(
3455
- '--retry-until-up/--no-retry-until-up',
3456
- '-r/-no-r',
3457
- default=None,
3458
- is_flag=True,
3459
- required=False,
3460
- help=(
3461
- '(Default: True; this flag is deprecated and will be removed in a '
3462
- 'future release.) Whether to retry provisioning infinitely until the '
3463
- 'cluster is up, if unavailability errors are encountered. This ' # pylint: disable=bad-docstring-quotes
3464
- 'applies to launching all managed jobs (both the initial and '
3465
- 'any recovery attempts), not the jobs controller.'))
3466
3798
  @click.option('--yes',
3467
3799
  '-y',
3468
3800
  is_flag=True,
@@ -3493,8 +3825,8 @@ def jobs_launch(
3493
3825
  disk_tier: Optional[str],
3494
3826
  ports: Tuple[str],
3495
3827
  detach_run: bool,
3496
- retry_until_up: bool,
3497
3828
  yes: bool,
3829
+ async_call: bool,
3498
3830
  ):
3499
3831
  """Launch a managed job from a YAML or a command.
3500
3832
 
@@ -3536,19 +3868,6 @@ def jobs_launch(
3536
3868
  ports=ports,
3537
3869
  job_recovery=job_recovery,
3538
3870
  )
3539
- # Deprecation. We set the default behavior to be retry until up, and the
3540
- # flag `--retry-until-up` is deprecated. We can remove the flag in 0.8.0.
3541
- if retry_until_up is not None:
3542
- flag_str = '--retry-until-up'
3543
- if not retry_until_up:
3544
- flag_str = '--no-retry-until-up'
3545
- click.secho(
3546
- f'Flag {flag_str} is deprecated and will be removed in a '
3547
- 'future release (managed jobs will always be retried). '
3548
- 'Please file an issue if this does not work for you.',
3549
- fg='yellow')
3550
- else:
3551
- retry_until_up = True
3552
3871
 
3553
3872
  if not isinstance(task_or_dag, sky.Dag):
3554
3873
  assert isinstance(task_or_dag, sky.Task), task_or_dag
@@ -3564,26 +3883,25 @@ def jobs_launch(
3564
3883
  dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
3565
3884
  dag_utils.fill_default_config_in_dag_for_job_launch(dag)
3566
3885
 
3886
+ common_utils.check_cluster_name_is_valid(name)
3887
+
3567
3888
  click.secho(f'Managed job {dag.name!r} will be launched on (estimated):',
3568
3889
  fg='yellow')
3569
- dag = sky.optimize(dag)
3570
-
3571
- if not yes:
3572
- prompt = f'Launching a managed job {dag.name!r}. Proceed?'
3573
- if prompt is not None:
3574
- click.confirm(prompt, default=True, abort=True, show_default=True)
3575
-
3576
- common_utils.check_cluster_name_is_valid(name)
3577
3890
 
3578
- managed_jobs.launch(dag,
3579
- name,
3580
- detach_run=detach_run,
3581
- retry_until_up=retry_until_up)
3891
+ request_id = managed_jobs.launch(dag, name, _need_confirmation=not yes)
3892
+ job_id_handle = _async_call_or_wait(request_id, async_call,
3893
+ 'sky.jobs.launch')
3894
+ if not async_call and not detach_run:
3895
+ job_id = job_id_handle[0]
3896
+ managed_jobs.tail_logs(name=None,
3897
+ job_id=job_id,
3898
+ follow=True,
3899
+ controller=False)
3582
3900
 
3583
3901
 
3584
3902
  @jobs.command('queue', cls=_DocumentedCodeCommand)
3585
- @click.option('--all',
3586
- '-a',
3903
+ @click.option('--verbose',
3904
+ '-v',
3587
3905
  default=False,
3588
3906
  is_flag=True,
3589
3907
  required=False,
@@ -3602,9 +3920,16 @@ def jobs_launch(
3602
3920
  is_flag=True,
3603
3921
  required=False,
3604
3922
  help='Show only pending/running jobs\' information.')
3923
+ @click.option('--all-users',
3924
+ '-u',
3925
+ default=False,
3926
+ is_flag=True,
3927
+ required=False,
3928
+ help='Show jobs from all users.')
3605
3929
  @usage_lib.entrypoint
3606
3930
  # pylint: disable=redefined-builtin
3607
- def jobs_queue(all: bool, refresh: bool, skip_finished: bool):
3931
+ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
3932
+ all_users: bool):
3608
3933
  """Show statuses of managed jobs.
3609
3934
 
3610
3935
  Each managed jobs can have one of the following statuses:
@@ -3658,12 +3983,14 @@ def jobs_queue(all: bool, refresh: bool, skip_finished: bool):
3658
3983
  watch -n60 sky jobs queue
3659
3984
 
3660
3985
  """
3661
- click.secho('Fetching managed job statuses...', fg='yellow')
3662
- with rich_utils.safe_status('[cyan]Checking managed jobs[/]'):
3663
- _, msg = _get_managed_jobs(refresh=refresh,
3664
- skip_finished=skip_finished,
3665
- show_all=all,
3666
- is_called_by_user=True)
3986
+ click.secho('Fetching managed job statuses...', fg='cyan')
3987
+ with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
3988
+ managed_jobs_request_id = managed_jobs.queue(
3989
+ refresh=refresh, skip_finished=skip_finished, all_users=all_users)
3990
+ _, msg = _handle_jobs_queue_request(managed_jobs_request_id,
3991
+ show_all=verbose,
3992
+ show_user=all_users,
3993
+ is_called_by_user=True)
3667
3994
  if not skip_finished:
3668
3995
  in_progress_only_hint = ''
3669
3996
  else:
@@ -3685,16 +4012,23 @@ def jobs_queue(all: bool, refresh: bool, skip_finished: bool):
3685
4012
  is_flag=True,
3686
4013
  default=False,
3687
4014
  required=False,
3688
- help='Cancel all managed jobs.')
4015
+ help='Cancel all managed jobs for the current user.')
3689
4016
  @click.option('--yes',
3690
4017
  '-y',
3691
4018
  is_flag=True,
3692
4019
  default=False,
3693
4020
  required=False,
3694
4021
  help='Skip confirmation prompt.')
4022
+ @click.option('--all-users',
4023
+ '-u',
4024
+ is_flag=True,
4025
+ default=False,
4026
+ required=False,
4027
+ help='Cancel all managed jobs from all users.')
3695
4028
  @usage_lib.entrypoint
3696
4029
  # pylint: disable=redefined-builtin
3697
- def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
4030
+ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
4031
+ all_users: bool):
3698
4032
  """Cancel managed jobs.
3699
4033
 
3700
4034
  You can provide either a job name or a list of job IDs to be cancelled.
@@ -3710,31 +4044,34 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
3710
4044
  # Cancel managed jobs with IDs 1, 2, 3
3711
4045
  $ sky jobs cancel 1 2 3
3712
4046
  """
3713
- backend_utils.is_controller_accessible(
3714
- controller=controller_utils.Controllers.JOBS_CONTROLLER,
3715
- stopped_message='All managed jobs should have finished.',
3716
- exit_if_not_accessible=True)
3717
-
3718
4047
  job_id_str = ','.join(map(str, job_ids))
3719
- if sum([len(job_ids) > 0, name is not None, all]) != 1:
3720
- argument_str = f'--job-ids {job_id_str}' if len(job_ids) > 0 else ''
3721
- argument_str += f' --name {name}' if name is not None else ''
3722
- argument_str += ' --all' if all else ''
4048
+ if sum([bool(job_ids), name is not None, all or all_users]) != 1:
4049
+ arguments = []
4050
+ arguments += [f'--job-ids {job_id_str}'] if job_ids else []
4051
+ arguments += [f'--name {name}'] if name is not None else []
4052
+ arguments += ['--all'] if all else []
4053
+ arguments += ['--all-users'] if all_users else []
3723
4054
  raise click.UsageError(
3724
- 'Can only specify one of JOB_IDS or --name or --all. '
3725
- f'Provided {argument_str!r}.')
4055
+ 'Can only specify one of JOB_IDS, --name, or --all/--all-users. '
4056
+ f'Provided {" ".join(arguments)!r}.')
3726
4057
 
3727
4058
  if not yes:
3728
4059
  job_identity_str = (f'managed jobs with IDs {job_id_str}'
3729
4060
  if job_ids else repr(name))
3730
- if all:
4061
+ if all_users:
4062
+ job_identity_str = 'all managed jobs FOR ALL USERS'
4063
+ elif all:
3731
4064
  job_identity_str = 'all managed jobs'
3732
4065
  click.confirm(f'Cancelling {job_identity_str}. Proceed?',
3733
4066
  default=True,
3734
4067
  abort=True,
3735
4068
  show_default=True)
3736
4069
 
3737
- managed_jobs.cancel(job_ids=job_ids, name=name, all=all)
4070
+ sdk.stream_and_get(
4071
+ managed_jobs.cancel(job_ids=job_ids,
4072
+ name=name,
4073
+ all=all,
4074
+ all_users=all_users))
3738
4075
 
3739
4076
 
3740
4077
  @jobs.command('logs', cls=_DocumentedCodeCommand)
@@ -3755,97 +4092,56 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
3755
4092
  default=False,
3756
4093
  help=('Show the controller logs of this job; useful for debugging '
3757
4094
  'launching/recoveries, etc.'))
4095
+ @click.option(
4096
+ '--refresh',
4097
+ '-r',
4098
+ default=False,
4099
+ is_flag=True,
4100
+ required=False,
4101
+ help='Query the latest job logs, restarting the jobs controller if stopped.'
4102
+ )
4103
+ @click.option('--sync-down',
4104
+ '-s',
4105
+ default=False,
4106
+ is_flag=True,
4107
+ required=False,
4108
+ help='Download logs for all jobs shown in the queue.')
3758
4109
  @click.argument('job_id', required=False, type=int)
3759
4110
  @usage_lib.entrypoint
3760
4111
  def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
3761
- controller: bool):
3762
- """Tail the log of a managed job."""
4112
+ controller: bool, refresh: bool, sync_down: bool):
4113
+ """Tail or sync down the log of a managed job."""
3763
4114
  try:
3764
- managed_jobs.tail_logs(name=name,
3765
- job_id=job_id,
3766
- follow=follow,
3767
- controller=controller)
4115
+ if sync_down:
4116
+ with rich_utils.client_status(
4117
+ ux_utils.spinner_message('Downloading jobs logs')):
4118
+ log_local_path_dict = managed_jobs.download_logs(
4119
+ name=name,
4120
+ job_id=job_id,
4121
+ controller=controller,
4122
+ refresh=refresh)
4123
+ style = colorama.Style
4124
+ fore = colorama.Fore
4125
+ controller_str = ' (controller)' if controller else ''
4126
+ for job, log_local_path in log_local_path_dict.items():
4127
+ logger.info(f'{fore.CYAN}Job {job} logs{controller_str}: '
4128
+ f'{log_local_path}{style.RESET_ALL}')
4129
+ else:
4130
+ managed_jobs.tail_logs(name=name,
4131
+ job_id=job_id,
4132
+ follow=follow,
4133
+ controller=controller,
4134
+ refresh=refresh)
3768
4135
  except exceptions.ClusterNotUpError:
3769
4136
  with ux_utils.print_exception_no_traceback():
3770
4137
  raise
3771
4138
 
3772
4139
 
3773
4140
  @jobs.command('dashboard', cls=_DocumentedCodeCommand)
3774
- @click.option(
3775
- '--port',
3776
- '-p',
3777
- default=None,
3778
- type=int,
3779
- required=False,
3780
- help=('Local port to use for the dashboard. If None, a free port is '
3781
- 'automatically chosen.'))
3782
4141
  @usage_lib.entrypoint
3783
- def jobs_dashboard(port: Optional[int]):
3784
- """Opens a dashboard for managed jobs (needs controller to be UP)."""
3785
- # TODO(zongheng): ideally, the controller/dashboard server should expose the
3786
- # API perhaps via REST. Then here we would (1) not have to use SSH to try to
3787
- # see if the controller is UP first, which is slow; (2) not have to run SSH
3788
- # port forwarding first (we'd just launch a local dashboard which would make
3789
- # REST API calls to the controller dashboard server).
3790
- click.secho('Checking if jobs controller is up...', fg='yellow')
3791
- hint = ('Dashboard is not available if jobs controller is not up. Run a '
3792
- 'managed job first.')
3793
- backend_utils.is_controller_accessible(
3794
- controller=controller_utils.Controllers.JOBS_CONTROLLER,
3795
- stopped_message=hint,
3796
- non_existent_message=hint,
3797
- exit_if_not_accessible=True)
3798
-
3799
- # SSH forward a free local port to remote's dashboard port.
3800
- remote_port = constants.SPOT_DASHBOARD_REMOTE_PORT
3801
- if port is None:
3802
- free_port = common_utils.find_free_port(remote_port)
3803
- else:
3804
- free_port = port
3805
- ssh_command = (
3806
- f'ssh -qNL {free_port}:localhost:{remote_port} '
3807
- f'{controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name}')
3808
- click.echo('Forwarding port: ', nl=False)
3809
- click.secho(f'{ssh_command}', dim=True)
3810
-
3811
- with subprocess.Popen(ssh_command, shell=True,
3812
- start_new_session=True) as ssh_process:
3813
- time.sleep(3) # Added delay for ssh_command to initialize.
3814
- webbrowser.open(f'http://localhost:{free_port}')
3815
- click.secho(
3816
- f'Dashboard is now available at: http://127.0.0.1:{free_port}',
3817
- fg='green')
3818
- try:
3819
- ssh_process.wait()
3820
- except KeyboardInterrupt:
3821
- # When user presses Ctrl-C in terminal, exits the previous ssh
3822
- # command so that <free local port> is freed up.
3823
- try:
3824
- os.killpg(os.getpgid(ssh_process.pid), signal.SIGTERM)
3825
- except ProcessLookupError:
3826
- # This happens if jobs controller is auto-stopped.
3827
- pass
3828
- finally:
3829
- click.echo('Exiting.')
3830
-
3831
-
3832
- # TODO(zhwu): Backward compatibility for the old `sky spot launch` command.
3833
- # It is now renamed to `sky jobs launch` and the old command is deprecated.
3834
- # Remove in v0.8.0.
3835
- @cli.group(cls=_NaturalOrderGroup)
3836
- def spot():
3837
- """Alias for Managed Jobs CLI (default to managed spot jobs)."""
3838
- pass
3839
-
3840
-
3841
- _add_command_alias(jobs,
3842
- jobs_launch,
3843
- new_group=spot,
3844
- override_command_argument={'use_spot': True})
3845
- _add_command_alias(jobs, jobs_queue, new_group=spot)
3846
- _add_command_alias(jobs, jobs_logs, new_group=spot)
3847
- _add_command_alias(jobs, jobs_cancel, new_group=spot)
3848
- _add_command_alias(jobs, jobs_dashboard, new_group=spot)
4142
+ def jobs_dashboard():
4143
+ """Opens a dashboard for managed jobs."""
4144
+ managed_jobs.dashboard()
3849
4145
 
3850
4146
 
3851
4147
  @cli.group(cls=_NaturalOrderGroup)
@@ -3868,7 +4164,7 @@ def _generate_task_with_service(
3868
4164
  env: List[Tuple[str, str]],
3869
4165
  gpus: Optional[str],
3870
4166
  instance_type: Optional[str],
3871
- ports: Tuple[str],
4167
+ ports: Optional[Tuple[str]],
3872
4168
  cpus: Optional[str],
3873
4169
  memory: Optional[str],
3874
4170
  disk_size: Optional[int],
@@ -3900,7 +4196,6 @@ def _generate_task_with_service(
3900
4196
  disk_size=disk_size,
3901
4197
  disk_tier=disk_tier,
3902
4198
  ports=ports,
3903
- entrypoint_name='Service',
3904
4199
  )
3905
4200
  if isinstance(task, sky.Dag):
3906
4201
  raise click.UsageError(
@@ -3910,32 +4205,64 @@ def _generate_task_with_service(
3910
4205
  with ux_utils.print_exception_no_traceback():
3911
4206
  raise ValueError('Service section not found in the YAML file. '
3912
4207
  'To fix, add a valid `service` field.')
3913
- service_port: Optional[int] = None
3914
- for requested_resources in list(task.resources):
3915
- if requested_resources.ports is None or len(
3916
- requested_resources.ports) != 1:
3917
- with ux_utils.print_exception_no_traceback():
3918
- raise ValueError(
3919
- 'Must only specify one port in resources. Each replica '
3920
- 'will use the port specified as application ingress port.')
3921
- service_port_str = requested_resources.ports[0]
3922
- if not service_port_str.isdigit():
3923
- # For the case when the user specified a port range like 10000-10010
3924
- with ux_utils.print_exception_no_traceback():
3925
- raise ValueError(f'Port {service_port_str!r} is not a valid '
3926
- 'port number. Please specify a single port '
3927
- f'instead. Got: {service_port_str!r}')
3928
- # We request all the replicas using the same port for now, but it
3929
- # should be fine to allow different replicas to use different ports
3930
- # in the future.
3931
- resource_port = int(service_port_str)
3932
- if service_port is None:
3933
- service_port = resource_port
3934
- if service_port != resource_port:
3935
- with ux_utils.print_exception_no_traceback():
3936
- raise ValueError(f'Got multiple ports: {service_port} and '
3937
- f'{resource_port} in different resources. '
3938
- 'Please specify single port instead.')
4208
+
4209
+ # NOTE(yi): we only allow one service port now.
4210
+ service_port: Optional[int] = int(
4211
+ task.service.ports) if task.service.ports is not None else None
4212
+ if service_port is None:
4213
+ for requested_resources in list(task.resources):
4214
+ if requested_resources.ports is None:
4215
+ with ux_utils.print_exception_no_traceback():
4216
+ raise ValueError(
4217
+ 'Must specify at least one ports in resources. Each '
4218
+ 'replica will use the port specified as application '
4219
+ 'ingress port if only one port is specified in the '
4220
+ 'replica resources. If there are multiple ports opened '
4221
+ 'in the replica, please set the `service.ports` field '
4222
+ 'in the service config.')
4223
+ requested_ports = list(
4224
+ resources_utils.port_ranges_to_set(requested_resources.ports))
4225
+ if len(requested_ports) > 1:
4226
+ with ux_utils.print_exception_no_traceback():
4227
+ raise ValueError(
4228
+ 'Multiple ports specified in resources. Please '
4229
+ 'specify the main port in the `service.ports` field.')
4230
+ # We request all the replicas using the same port for now, but it
4231
+ # should be fine to allow different replicas to use different ports
4232
+ # in the future.
4233
+ resource_port = requested_ports[0]
4234
+ if service_port is None:
4235
+ service_port = resource_port
4236
+ if service_port != resource_port:
4237
+ with ux_utils.print_exception_no_traceback():
4238
+ raise ValueError(
4239
+ f'Got multiple ports: {service_port} and '
4240
+ f'{resource_port} in different resources. '
4241
+ 'Please specify the same port in all replicas, or '
4242
+ 'explicitly set the service port in the '
4243
+ '`service.ports` section.')
4244
+ assert service_port is not None
4245
+ task.service.set_ports(str(service_port))
4246
+ else:
4247
+ for requested_resources in list(task.resources):
4248
+ if requested_resources.ports is None:
4249
+ with ux_utils.print_exception_no_traceback():
4250
+ raise ValueError(
4251
+ 'Must specify at least one ports in every replica '
4252
+ 'resources.')
4253
+ ports_set = resources_utils.port_ranges_to_set(
4254
+ requested_resources.ports)
4255
+ if service_port not in ports_set:
4256
+ with ux_utils.print_exception_no_traceback():
4257
+ # TODO(tian): Automatically infer resource port from
4258
+ # service port if none of them is specified in the
4259
+ # replica resources.
4260
+ raise ValueError(
4261
+ f'The service port {service_port} specified in the '
4262
+ 'service section is not found in some resources. '
4263
+ 'Please check if the service port is correct or add '
4264
+ 'the service port to replica resources.')
4265
+
3939
4266
  return task
3940
4267
 
3941
4268
 
@@ -3951,7 +4278,7 @@ def _generate_task_with_service(
3951
4278
  type=str,
3952
4279
  help='A service name. Unique for each service. If not provided, '
3953
4280
  'a unique name is autogenerated.')
3954
- @_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS)
4281
+ @_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS + _COMMON_OPTIONS)
3955
4282
  @click.option('--yes',
3956
4283
  '-y',
3957
4284
  is_flag=True,
@@ -3980,6 +4307,7 @@ def serve_up(
3980
4307
  disk_size: Optional[int],
3981
4308
  disk_tier: Optional[str],
3982
4309
  yes: bool,
4310
+ async_call: bool,
3983
4311
  ):
3984
4312
  """Launch a SkyServe service.
3985
4313
 
@@ -4033,21 +4361,16 @@ def serve_up(
4033
4361
  ports=ports,
4034
4362
  not_supported_cmd='sky serve up',
4035
4363
  )
4036
- click.secho('Service Spec:', fg='cyan')
4364
+ click.secho('Service spec:', fg='cyan')
4037
4365
  click.echo(task.service)
4038
4366
 
4039
4367
  click.secho('Each replica will use the following resources (estimated):',
4040
4368
  fg='cyan')
4041
4369
  with sky.Dag() as dag:
4042
4370
  dag.add(task)
4043
- sky.optimize(dag)
4044
-
4045
- if not yes:
4046
- prompt = f'Launching a new service {service_name!r}. Proceed?'
4047
- if prompt is not None:
4048
- click.confirm(prompt, default=True, abort=True, show_default=True)
4049
4371
 
4050
- serve_lib.up(task, service_name)
4372
+ request_id = serve_lib.up(task, service_name, _need_confirmation=not yes)
4373
+ _async_call_or_wait(request_id, async_call, 'sky.serve.up')
4051
4374
 
4052
4375
 
4053
4376
  # TODO(MaoZiming): Update Doc.
@@ -4060,7 +4383,7 @@ def serve_up(
4060
4383
  type=str,
4061
4384
  nargs=-1,
4062
4385
  **_get_shell_complete_args(_complete_file_name))
4063
- @_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS)
4386
+ @_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS + _COMMON_OPTIONS)
4064
4387
  @click.option('--mode',
4065
4388
  default=serve_lib.DEFAULT_UPDATE_MODE.value,
4066
4389
  type=click.Choice([m.value for m in serve_lib.UpdateMode],
@@ -4077,28 +4400,16 @@ def serve_up(
4077
4400
  help='Skip confirmation prompt.')
4078
4401
  @timeline.event
4079
4402
  @usage_lib.entrypoint
4080
- def serve_update(
4081
- service_name: str,
4082
- service_yaml: Tuple[str, ...],
4083
- workdir: Optional[str],
4084
- cloud: Optional[str],
4085
- region: Optional[str],
4086
- zone: Optional[str],
4087
- num_nodes: Optional[int],
4088
- use_spot: Optional[bool],
4089
- image_id: Optional[str],
4090
- env_file: Optional[Dict[str, str]],
4091
- env: List[Tuple[str, str]],
4092
- gpus: Optional[str],
4093
- instance_type: Optional[str],
4094
- ports: Tuple[str],
4095
- cpus: Optional[str],
4096
- memory: Optional[str],
4097
- disk_size: Optional[int],
4098
- disk_tier: Optional[str],
4099
- mode: str,
4100
- yes: bool,
4101
- ):
4403
+ def serve_update(service_name: str, service_yaml: Tuple[str, ...],
4404
+ workdir: Optional[str], cloud: Optional[str],
4405
+ region: Optional[str], zone: Optional[str],
4406
+ num_nodes: Optional[int], use_spot: Optional[bool],
4407
+ image_id: Optional[str], env_file: Optional[Dict[str, str]],
4408
+ env: List[Tuple[str, str]], gpus: Optional[str],
4409
+ instance_type: Optional[str], ports: Tuple[str],
4410
+ cpus: Optional[str], memory: Optional[str],
4411
+ disk_size: Optional[int], disk_tier: Optional[str], mode: str,
4412
+ yes: bool, async_call: bool):
4102
4413
  """Update a SkyServe service.
4103
4414
 
4104
4415
  service_yaml must point to a valid YAML file.
@@ -4149,27 +4460,24 @@ def serve_update(
4149
4460
  ports=ports,
4150
4461
  not_supported_cmd='sky serve update',
4151
4462
  )
4152
- click.secho('Service Spec:', fg='cyan')
4463
+ click.secho('Service spec:', fg='cyan')
4153
4464
  click.echo(task.service)
4154
4465
 
4155
4466
  click.secho('New replica will use the following resources (estimated):',
4156
4467
  fg='cyan')
4157
4468
  with sky.Dag() as dag:
4158
4469
  dag.add(task)
4159
- sky.optimize(dag)
4160
4470
 
4161
- if not yes:
4162
- click.confirm(f'Updating service {service_name!r}. Proceed?',
4163
- default=True,
4164
- abort=True,
4165
- show_default=True)
4166
-
4167
- serve_lib.update(task, service_name, mode=serve_lib.UpdateMode(mode))
4471
+ request_id = serve_lib.update(task,
4472
+ service_name,
4473
+ mode=serve_lib.UpdateMode(mode),
4474
+ _need_confirmation=not yes)
4475
+ _async_call_or_wait(request_id, async_call, 'sky.serve.update')
4168
4476
 
4169
4477
 
4170
4478
  @serve.command('status', cls=_DocumentedCodeCommand)
4171
- @click.option('--all',
4172
- '-a',
4479
+ @click.option('--verbose',
4480
+ '-v',
4173
4481
  default=False,
4174
4482
  is_flag=True,
4175
4483
  required=False,
@@ -4182,7 +4490,7 @@ def serve_update(
4182
4490
  @click.argument('service_names', required=False, type=str, nargs=-1)
4183
4491
  @usage_lib.entrypoint
4184
4492
  # pylint: disable=redefined-builtin
4185
- def serve_status(all: bool, endpoint: bool, service_names: List[str]):
4493
+ def serve_status(verbose: bool, endpoint: bool, service_names: List[str]):
4186
4494
  """Show statuses of SkyServe services.
4187
4495
 
4188
4496
  Show detailed statuses of one or more services. If SERVICE_NAME is not
@@ -4269,17 +4577,22 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]):
4269
4577
  sky serve status
4270
4578
  \b
4271
4579
  # Show detailed status for all services
4272
- sky serve status -a
4580
+ sky serve status -v
4273
4581
  \b
4274
4582
  # Only show status of my-service
4275
4583
  sky serve status my-service
4276
4584
  """
4585
+ service_names_to_query: Optional[List[str]] = service_names
4586
+ if not service_names:
4587
+ service_names_to_query = None
4277
4588
  # This won't pollute the output of --endpoint.
4278
- with rich_utils.safe_status('[cyan]Checking services[/]'):
4279
- _, msg = _get_services(service_names,
4280
- show_all=all,
4281
- show_endpoint=endpoint,
4282
- is_called_by_user=True)
4589
+ with rich_utils.client_status('[cyan]Checking services[/]'):
4590
+ service_status_request_id = serve_lib.status(service_names_to_query)
4591
+ _, msg = _handle_services_request(service_status_request_id,
4592
+ service_names=service_names_to_query,
4593
+ show_all=verbose,
4594
+ show_endpoint=endpoint,
4595
+ is_called_by_user=True)
4283
4596
 
4284
4597
  if not endpoint:
4285
4598
  click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
@@ -4305,8 +4618,21 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]):
4305
4618
  default=False,
4306
4619
  required=False,
4307
4620
  help='Skip confirmation prompt.')
4621
+ @click.option('--replica-id',
4622
+ default=None,
4623
+ type=int,
4624
+ help='Tear down a given replica')
4625
+ @_add_click_options(_COMMON_OPTIONS)
4626
+ @usage_lib.entrypoint
4308
4627
  # pylint: disable=redefined-builtin
4309
- def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool):
4628
+ def serve_down(
4629
+ service_names: List[str],
4630
+ all: bool,
4631
+ purge: bool,
4632
+ yes: bool,
4633
+ replica_id: Optional[int],
4634
+ async_call: bool,
4635
+ ) -> None:
4310
4636
  """Teardown service(s).
4311
4637
 
4312
4638
  SERVICE_NAMES is the name of the service (or glob pattern) to tear down. If
@@ -4333,31 +4659,58 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool):
4333
4659
  \b
4334
4660
  # Forcefully tear down a service in failed status.
4335
4661
  sky serve down failed-service --purge
4662
+ \b
4663
+ # Tear down a specific replica
4664
+ sky serve down my-service --replica-id 1
4665
+ \b
4666
+ # Forcefully tear down a specific replica, even in failed status.
4667
+ sky serve down my-service --replica-id 1 --purge
4336
4668
  """
4337
- if sum([len(service_names) > 0, all]) != 1:
4338
- argument_str = f'SERVICE_NAMES={",".join(service_names)}' if len(
4339
- service_names) > 0 else ''
4669
+ if sum([bool(service_names), all]) != 1:
4670
+ argument_str = (f'SERVICE_NAMES={",".join(service_names)}'
4671
+ if service_names else '')
4340
4672
  argument_str += ' --all' if all else ''
4341
4673
  raise click.UsageError(
4342
4674
  'Can only specify one of SERVICE_NAMES or --all. '
4343
4675
  f'Provided {argument_str!r}.')
4344
4676
 
4345
- backend_utils.is_controller_accessible(
4346
- controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
4347
- stopped_message='All services should have been terminated.',
4348
- exit_if_not_accessible=True)
4349
-
4350
- if not yes:
4351
- quoted_service_names = [f'{name!r}' for name in service_names]
4352
- service_identity_str = f'service(s) {", ".join(quoted_service_names)}'
4677
+ replica_id_is_defined = replica_id is not None
4678
+ if replica_id_is_defined:
4679
+ if len(service_names) != 1:
4680
+ service_names_str = ', '.join(service_names)
4681
+ raise click.UsageError(f'The --replica-id option can only be used '
4682
+ f'with a single service name. Got: '
4683
+ f'{service_names_str}.')
4353
4684
  if all:
4354
- service_identity_str = 'all services'
4355
- click.confirm(f'Terminating {service_identity_str}. Proceed?',
4356
- default=True,
4357
- abort=True,
4358
- show_default=True)
4359
-
4360
- serve_lib.down(service_names=service_names, all=all, purge=purge)
4685
+ raise click.UsageError('The --replica-id option cannot be used '
4686
+ 'with the --all option.')
4687
+ if not yes:
4688
+ if replica_id_is_defined:
4689
+ click.confirm(
4690
+ f'Terminating replica ID {replica_id} in '
4691
+ f'{service_names[0]!r}. Proceed?',
4692
+ default=True,
4693
+ abort=True,
4694
+ show_default=True)
4695
+ else:
4696
+ quoted_service_names = [f'{name!r}' for name in service_names]
4697
+ list_service_str = ', '.join(quoted_service_names)
4698
+ service_identity_str = f'service(s) {list_service_str}'
4699
+ if all:
4700
+ service_identity_str = 'all services'
4701
+ click.confirm(f'Terminating {service_identity_str}. Proceed?',
4702
+ default=True,
4703
+ abort=True,
4704
+ show_default=True)
4705
+
4706
+ if replica_id_is_defined:
4707
+ request_id = serve_lib.terminate_replica(service_names[0], replica_id,
4708
+ purge)
4709
+ else:
4710
+ request_id = serve_lib.down(service_names=service_names,
4711
+ all=all,
4712
+ purge=purge)
4713
+ _async_call_or_wait(request_id, async_call, 'sky.serve.down')
4361
4714
 
4362
4715
 
4363
4716
  @serve.command('logs', cls=_DocumentedCodeCommand)
@@ -4484,7 +4837,7 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]:
4484
4837
  required=True,
4485
4838
  type=str,
4486
4839
  help='Benchmark name.')
4487
- @_add_click_options(_TASK_OPTIONS_WITH_NAME)
4840
+ @_add_click_options(_TASK_OPTIONS_WITH_NAME + _COMMON_OPTIONS)
4488
4841
  @click.option('--gpus',
4489
4842
  required=False,
4490
4843
  type=str,
@@ -4519,26 +4872,27 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]:
4519
4872
  help='Skip confirmation prompt.')
4520
4873
  @usage_lib.entrypoint
4521
4874
  def benchmark_launch(
4522
- entrypoint: str,
4523
- benchmark: str,
4524
- name: Optional[str],
4525
- workdir: Optional[str],
4526
- cloud: Optional[str],
4527
- region: Optional[str],
4528
- zone: Optional[str],
4529
- gpus: Optional[str],
4530
- num_nodes: Optional[int],
4531
- use_spot: Optional[bool],
4532
- image_id: Optional[str],
4533
- env_file: Optional[Dict[str, str]],
4534
- env: List[Tuple[str, str]],
4535
- cpus: Optional[str],
4536
- memory: Optional[str],
4537
- disk_size: Optional[int],
4538
- disk_tier: Optional[str],
4539
- ports: Tuple[str],
4540
- idle_minutes_to_autostop: Optional[int],
4541
- yes: bool,
4875
+ entrypoint: str,
4876
+ benchmark: str,
4877
+ name: Optional[str],
4878
+ workdir: Optional[str],
4879
+ cloud: Optional[str],
4880
+ region: Optional[str],
4881
+ zone: Optional[str],
4882
+ gpus: Optional[str],
4883
+ num_nodes: Optional[int],
4884
+ use_spot: Optional[bool],
4885
+ image_id: Optional[str],
4886
+ env_file: Optional[Dict[str, str]],
4887
+ env: List[Tuple[str, str]],
4888
+ cpus: Optional[str],
4889
+ memory: Optional[str],
4890
+ disk_size: Optional[int],
4891
+ disk_tier: Optional[str],
4892
+ ports: Tuple[str],
4893
+ idle_minutes_to_autostop: Optional[int],
4894
+ yes: bool,
4895
+ async_call: bool, # pylint: disable=unused-argument
4542
4896
  ) -> None:
4543
4897
  """Benchmark a task on different resources.
4544
4898
 
@@ -4547,6 +4901,7 @@ def benchmark_launch(
4547
4901
  Alternatively, specify the benchmarking resources in your YAML (see doc),
4548
4902
  which allows benchmarking on many more resource fields.
4549
4903
  """
4904
+ # TODO(zhwu): move benchmark to SkyPilot API server
4550
4905
  env = _merge_env_vars(env_file, env)
4551
4906
  record = benchmark_state.get_benchmark_from_name(benchmark)
4552
4907
  if record is not None:
@@ -4565,7 +4920,7 @@ def benchmark_launch(
4565
4920
  'Please provide a YAML file.')
4566
4921
  assert config is not None, (is_yaml, config)
4567
4922
 
4568
- click.secho('Benchmarking a task from YAML spec: ', fg='yellow', nl=False)
4923
+ click.secho('Benchmarking a task from YAML: ', fg='cyan', nl=False)
4569
4924
  click.secho(entrypoint, bold=True)
4570
4925
 
4571
4926
  candidates = _get_candidate_configs(entrypoint)
@@ -4686,7 +5041,7 @@ def benchmark_launch(
4686
5041
  if idle_minutes_to_autostop is None:
4687
5042
  idle_minutes_to_autostop = 5
4688
5043
  commandline_args['idle-minutes-to-autostop'] = idle_minutes_to_autostop
4689
- if len(env) > 0:
5044
+ if env:
4690
5045
  commandline_args['env'] = [f'{k}={v}' for k, v in env]
4691
5046
 
4692
5047
  # Launch the benchmarking clusters in detach mode in parallel.
@@ -4699,11 +5054,11 @@ def benchmark_launch(
4699
5054
  f'\n{colorama.Fore.CYAN}Benchmark name: '
4700
5055
  f'{colorama.Style.BRIGHT}{benchmark}{colorama.Style.RESET_ALL}'
4701
5056
  '\nTo see the benchmark results: '
4702
- f'{backend_utils.BOLD}sky bench show '
4703
- f'{benchmark}{backend_utils.RESET_BOLD}'
5057
+ f'{ux_utils.BOLD}sky bench show '
5058
+ f'{benchmark}{ux_utils.RESET_BOLD}'
4704
5059
  '\nTo teardown the clusters: '
4705
- f'{backend_utils.BOLD}sky bench down '
4706
- f'{benchmark}{backend_utils.RESET_BOLD}')
5060
+ f'{ux_utils.BOLD}sky bench down '
5061
+ f'{benchmark}{ux_utils.RESET_BOLD}')
4707
5062
  subprocess_utils.run('sky bench ls')
4708
5063
  else:
4709
5064
  logger.error('No benchmarking clusters are created.')
@@ -4937,10 +5292,7 @@ def benchmark_down(
4937
5292
  continue
4938
5293
  to_stop.append(cluster)
4939
5294
 
4940
- _down_or_stop_clusters(to_stop,
4941
- apply_to_all=False,
4942
- down=True,
4943
- no_confirm=yes)
5295
+ _down_or_stop_clusters(to_stop, down=True, no_confirm=yes)
4944
5296
 
4945
5297
 
4946
5298
  @bench.command('delete', cls=_DocumentedCodeCommand)
@@ -4965,7 +5317,7 @@ def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
4965
5317
  raise click.BadParameter(
4966
5318
  'Either specify benchmarks or use --all to delete all benchmarks.')
4967
5319
  to_delete = []
4968
- if len(benchmarks) > 0:
5320
+ if benchmarks:
4969
5321
  for benchmark in benchmarks:
4970
5322
  record = benchmark_state.get_benchmark_from_name(benchmark)
4971
5323
  if record is None:
@@ -4974,7 +5326,7 @@ def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
4974
5326
  to_delete.append(record)
4975
5327
  if all:
4976
5328
  to_delete = benchmark_state.get_benchmarks()
4977
- if len(benchmarks) > 0:
5329
+ if benchmarks:
4978
5330
  print('Both --all and benchmark(s) specified '
4979
5331
  'for sky bench delete. Letting --all take effect.')
4980
5332
 
@@ -5011,8 +5363,8 @@ def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
5011
5363
  message = (f'{colorama.Fore.YELLOW}Benchmark {benchmark} '
5012
5364
  f'has {num_clusters} un-terminated cluster{plural}. '
5013
5365
  f'Terminate the cluster{plural} with '
5014
- f'{backend_utils.BOLD} sky bench down {benchmark} '
5015
- f'{backend_utils.RESET_BOLD} '
5366
+ f'{ux_utils.BOLD} sky bench down {benchmark} '
5367
+ f'{ux_utils.RESET_BOLD} '
5016
5368
  'before deleting the benchmark report.')
5017
5369
  success = False
5018
5370
  else:
@@ -5051,181 +5403,276 @@ def local():
5051
5403
  is_flag=True,
5052
5404
  help='Launch cluster without GPU support even '
5053
5405
  'if GPUs are detected on the host.')
5406
+ @click.option(
5407
+ '--ips',
5408
+ type=str,
5409
+ required=False,
5410
+ help='Path to the file containing IP addresses of remote machines.')
5411
+ @click.option('--ssh-user',
5412
+ type=str,
5413
+ required=False,
5414
+ help='SSH username for accessing remote machines.')
5415
+ @click.option('--ssh-key-path',
5416
+ type=str,
5417
+ required=False,
5418
+ help='Path to the SSH private key.')
5419
+ @click.option('--cleanup',
5420
+ is_flag=True,
5421
+ help='Clean up the remote cluster instead of deploying it.')
5054
5422
  @local.command('up', cls=_DocumentedCodeCommand)
5423
+ @_add_click_options(_COMMON_OPTIONS)
5055
5424
  @usage_lib.entrypoint
5056
- def local_up(gpus: bool):
5057
- """Creates a local cluster."""
5058
- cluster_created = False
5059
-
5060
- # Check if GPUs are available on the host
5061
- local_gpus_available = backend_utils.check_local_gpus()
5062
- gpus = gpus and local_gpus_available
5063
-
5064
- # Check if ~/.kube/config exists:
5065
- if os.path.exists(os.path.expanduser('~/.kube/config')):
5066
- curr_context = kubernetes_utils.get_current_kube_config_context_name()
5067
- skypilot_context = 'kind-skypilot'
5068
- if curr_context is not None and curr_context != skypilot_context:
5069
- click.echo(
5070
- f'Current context in kube config: {curr_context}'
5071
- '\nWill automatically switch to kind-skypilot after the local '
5072
- 'cluster is created.')
5073
- message_str = 'Creating local cluster{}...'
5074
- message_str = message_str.format((' with GPU support (this may take up '
5075
- 'to 15 minutes)') if gpus else '')
5076
- path_to_package = os.path.dirname(os.path.dirname(__file__))
5077
- up_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes',
5078
- 'create_cluster.sh')
5079
-
5080
- # Get directory of script and run it from there
5081
- cwd = os.path.dirname(os.path.abspath(up_script_path))
5082
- run_command = up_script_path + ' --gpus' if gpus else up_script_path
5083
- run_command = shlex.split(run_command)
5084
-
5085
- # Setup logging paths
5086
- run_timestamp = backend_utils.get_run_timestamp()
5087
- log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
5088
- 'local_up.log')
5089
- tail_cmd = 'tail -n100 -f ' + log_path
5090
-
5091
- click.echo(message_str)
5092
- style = colorama.Style
5093
- click.echo('To view detailed progress: '
5094
- f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
5095
-
5096
- returncode, _, stderr = log_lib.run_with_log(
5097
- cmd=run_command,
5098
- log_path=log_path,
5099
- require_outputs=True,
5100
- stream_logs=False,
5101
- line_processor=log_utils.SkyLocalUpLineProcessor(),
5102
- cwd=cwd)
5103
-
5104
- # Kind always writes to stderr even if it succeeds.
5105
- # If the failure happens after the cluster is created, we need
5106
- # to strip all stderr of "No kind clusters found.", which is
5107
- # printed when querying with kind get clusters.
5108
- stderr = stderr.replace('No kind clusters found.\n', '')
5109
-
5110
- if returncode == 0:
5111
- cluster_created = True
5112
- elif returncode == 100:
5113
- click.echo(f'{colorama.Fore.GREEN}Local cluster already '
5114
- f'exists.{style.RESET_ALL}\n'
5115
- 'If you want to delete it instead, run: sky local down')
5116
- else:
5117
- with ux_utils.print_exception_no_traceback():
5118
- raise RuntimeError(
5119
- 'Failed to create local cluster. '
5120
- f'Full log: {log_path}'
5121
- f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
5122
- # Run sky check
5123
- with rich_utils.safe_status('[bold cyan]Running sky check...'):
5124
- sky_check.check(clouds=['kubernetes'], quiet=True)
5125
- if cluster_created:
5126
- # Prepare completion message which shows CPU and GPU count
5127
- # Get number of CPUs
5128
- p = subprocess_utils.run(
5129
- 'kubectl get nodes -o jsonpath=\'{.items[0].status.capacity.cpu}\'',
5130
- capture_output=True)
5131
- num_cpus = int(p.stdout.decode('utf-8'))
5132
-
5133
- # GPU count/type parsing
5134
- gpu_message = ''
5135
- gpu_hint = ''
5136
- if gpus:
5137
- # Get GPU model by querying the node labels
5138
- label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
5139
- gpu_type_cmd = f'kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
5140
- try:
5141
- # Run the command and capture the output
5142
- gpu_count_output = subprocess.check_output(gpu_type_cmd,
5143
- shell=True,
5144
- text=True)
5145
- gpu_type_str = gpu_count_output.strip() + ' '
5146
- except subprocess.CalledProcessError as e:
5147
- output = str(e.output.decode('utf-8'))
5148
- logger.warning(f'Failed to get GPU type: {output}')
5149
- gpu_type_str = ''
5150
-
5151
- # Get number of GPUs (sum of nvidia.com/gpu resources)
5152
- gpu_count_command = 'kubectl get nodes -o=jsonpath=\'{range .items[*]}{.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}\' | awk \'{sum += $1} END {print sum}\'' # pylint: disable=line-too-long
5153
- try:
5154
- # Run the command and capture the output
5155
- gpu_count_output = subprocess.check_output(gpu_count_command,
5156
- shell=True,
5157
- text=True)
5158
- gpu_count = gpu_count_output.strip(
5159
- ) # Remove any extra whitespace
5160
- gpu_message = f' and {gpu_count} {gpu_type_str}GPUs'
5161
- except subprocess.CalledProcessError as e:
5162
- output = str(e.output.decode('utf-8'))
5163
- logger.warning(f'Failed to get GPU count: {output}')
5164
- gpu_message = f' with {gpu_type_str}GPU support'
5165
-
5166
- gpu_hint = (
5167
- '\nHint: To see the list of GPUs in the cluster, '
5168
- 'run \'sky show-gpus --cloud kubernetes\'') if gpus else ''
5169
-
5170
- if num_cpus < 2:
5171
- click.echo('Warning: Local cluster has less than 2 CPUs. '
5172
- 'This may cause issues with running tasks.')
5173
- click.echo(
5174
- f'\n{colorama.Fore.GREEN}Local Kubernetes cluster created '
5175
- 'successfully with '
5176
- f'{num_cpus} CPUs{gpu_message}.{style.RESET_ALL}\n`sky launch` can '
5177
- 'now run tasks locally.'
5178
- '\nHint: To change the number of CPUs, change your docker '
5179
- 'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.' # pylint: disable=line-too-long
5180
- f'{gpu_hint}')
5425
+ def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
5426
+ cleanup: bool, async_call: bool):
5427
+ """Creates a local or remote cluster."""
5428
+
5429
+ def _validate_args(ips, ssh_user, ssh_key_path, cleanup):
5430
+ # If any of --ips, --ssh-user, or --ssh-key-path is specified,
5431
+ # all must be specified
5432
+ if bool(ips) or bool(ssh_user) or bool(ssh_key_path):
5433
+ if not (ips and ssh_user and ssh_key_path):
5434
+ raise click.BadParameter(
5435
+ 'All --ips, --ssh-user, and --ssh-key-path '
5436
+ 'must be specified together.')
5437
+
5438
+ # --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
5439
+ # are all provided
5440
+ if cleanup and not (ips and ssh_user and ssh_key_path):
5441
+ raise click.BadParameter('--cleanup can only be used with '
5442
+ '--ips, --ssh-user and --ssh-key-path.')
5443
+
5444
+ _validate_args(ips, ssh_user, ssh_key_path, cleanup)
5445
+
5446
+ # If remote deployment arguments are specified, run remote up script
5447
+ ip_list = None
5448
+ ssh_key = None
5449
+ if ips and ssh_user and ssh_key_path:
5450
+ # Read and validate IP file
5451
+ try:
5452
+ with open(os.path.expanduser(ips), 'r', encoding='utf-8') as f:
5453
+ ip_list = f.read().strip().splitlines()
5454
+ if not ip_list:
5455
+ raise click.BadParameter(f'IP file is empty: {ips}')
5456
+ except (IOError, OSError) as e:
5457
+ raise click.BadParameter(f'Failed to read IP file {ips}: {str(e)}')
5458
+
5459
+ # Read and validate SSH key file
5460
+ try:
5461
+ with open(os.path.expanduser(ssh_key_path), 'r',
5462
+ encoding='utf-8') as f:
5463
+ ssh_key = f.read()
5464
+ if not ssh_key:
5465
+ raise click.BadParameter(
5466
+ f'SSH key file is empty: {ssh_key_path}')
5467
+ except (IOError, OSError) as e:
5468
+ raise click.BadParameter(
5469
+ f'Failed to read SSH key file {ssh_key_path}: {str(e)}')
5470
+
5471
+ request_id = sdk.local_up(gpus, ip_list, ssh_user, ssh_key, cleanup)
5472
+ _async_call_or_wait(request_id, async_call, request_name='local up')
5181
5473
 
5182
5474
 
5183
5475
  @local.command('down', cls=_DocumentedCodeCommand)
5476
+ @_add_click_options(_COMMON_OPTIONS)
5184
5477
  @usage_lib.entrypoint
5185
- def local_down():
5478
+ def local_down(async_call: bool):
5186
5479
  """Deletes a local cluster."""
5187
- cluster_removed = False
5480
+ request_id = sdk.local_down()
5481
+ _async_call_or_wait(request_id, async_call, request_name='sky.local.down')
5188
5482
 
5189
- path_to_package = os.path.dirname(os.path.dirname(__file__))
5190
- down_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes',
5191
- 'delete_cluster.sh')
5192
5483
 
5193
- cwd = os.path.dirname(os.path.abspath(down_script_path))
5194
- run_command = shlex.split(down_script_path)
5484
+ @cli.group(cls=_NaturalOrderGroup)
5485
+ def api():
5486
+ """SkyPilot API server commands."""
5487
+ pass
5195
5488
 
5196
- # Setup logging paths
5197
- run_timestamp = backend_utils.get_run_timestamp()
5198
- log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
5199
- 'local_down.log')
5200
- tail_cmd = 'tail -n100 -f ' + log_path
5201
5489
 
5202
- with rich_utils.safe_status('[bold cyan]Removing local cluster...'):
5203
- style = colorama.Style
5204
- click.echo('To view detailed progress: '
5205
- f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
5206
- returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
5207
- log_path=log_path,
5208
- require_outputs=True,
5209
- stream_logs=False,
5210
- cwd=cwd)
5211
- stderr = stderr.replace('No kind clusters found.\n', '')
5212
-
5213
- if returncode == 0:
5214
- cluster_removed = True
5215
- elif returncode == 100:
5216
- click.echo('\nLocal cluster does not exist.')
5217
- else:
5218
- with ux_utils.print_exception_no_traceback():
5219
- raise RuntimeError(
5220
- 'Failed to create local cluster. '
5221
- f'Stdout: {stdout}'
5222
- f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
5223
- if cluster_removed:
5224
- # Run sky check
5225
- with rich_utils.safe_status('[bold cyan]Running sky check...'):
5226
- sky_check.check(clouds=['kubernetes'], quiet=True)
5227
- click.echo(
5228
- f'{colorama.Fore.GREEN}Local cluster removed.{style.RESET_ALL}')
5490
+ @api.command('start', cls=_DocumentedCodeCommand)
5491
+ @click.option('--deploy',
5492
+ type=bool,
5493
+ is_flag=True,
5494
+ default=False,
5495
+ required=False,
5496
+ help=('Deploy the SkyPilot API server. When set to True, '
5497
+ 'SkyPilot API server will use all resources on the host '
5498
+ 'machine assuming the machine is dedicated to SkyPilot API '
5499
+ 'server; host will also be set to 0.0.0.0 to allow remote '
5500
+ 'access.'))
5501
+ @click.option('--host',
5502
+ default='127.0.0.1',
5503
+ type=click.Choice(server_common.AVAILBLE_LOCAL_API_SERVER_HOSTS),
5504
+ required=False,
5505
+ help=('The host to deploy the SkyPilot API server. To allow '
5506
+ 'remote access, set this to 0.0.0.0'))
5507
+ @click.option('--foreground',
5508
+ is_flag=True,
5509
+ default=False,
5510
+ required=False,
5511
+ help='Run the SkyPilot API server in the foreground and output '
5512
+ 'its logs to stdout/stderr. Allowing external systems '
5513
+ 'to manage the process lifecycle and collect logs directly. '
5514
+ 'This is useful when the API server is managed by systems '
5515
+ 'like systemd and Kubernetes.')
5516
+ @usage_lib.entrypoint
5517
+ def api_start(deploy: bool, host: Optional[str], foreground: bool):
5518
+ """Starts the SkyPilot API server locally."""
5519
+ sdk.api_start(deploy=deploy, host=host, foreground=foreground)
5520
+
5521
+
5522
+ @api.command('stop', cls=_DocumentedCodeCommand)
5523
+ @usage_lib.entrypoint
5524
+ def api_stop():
5525
+ """Stops the SkyPilot API server locally."""
5526
+ sdk.api_stop()
5527
+
5528
+
5529
+ @api.command('logs', cls=_DocumentedCodeCommand)
5530
+ @click.argument('request_id', required=False, type=str)
5531
+ @click.option('--server-logs',
5532
+ is_flag=True,
5533
+ default=False,
5534
+ required=False,
5535
+ help='Stream the server logs.')
5536
+ @click.option('--log-path',
5537
+ '-l',
5538
+ required=False,
5539
+ type=str,
5540
+ help='The path to the log file to stream.')
5541
+ @click.option('--tail',
5542
+ required=False,
5543
+ type=int,
5544
+ help=('Number of lines to show from the end of the logs. '
5545
+ '(default: None)'))
5546
+ @click.option('--follow/--no-follow',
5547
+ is_flag=True,
5548
+ default=True,
5549
+ required=False,
5550
+ help='Follow the logs.')
5551
+ @usage_lib.entrypoint
5552
+ def api_logs(request_id: Optional[str], server_logs: bool,
5553
+ log_path: Optional[str], tail: Optional[int], follow: bool):
5554
+ """Stream the logs of a request running on SkyPilot API server."""
5555
+ if not server_logs and request_id is None and log_path is None:
5556
+ # TODO(zhwu): get the latest request ID.
5557
+ raise click.BadParameter('Please provide the request ID or log path.')
5558
+ if server_logs:
5559
+ sdk.api_server_logs(follow=follow, tail=tail)
5560
+ return
5561
+
5562
+ if request_id is not None and log_path is not None:
5563
+ raise click.BadParameter(
5564
+ 'Only one of request ID and log path can be provided.')
5565
+ sdk.stream_and_get(request_id, log_path, tail)
5566
+
5567
+
5568
+ @api.command('cancel', cls=_DocumentedCodeCommand)
5569
+ @click.argument('request_ids', required=False, type=str, nargs=-1)
5570
+ @click.option('--all',
5571
+ '-a',
5572
+ is_flag=True,
5573
+ default=False,
5574
+ required=False,
5575
+ help='Cancel all your requests.')
5576
+ @click.option('--all-users',
5577
+ '-u',
5578
+ is_flag=True,
5579
+ default=False,
5580
+ required=False,
5581
+ help='Cancel all requests from all users.')
5582
+ @usage_lib.entrypoint
5583
+ # pylint: disable=redefined-builtin
5584
+ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
5585
+ """Cancel a request running on SkyPilot API server."""
5586
+ if all or all_users:
5587
+ keyword = 'ALL USERS\'' if all_users else 'YOUR'
5588
+ user_input = click.prompt(
5589
+ f'This will cancel all {keyword} requests.\n'
5590
+ f'To proceed, please type {colorama.Style.BRIGHT}'
5591
+ f'\'cancel all requests\'{colorama.Style.RESET_ALL}',
5592
+ type=str)
5593
+ if user_input != 'cancel all requests':
5594
+ raise click.Abort()
5595
+ if all:
5596
+ request_ids = None
5597
+ cancelled_request_ids = sdk.get(
5598
+ sdk.api_cancel(request_ids=request_ids, all_users=all_users))
5599
+ if not cancelled_request_ids:
5600
+ click.secho('No requests need to be cancelled.', fg='green')
5601
+ elif len(cancelled_request_ids) == 1:
5602
+ click.secho(f'Cancelled 1 request: {cancelled_request_ids[0]}',
5603
+ fg='green')
5604
+ else:
5605
+ click.secho(f'Cancelled {len(cancelled_request_ids)} requests.',
5606
+ fg='green')
5607
+
5608
+
5609
+ @api.command('status', cls=_DocumentedCodeCommand)
5610
+ @click.argument('request_ids', required=False, type=str, nargs=-1)
5611
+ @click.option('--all-status',
5612
+ '-a',
5613
+ is_flag=True,
5614
+ default=False,
5615
+ required=False,
5616
+ help='Show requests of all statuses.')
5617
+ @click.option('--verbose',
5618
+ '-v',
5619
+ is_flag=True,
5620
+ default=False,
5621
+ required=False,
5622
+ help='Show more details.')
5623
+ @usage_lib.entrypoint
5624
+ # pylint: disable=redefined-builtin
5625
+ def api_status(request_ids: Optional[List[str]], all_status: bool,
5626
+ verbose: bool):
5627
+ """List requests on SkyPilot API server."""
5628
+ if not request_ids:
5629
+ request_ids = None
5630
+ request_list = sdk.api_status(request_ids, all_status)
5631
+ columns = ['ID', 'User', 'Name']
5632
+ if verbose:
5633
+ columns.append('Cluster')
5634
+ columns.extend(['Created', 'Status'])
5635
+ table = log_utils.create_table(columns)
5636
+ for request in request_list:
5637
+ r_id = request.request_id
5638
+ if not verbose:
5639
+ r_id = common_utils.truncate_long_string(r_id, 36)
5640
+ req_status = requests.RequestStatus(request.status)
5641
+ row = [r_id, request.user_name, request.name]
5642
+ if verbose:
5643
+ row.append(request.cluster_name)
5644
+ row.extend([
5645
+ log_utils.readable_time_duration(request.created_at),
5646
+ req_status.colored_str()
5647
+ ])
5648
+ table.add_row(row)
5649
+ click.echo(table)
5650
+
5651
+
5652
+ @api.command('login', cls=_DocumentedCodeCommand)
5653
+ @click.option('--endpoint',
5654
+ '-e',
5655
+ required=False,
5656
+ help='The SkyPilot API server endpoint.')
5657
+ @usage_lib.entrypoint
5658
+ def api_login(endpoint: Optional[str]):
5659
+ """Logs into a SkyPilot API server."""
5660
+ sdk.api_login(endpoint)
5661
+
5662
+
5663
+ @api.command('info', cls=_DocumentedCodeCommand)
5664
+ @usage_lib.entrypoint
5665
+ def api_info():
5666
+ """Shows the SkyPilot API server URL."""
5667
+ url = server_common.get_server_url()
5668
+ api_server_info = sdk.api_info()
5669
+ user_name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
5670
+ user_hash = common_utils.get_user_hash()
5671
+ click.echo(f'Using SkyPilot API server: {url}\n'
5672
+ f'{ux_utils.INDENT_SYMBOL}Status: {api_server_info["status"]}, '
5673
+ f'commit: {api_server_info["commit"]}, '
5674
+ f'version: {api_server_info["version"]}\n'
5675
+ f'{ux_utils.INDENT_LAST_SYMBOL}User: {user_name} ({user_hash})')
5229
5676
 
5230
5677
 
5231
5678
  def main():