skypilot-nightly 1.0.0.dev20250215__py3-none-any.whl → 1.0.0.dev20250217__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (172) hide show
  1. sky/__init__.py +48 -22
  2. sky/adaptors/aws.py +2 -1
  3. sky/adaptors/azure.py +4 -4
  4. sky/adaptors/cloudflare.py +4 -4
  5. sky/adaptors/kubernetes.py +8 -8
  6. sky/authentication.py +42 -45
  7. sky/backends/backend.py +2 -2
  8. sky/backends/backend_utils.py +108 -221
  9. sky/backends/cloud_vm_ray_backend.py +283 -282
  10. sky/benchmark/benchmark_utils.py +6 -2
  11. sky/check.py +40 -28
  12. sky/cli.py +1213 -1116
  13. sky/client/__init__.py +1 -0
  14. sky/client/cli.py +5644 -0
  15. sky/client/common.py +345 -0
  16. sky/client/sdk.py +1757 -0
  17. sky/cloud_stores.py +12 -6
  18. sky/clouds/__init__.py +0 -2
  19. sky/clouds/aws.py +20 -13
  20. sky/clouds/azure.py +5 -3
  21. sky/clouds/cloud.py +1 -1
  22. sky/clouds/cudo.py +2 -1
  23. sky/clouds/do.py +2 -1
  24. sky/clouds/fluidstack.py +3 -2
  25. sky/clouds/gcp.py +10 -8
  26. sky/clouds/ibm.py +8 -7
  27. sky/clouds/kubernetes.py +7 -6
  28. sky/clouds/lambda_cloud.py +8 -7
  29. sky/clouds/oci.py +4 -3
  30. sky/clouds/paperspace.py +2 -1
  31. sky/clouds/runpod.py +2 -1
  32. sky/clouds/scp.py +8 -7
  33. sky/clouds/service_catalog/__init__.py +3 -3
  34. sky/clouds/service_catalog/aws_catalog.py +7 -1
  35. sky/clouds/service_catalog/common.py +4 -2
  36. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +2 -2
  37. sky/clouds/utils/oci_utils.py +1 -1
  38. sky/clouds/vast.py +2 -1
  39. sky/clouds/vsphere.py +2 -1
  40. sky/core.py +263 -99
  41. sky/dag.py +4 -0
  42. sky/data/mounting_utils.py +2 -1
  43. sky/data/storage.py +97 -35
  44. sky/data/storage_utils.py +69 -9
  45. sky/exceptions.py +138 -5
  46. sky/execution.py +47 -50
  47. sky/global_user_state.py +105 -22
  48. sky/jobs/__init__.py +12 -14
  49. sky/jobs/client/__init__.py +0 -0
  50. sky/jobs/client/sdk.py +296 -0
  51. sky/jobs/constants.py +30 -1
  52. sky/jobs/controller.py +12 -6
  53. sky/jobs/dashboard/dashboard.py +2 -6
  54. sky/jobs/recovery_strategy.py +22 -29
  55. sky/jobs/server/__init__.py +1 -0
  56. sky/jobs/{core.py → server/core.py} +101 -34
  57. sky/jobs/server/dashboard_utils.py +64 -0
  58. sky/jobs/server/server.py +182 -0
  59. sky/jobs/utils.py +32 -23
  60. sky/models.py +27 -0
  61. sky/optimizer.py +9 -11
  62. sky/provision/__init__.py +6 -3
  63. sky/provision/aws/config.py +2 -2
  64. sky/provision/aws/instance.py +1 -1
  65. sky/provision/azure/instance.py +1 -1
  66. sky/provision/cudo/instance.py +1 -1
  67. sky/provision/do/instance.py +1 -1
  68. sky/provision/do/utils.py +0 -5
  69. sky/provision/fluidstack/fluidstack_utils.py +4 -3
  70. sky/provision/fluidstack/instance.py +4 -2
  71. sky/provision/gcp/instance.py +1 -1
  72. sky/provision/instance_setup.py +2 -2
  73. sky/provision/kubernetes/constants.py +8 -0
  74. sky/provision/kubernetes/instance.py +1 -1
  75. sky/provision/kubernetes/utils.py +67 -76
  76. sky/provision/lambda_cloud/instance.py +3 -15
  77. sky/provision/logging.py +1 -1
  78. sky/provision/oci/instance.py +7 -4
  79. sky/provision/paperspace/instance.py +1 -1
  80. sky/provision/provisioner.py +3 -2
  81. sky/provision/runpod/instance.py +1 -1
  82. sky/provision/vast/instance.py +1 -1
  83. sky/provision/vast/utils.py +2 -1
  84. sky/provision/vsphere/instance.py +2 -11
  85. sky/resources.py +55 -40
  86. sky/serve/__init__.py +6 -10
  87. sky/serve/client/__init__.py +0 -0
  88. sky/serve/client/sdk.py +366 -0
  89. sky/serve/constants.py +3 -0
  90. sky/serve/replica_managers.py +10 -10
  91. sky/serve/serve_utils.py +56 -36
  92. sky/serve/server/__init__.py +0 -0
  93. sky/serve/{core.py → server/core.py} +37 -17
  94. sky/serve/server/server.py +117 -0
  95. sky/serve/service.py +8 -1
  96. sky/server/__init__.py +1 -0
  97. sky/server/common.py +441 -0
  98. sky/server/constants.py +21 -0
  99. sky/server/html/log.html +174 -0
  100. sky/server/requests/__init__.py +0 -0
  101. sky/server/requests/executor.py +462 -0
  102. sky/server/requests/payloads.py +481 -0
  103. sky/server/requests/queues/__init__.py +0 -0
  104. sky/server/requests/queues/mp_queue.py +76 -0
  105. sky/server/requests/requests.py +567 -0
  106. sky/server/requests/serializers/__init__.py +0 -0
  107. sky/server/requests/serializers/decoders.py +192 -0
  108. sky/server/requests/serializers/encoders.py +166 -0
  109. sky/server/server.py +1095 -0
  110. sky/server/stream_utils.py +144 -0
  111. sky/setup_files/MANIFEST.in +1 -0
  112. sky/setup_files/dependencies.py +12 -4
  113. sky/setup_files/setup.py +1 -1
  114. sky/sky_logging.py +9 -13
  115. sky/skylet/autostop_lib.py +2 -2
  116. sky/skylet/constants.py +46 -12
  117. sky/skylet/events.py +5 -6
  118. sky/skylet/job_lib.py +78 -66
  119. sky/skylet/log_lib.py +17 -11
  120. sky/skypilot_config.py +79 -94
  121. sky/task.py +119 -73
  122. sky/templates/aws-ray.yml.j2 +4 -4
  123. sky/templates/azure-ray.yml.j2 +3 -2
  124. sky/templates/cudo-ray.yml.j2 +3 -2
  125. sky/templates/fluidstack-ray.yml.j2 +3 -2
  126. sky/templates/gcp-ray.yml.j2 +3 -2
  127. sky/templates/ibm-ray.yml.j2 +3 -2
  128. sky/templates/jobs-controller.yaml.j2 +1 -12
  129. sky/templates/kubernetes-ray.yml.j2 +3 -2
  130. sky/templates/lambda-ray.yml.j2 +3 -2
  131. sky/templates/oci-ray.yml.j2 +3 -2
  132. sky/templates/paperspace-ray.yml.j2 +3 -2
  133. sky/templates/runpod-ray.yml.j2 +3 -2
  134. sky/templates/scp-ray.yml.j2 +3 -2
  135. sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
  136. sky/templates/vsphere-ray.yml.j2 +4 -2
  137. sky/templates/websocket_proxy.py +64 -0
  138. sky/usage/constants.py +8 -0
  139. sky/usage/usage_lib.py +45 -11
  140. sky/utils/accelerator_registry.py +33 -53
  141. sky/utils/admin_policy_utils.py +2 -1
  142. sky/utils/annotations.py +51 -0
  143. sky/utils/cli_utils/status_utils.py +33 -3
  144. sky/utils/cluster_utils.py +356 -0
  145. sky/utils/command_runner.py +69 -14
  146. sky/utils/common.py +74 -0
  147. sky/utils/common_utils.py +133 -93
  148. sky/utils/config_utils.py +204 -0
  149. sky/utils/control_master_utils.py +2 -3
  150. sky/utils/controller_utils.py +133 -147
  151. sky/utils/dag_utils.py +72 -24
  152. sky/utils/kubernetes/deploy_remote_cluster.sh +2 -2
  153. sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
  154. sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
  155. sky/utils/log_utils.py +83 -23
  156. sky/utils/message_utils.py +81 -0
  157. sky/utils/registry.py +127 -0
  158. sky/utils/resources_utils.py +2 -2
  159. sky/utils/rich_utils.py +213 -34
  160. sky/utils/schemas.py +19 -2
  161. sky/{status_lib.py → utils/status_lib.py} +12 -7
  162. sky/utils/subprocess_utils.py +51 -35
  163. sky/utils/timeline.py +7 -2
  164. sky/utils/ux_utils.py +95 -25
  165. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/METADATA +8 -3
  166. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/RECORD +170 -132
  167. sky/clouds/cloud_registry.py +0 -76
  168. sky/utils/cluster_yaml_utils.py +0 -24
  169. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/LICENSE +0 -0
  170. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/WHEEL +0 -0
  171. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/entry_points.txt +0 -0
  172. {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/top_level.txt +0 -0
sky/cli.py CHANGED
@@ -26,62 +26,66 @@ each other.
26
26
  import copy
27
27
  import datetime
28
28
  import functools
29
- import multiprocessing
29
+ import getpass
30
30
  import os
31
31
  import shlex
32
32
  import shutil
33
- import signal
34
33
  import subprocess
35
34
  import sys
36
35
  import textwrap
37
- import time
36
+ import traceback
38
37
  import typing
39
- from typing import Any, Dict, List, Optional, Tuple, Union
40
- import webbrowser
38
+ from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
41
39
 
42
40
  import click
43
41
  import colorama
44
42
  import dotenv
43
+ import requests as requests_lib
45
44
  from rich import progress as rich_progress
46
45
  import yaml
47
46
 
48
47
  import sky
49
- from sky import admin_policy
50
48
  from sky import backends
51
- from sky import check as sky_check
52
- from sky import clouds as sky_clouds
53
- from sky import core
49
+ from sky import clouds
54
50
  from sky import exceptions
55
51
  from sky import global_user_state
56
52
  from sky import jobs as managed_jobs
53
+ from sky import models
57
54
  from sky import serve as serve_lib
58
55
  from sky import sky_logging
59
- from sky import status_lib
60
56
  from sky.adaptors import common as adaptors_common
61
- from sky.backends import backend_utils
62
57
  from sky.benchmark import benchmark_state
63
58
  from sky.benchmark import benchmark_utils
59
+ from sky.client import sdk
64
60
  from sky.clouds import service_catalog
65
61
  from sky.data import storage_utils
62
+ from sky.provision.kubernetes import constants as kubernetes_constants
66
63
  from sky.provision.kubernetes import utils as kubernetes_utils
64
+ from sky.server import common as server_common
65
+ from sky.server import constants as server_constants
66
+ from sky.server.requests import requests
67
67
  from sky.skylet import constants
68
68
  from sky.skylet import job_lib
69
- from sky.skylet import log_lib
70
69
  from sky.usage import usage_lib
71
- from sky.utils import admin_policy_utils
70
+ from sky.utils import annotations
71
+ from sky.utils import cluster_utils
72
+ from sky.utils import common
72
73
  from sky.utils import common_utils
73
74
  from sky.utils import controller_utils
74
75
  from sky.utils import dag_utils
76
+ from sky.utils import env_options
75
77
  from sky.utils import log_utils
78
+ from sky.utils import registry
76
79
  from sky.utils import resources_utils
77
80
  from sky.utils import rich_utils
81
+ from sky.utils import status_lib
78
82
  from sky.utils import subprocess_utils
79
83
  from sky.utils import timeline
80
84
  from sky.utils import ux_utils
81
85
  from sky.utils.cli_utils import status_utils
82
86
 
83
87
  if typing.TYPE_CHECKING:
84
- from sky.backends import backend as backend_lib
88
+ import types
85
89
 
86
90
  pd = adaptors_common.LazyImport('pandas')
87
91
  logger = sky_logging.init_logger(__name__)
@@ -101,23 +105,92 @@ _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = (
101
105
  '{cluster_num} cluster{plural} {verb}. Please specify {cause} '
102
106
  'cluster to show its {property}.\nUsage: `sky status --{flag} <cluster>`')
103
107
 
104
- _ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, '
105
- 'please retry after a while.')
106
-
107
108
  _DAG_NOT_SUPPORTED_MESSAGE = ('YAML specifies a DAG which is only supported by '
108
109
  '`sky jobs launch`. `{command}` supports a '
109
110
  'single task only.')
110
111
 
111
112
 
112
- def _get_glob_clusters(clusters: List[str], silent: bool = False) -> List[str]:
113
- """Returns a list of clusters that match the glob pattern."""
114
- glob_clusters = []
115
- for cluster in clusters:
116
- glob_cluster = global_user_state.get_glob_cluster_names(cluster)
117
- if not glob_cluster and not silent:
118
- click.echo(f'Cluster {cluster} not found.')
119
- glob_clusters.extend(glob_cluster)
120
- return list(set(glob_clusters))
113
def _get_cluster_records_and_set_ssh_config(
    clusters: Optional[List[str]],
    refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
    all_users: bool = False,
) -> List[dict]:
    """Fetches cluster records and syncs the local SSH config with them.

    For every returned record: if it has a handle with cached external IPs,
    the cluster is (re-)added to the SSH config; otherwise its entry is
    removed. SSH config entries for clusters that no longer exist are also
    removed, but only when specific clusters or all users were queried.

    Args:
        clusters: A list of cluster names to query. If None, query all clusters.
        refresh: The refresh mode for the status command.
        all_users: Whether to query clusters from all users.
            If clusters is not None, this field is ignored because cluster list
            can include other users' clusters.

    Returns:
        The cluster records returned for the status request.
    """
    # TODO(zhwu): we should move this function into SDK.
    # TODO(zhwu): this additional RTT makes CLIs slow. We should optimize this.
    if clusters is not None:
        all_users = True
    request_id = sdk.status(clusters, refresh=refresh, all_users=all_users)
    cluster_records = sdk.stream_and_get(request_id)
    # Update the SSH config for all clusters
    for record in cluster_records:
        handle = record['handle']
        if handle is not None and handle.cached_external_ips is not None:
            credentials = record['credentials']
            if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
                # Replace the proxy command to proxy through the SkyPilot API
                # server with websocket.
                key_path = (
                    cluster_utils.SSHConfigHelper.generate_local_key_file(
                        handle.cluster_name, credentials))
                # Instead of directly using websocket_proxy.py, we add an
                # additional proxy, so that ssh can use the head pod in the
                # cluster to jump to worker pods.
                proxy_command = (
                    f'ssh -tt -i {key_path} '
                    '-o StrictHostKeyChecking=no '
                    '-o UserKnownHostsFile=/dev/null '
                    '-o IdentitiesOnly=yes '
                    '-W %h:%p '
                    f'{handle.ssh_user}@127.0.0.1 '
                    '-o ProxyCommand='
                    # TODO(zhwu): write the template to a temp file, don't use
                    # the one in skypilot repo, to avoid changing the file when
                    # updating skypilot.
                    f'\'{sys.executable} {sky.__root_dir__}/templates/'
                    f'websocket_proxy.py '
                    f'{server_common.get_server_url().split("://")[1]} '
                    f'{handle.cluster_name}\'')
                credentials['ssh_proxy_command'] = proxy_command
            cluster_utils.SSHConfigHelper.add_cluster(
                handle.cluster_name,
                handle.cached_external_ips,
                credentials,
                handle.cached_external_ssh_ports,
                handle.docker_user,
                handle.ssh_user,
            )
        else:
            # If the cluster is not UP or does not have IPs, we need to remove
            # the cluster from the SSH config.
            # This branch is also taken when `handle` is None, so the name must
            # come from the record itself rather than from the handle.
            cluster_utils.SSHConfigHelper.remove_cluster(record['name'])

    # Clean up SSH configs for clusters that do not exist.
    #
    # We do this in a conservative way: only when a query is made for all users
    # or specific clusters. Without those, the table returned only contains the
    # current user's clusters, and the information is not enough for
    # removing clusters, because SkyPilot has no idea whether to remove
    # ssh config of a cluster from another user.
    clusters_exists = {record['name'] for record in cluster_records}
    if clusters is not None:
        for cluster in clusters:
            if cluster not in clusters_exists:
                cluster_utils.SSHConfigHelper.remove_cluster(cluster)
    elif all_users:
        for cluster_name in cluster_utils.SSHConfigHelper.list_cluster_names():
            if cluster_name not in clusters_exists:
                cluster_utils.SSHConfigHelper.remove_cluster(cluster_name)

    return cluster_records
121
194
 
122
195
 
123
196
  def _get_glob_storages(storages: List[str]) -> List[str]:
@@ -147,6 +220,44 @@ def _parse_env_var(env_var: str) -> Tuple[str, str]:
147
220
  return ret[0], ret[1]
148
221
 
149
222
 
223
def _async_call_or_wait(request_id: str, async_call: bool,
                        request_name: str) -> Any:
    """Reports a submitted request, or waits for it and returns its result.

    Args:
        request_id: ID of the request to follow.
        async_call: If True, print hints for viewing/cancelling the request and
            return immediately. If False, stream the request until it is done.
        request_name: Name shown in the 'Submitted ... request' message.

    Returns:
        The result of ``sdk.stream_and_get(request_id)`` when waiting;
        None when ``async_call`` is True.

    Raises:
        KeyboardInterrupt: Re-raised after logging how to view or cancel the
            request, if the wait is interrupted.
    """
    short_request_id = request_id[:8]
    dim = colorama.Style.DIM
    reset = colorama.Style.RESET_ALL

    # Fire-and-forget path: tell the user how to follow up, then return.
    if async_call:
        click.secho(f'Submitted {request_name} request: {request_id}',
                    fg='green')
        submitted_hint = (
            f'{ux_utils.INDENT_SYMBOL}{dim}Check logs with: '
            f'sky api logs {short_request_id}{reset}\n'
            f'{ux_utils.INDENT_SYMBOL}{dim}Or, visit: '
            f'{server_common.get_server_url()}/api/stream?'
            f'request_id={short_request_id}'
            f'\n{ux_utils.INDENT_LAST_SYMBOL}{dim}To cancel '
            'the request, run: '
            f'{ux_utils.BOLD}sky api cancel {short_request_id}'
            f'{reset}\n')
        click.echo(submitted_hint)
        return None

    # Blocking path: stream until completion.
    try:
        return sdk.stream_and_get(request_id)
    except KeyboardInterrupt:
        # Only the local wait is interrupted; point the user at the request.
        headline = ux_utils.starting_message(
            'Request will continue running asynchronously.')
        follow_up = (
            f'\n{ux_utils.INDENT_SYMBOL}{dim}View logs: '
            f'{ux_utils.BOLD}sky api logs {short_request_id}'
            f'{reset}'
            f'\n{ux_utils.INDENT_SYMBOL}{dim}Or, '
            'visit: '
            f'{server_common.get_server_url()}/api/stream?'
            f'request_id={short_request_id}'
            f'\n{ux_utils.INDENT_LAST_SYMBOL}{dim}To cancel '
            'the request, run: '
            f'{ux_utils.BOLD}sky api cancel {short_request_id}'
            f'{reset}'
            f'\n{reset}')
        logger.info(headline + follow_up)
        raise
259
+
260
+
150
261
  def _merge_env_vars(env_dict: Optional[Dict[str, str]],
151
262
  env_list: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
152
263
  """Merges all values from env_list into env_dict."""
@@ -157,6 +268,15 @@ def _merge_env_vars(env_dict: Optional[Dict[str, str]],
157
268
  return list(env_dict.items())
158
269
 
159
270
 
271
# Options shared by commands that submit a request to the API server.
_COMMON_OPTIONS = [
    click.option(
        '--async/--no-async',
        'async_call',
        is_flag=True,
        default=False,
        required=False,
        help='Run the command asynchronously.',
    ),
]
279
+
160
280
  _TASK_OPTIONS = [
161
281
  click.option(
162
282
  '--workdir',
@@ -308,14 +428,28 @@ def _complete_cluster_name(ctx: click.Context, param: click.Parameter,
308
428
  incomplete: str) -> List[str]:
309
429
  """Handle shell completion for cluster names."""
310
430
  del ctx, param # Unused.
311
- return global_user_state.get_cluster_names_start_with(incomplete)
431
+ # TODO(zhwu): we send requests to API server for completion, which can cause
432
+ # large latency. We should investigate caching mechanism if needed.
433
+ response = requests_lib.get(
434
+ f'{server_common.get_server_url()}'
435
+ f'/api/completion/cluster_name?incomplete={incomplete}',
436
+ timeout=2.0,
437
+ )
438
+ response.raise_for_status()
439
+ return response.json()
312
440
 
313
441
 
314
442
  def _complete_storage_name(ctx: click.Context, param: click.Parameter,
315
443
  incomplete: str) -> List[str]:
316
444
  """Handle shell completion for storage names."""
317
445
  del ctx, param # Unused.
318
- return global_user_state.get_storage_names_start_with(incomplete)
446
+ response = requests_lib.get(
447
+ f'{server_common.get_server_url()}'
448
+ f'/api/completion/storage_name?incomplete={incomplete}',
449
+ timeout=2.0,
450
+ )
451
+ response.raise_for_status()
452
+ return response.json()
319
453
 
320
454
 
321
455
  def _complete_file_name(ctx: click.Context, param: click.Parameter,
@@ -493,7 +627,7 @@ def _parse_override_params(
493
627
  if cloud.lower() == 'none':
494
628
  override_params['cloud'] = None
495
629
  else:
496
- override_params['cloud'] = sky_clouds.CLOUD_REGISTRY.from_str(cloud)
630
+ override_params['cloud'] = registry.CLOUD_REGISTRY.from_str(cloud)
497
631
  if region is not None:
498
632
  if region.lower() == 'none':
499
633
  override_params['region'] = None
@@ -550,99 +684,6 @@ def _parse_override_params(
550
684
  return override_params
551
685
 
552
686
 
553
- def _launch_with_confirm(
554
- task: sky.Task,
555
- backend: backends.Backend,
556
- cluster: Optional[str],
557
- *,
558
- dryrun: bool,
559
- detach_run: bool,
560
- detach_setup: bool = False,
561
- no_confirm: bool = False,
562
- idle_minutes_to_autostop: Optional[int] = None,
563
- down: bool = False, # pylint: disable=redefined-outer-name
564
- retry_until_up: bool = False,
565
- no_setup: bool = False,
566
- clone_disk_from: Optional[str] = None,
567
- fast: bool = False,
568
- ):
569
- """Launch a cluster with a Task."""
570
- if cluster is None:
571
- cluster = backend_utils.generate_cluster_name()
572
-
573
- clone_source_str = ''
574
- if clone_disk_from is not None:
575
- clone_source_str = f' from the disk of {clone_disk_from!r}'
576
- task, _ = backend_utils.check_can_clone_disk_and_override_task(
577
- clone_disk_from, cluster, task)
578
-
579
- with sky.Dag() as dag:
580
- dag.add(task)
581
-
582
- maybe_status, handle = backend_utils.refresh_cluster_status_handle(cluster)
583
- if maybe_status is None:
584
- # Show the optimize log before the prompt if the cluster does not exist.
585
- try:
586
- sky_check.get_cached_enabled_clouds_or_refresh(
587
- raise_if_no_cloud_access=True)
588
- except exceptions.NoCloudAccessError as e:
589
- # Catch the exception where the public cloud is not enabled, and
590
- # make it yellow for better visibility.
591
- with ux_utils.print_exception_no_traceback():
592
- raise RuntimeError(f'{colorama.Fore.YELLOW}{e}'
593
- f'{colorama.Style.RESET_ALL}') from e
594
- dag, _ = admin_policy_utils.apply(
595
- dag,
596
- request_options=admin_policy.RequestOptions(
597
- cluster_name=cluster,
598
- idle_minutes_to_autostop=idle_minutes_to_autostop,
599
- down=down,
600
- dryrun=dryrun,
601
- ),
602
- )
603
- dag = sky.optimize(dag)
604
- task = dag.tasks[0]
605
-
606
- if handle is not None:
607
- backend.check_resources_fit_cluster(handle, task)
608
-
609
- confirm_shown = False
610
- if not no_confirm:
611
- # Prompt if (1) --cluster is None, or (2) cluster doesn't exist, or (3)
612
- # it exists but is STOPPED.
613
- prompt = None
614
- if maybe_status is None:
615
- cluster_str = '' if cluster is None else f' {cluster!r}'
616
- prompt = (
617
- f'Launching a new cluster{cluster_str}{clone_source_str}. '
618
- 'Proceed?')
619
- elif maybe_status == status_lib.ClusterStatus.STOPPED:
620
- prompt = f'Restarting the stopped cluster {cluster!r}. Proceed?'
621
- if prompt is not None:
622
- confirm_shown = True
623
- click.confirm(prompt, default=True, abort=True, show_default=True)
624
-
625
- if not confirm_shown:
626
- click.secho('Running on cluster: ', fg='cyan', nl=False)
627
- click.secho(cluster)
628
-
629
- sky.launch(
630
- dag,
631
- dryrun=dryrun,
632
- stream_logs=True,
633
- cluster_name=cluster,
634
- detach_setup=detach_setup,
635
- detach_run=detach_run,
636
- backend=backend,
637
- idle_minutes_to_autostop=idle_minutes_to_autostop,
638
- down=down,
639
- retry_until_up=retry_until_up,
640
- no_setup=no_setup,
641
- clone_disk_from=clone_disk_from,
642
- fast=fast,
643
- )
644
-
645
-
646
687
  def _check_yaml(entrypoint: str) -> Tuple[bool, Optional[Dict[str, Any]]]:
647
688
  """Checks if entrypoint is a readable YAML file.
648
689
 
@@ -953,6 +994,7 @@ def _deprecate_and_hide_command(group, command_to_deprecate,
953
994
  prog_name='skypilot',
954
995
  message='%(prog)s, commit %(version)s',
955
996
  help='Show the commit hash and exit')
997
+ @annotations.client_api
956
998
  def cli():
957
999
  pass
958
1000
 
@@ -973,20 +1015,9 @@ def cli():
973
1015
  default=False,
974
1016
  is_flag=True,
975
1017
  help='If True, do not actually run the job.')
976
- @click.option(
977
- '--detach-setup',
978
- '-s',
979
- default=False,
980
- is_flag=True,
981
- help=
982
- ('If True, run setup in non-interactive mode as part of the job itself. '
983
- 'You can safely ctrl-c to detach from logging, and it will not interrupt '
984
- 'the setup process. To see the logs again after detaching, use `sky logs`.'
985
- ' To cancel setup, cancel the job via `sky cancel`. Useful for long-'
986
- 'running setup commands.'))
987
1018
  @click.option(
988
1019
  '--detach-run',
989
- '-d',
1020
+ '-d/-no-d',
990
1021
  default=False,
991
1022
  is_flag=True,
992
1023
  help=('If True, as soon as a job is submitted, return from this call '
@@ -994,11 +1025,13 @@ def cli():
994
1025
  @click.option('--docker',
995
1026
  'backend_name',
996
1027
  flag_value=backends.LocalDockerBackend.NAME,
1028
+ default=False,
997
1029
  hidden=True,
998
1030
  help=('(Deprecated) Local docker support is deprecated. '
999
1031
  'To run locally, create a local Kubernetes cluster with '
1000
1032
  '``sky local up``.'))
1001
- @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS)
1033
+ @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS +
1034
+ _COMMON_OPTIONS)
1002
1035
  @click.option(
1003
1036
  '--idle-minutes-to-autostop',
1004
1037
  '-i',
@@ -1067,37 +1100,36 @@ def cli():
1067
1100
  'provisioning and setup steps.'))
1068
1101
  @usage_lib.entrypoint
1069
1102
  def launch(
1070
- entrypoint: Tuple[str, ...],
1071
- cluster: Optional[str],
1072
- dryrun: bool,
1073
- detach_setup: bool,
1074
- detach_run: bool,
1075
- backend_name: Optional[str],
1076
- name: Optional[str],
1077
- workdir: Optional[str],
1078
- cloud: Optional[str],
1079
- region: Optional[str],
1080
- zone: Optional[str],
1081
- gpus: Optional[str],
1082
- cpus: Optional[str],
1083
- memory: Optional[str],
1084
- instance_type: Optional[str],
1085
- num_nodes: Optional[int],
1086
- use_spot: Optional[bool],
1087
- image_id: Optional[str],
1088
- env_file: Optional[Dict[str, str]],
1089
- env: List[Tuple[str, str]],
1090
- disk_size: Optional[int],
1091
- disk_tier: Optional[str],
1092
- ports: Tuple[str, ...],
1093
- idle_minutes_to_autostop: Optional[int],
1094
- down: bool, # pylint: disable=redefined-outer-name
1095
- retry_until_up: bool,
1096
- yes: bool,
1097
- no_setup: bool,
1098
- clone_disk_from: Optional[str],
1099
- fast: bool,
1100
- ):
1103
+ entrypoint: Tuple[str, ...],
1104
+ cluster: Optional[str],
1105
+ dryrun: bool,
1106
+ detach_run: bool,
1107
+ backend_name: Optional[str],
1108
+ name: Optional[str],
1109
+ workdir: Optional[str],
1110
+ cloud: Optional[str],
1111
+ region: Optional[str],
1112
+ zone: Optional[str],
1113
+ gpus: Optional[str],
1114
+ cpus: Optional[str],
1115
+ memory: Optional[str],
1116
+ instance_type: Optional[str],
1117
+ num_nodes: Optional[int],
1118
+ use_spot: Optional[bool],
1119
+ image_id: Optional[str],
1120
+ env_file: Optional[Dict[str, str]],
1121
+ env: List[Tuple[str, str]],
1122
+ disk_size: Optional[int],
1123
+ disk_tier: Optional[str],
1124
+ ports: Tuple[str, ...],
1125
+ idle_minutes_to_autostop: Optional[int],
1126
+ down: bool, # pylint: disable=redefined-outer-name
1127
+ retry_until_up: bool,
1128
+ yes: bool,
1129
+ no_setup: bool,
1130
+ clone_disk_from: Optional[str],
1131
+ fast: bool,
1132
+ async_call: bool):
1101
1133
  """Launch a cluster or task.
1102
1134
 
1103
1135
  If ENTRYPOINT points to a valid YAML file, it is read in as the task
@@ -1107,6 +1139,14 @@ def launch(
1107
1139
  and they undergo job queue scheduling.
1108
1140
  """
1109
1141
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
1142
+ # TODO(zhwu): the current --async is a bit inconsistent with the direct
1143
+ # sky launch, as `sky api logs` does not contain the logs for the actual job
1144
+ # submitted, while the synchronous way of `sky launch` does. We should
1145
+ # consider having the job logs available in `sky api logs` as well.
1146
+ # Reason for not doing it right now: immediately tailing the logs for the
1147
+ # job can take up resources on the API server. When there are a lot of
1148
+ # `launch` submitted asynchronously, the log tailing may overwhelm the API
1149
+ # server, if the jobs are long running.
1110
1150
  env = _merge_env_vars(env_file, env)
1111
1151
  controller_utils.check_cluster_name_not_controller(
1112
1152
  cluster, operation_str='Launching tasks on it')
@@ -1159,19 +1199,35 @@ def launch(
1159
1199
  f'{colorama.Style.RESET_ALL}{colorama.Style.BRIGHT}sky serve up'
1160
1200
  f'{colorama.Style.RESET_ALL}')
1161
1201
 
1162
- _launch_with_confirm(task,
1163
- backend,
1164
- cluster,
1165
- dryrun=dryrun,
1166
- detach_setup=detach_setup,
1167
- detach_run=detach_run,
1168
- no_confirm=yes,
1169
- idle_minutes_to_autostop=idle_minutes_to_autostop,
1170
- down=down,
1171
- retry_until_up=retry_until_up,
1172
- no_setup=no_setup,
1173
- clone_disk_from=clone_disk_from,
1174
- fast=fast)
1202
+ request_id = sdk.launch(
1203
+ task,
1204
+ dryrun=dryrun,
1205
+ cluster_name=cluster,
1206
+ backend=backend,
1207
+ idle_minutes_to_autostop=idle_minutes_to_autostop,
1208
+ down=down,
1209
+ retry_until_up=retry_until_up,
1210
+ no_setup=no_setup,
1211
+ clone_disk_from=clone_disk_from,
1212
+ fast=fast,
1213
+ _need_confirmation=not yes,
1214
+ )
1215
+ job_id_handle = _async_call_or_wait(request_id, async_call, 'sky.launch')
1216
+ if not async_call:
1217
+ job_id, handle = job_id_handle
1218
+ if not handle:
1219
+ assert dryrun, 'handle should only be None when dryrun is true'
1220
+ return
1221
+ # Add ssh config for the cluster
1222
+ _get_cluster_records_and_set_ssh_config(
1223
+ clusters=[handle.get_cluster_name()])
1224
+ # job_id will be None if no job was submitted (e.g. no entrypoint
1225
+ # provided)
1226
+ if not detach_run and job_id is not None:
1227
+ sdk.tail_logs(handle.get_cluster_name(), job_id, follow=True)
1228
+ click.secho(
1229
+ ux_utils.command_hint_messages(ux_utils.CommandHintType.CLUSTER_JOB,
1230
+ job_id, handle.get_cluster_name()))
1175
1231
 
1176
1232
 
1177
1233
  @cli.command(cls=_DocumentedCodeCommand)
@@ -1199,32 +1255,19 @@ def launch(
1199
1255
  is_flag=True,
1200
1256
  help=('If True, as soon as a job is submitted, return from this call '
1201
1257
  'and do not stream execution logs.'))
1202
- @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS)
1258
+ @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS +
1259
+ _COMMON_OPTIONS)
1203
1260
  @usage_lib.entrypoint
1204
1261
  # pylint: disable=redefined-builtin
1205
- def exec(
1206
- cluster: Optional[str],
1207
- cluster_option: Optional[str],
1208
- entrypoint: Tuple[str, ...],
1209
- detach_run: bool,
1210
- name: Optional[str],
1211
- cloud: Optional[str],
1212
- region: Optional[str],
1213
- zone: Optional[str],
1214
- workdir: Optional[str],
1215
- gpus: Optional[str],
1216
- ports: Tuple[str],
1217
- instance_type: Optional[str],
1218
- num_nodes: Optional[int],
1219
- use_spot: Optional[bool],
1220
- image_id: Optional[str],
1221
- env_file: Optional[Dict[str, str]],
1222
- env: List[Tuple[str, str]],
1223
- cpus: Optional[str],
1224
- memory: Optional[str],
1225
- disk_size: Optional[int],
1226
- disk_tier: Optional[str],
1227
- ):
1262
+ def exec(cluster: Optional[str], cluster_option: Optional[str],
1263
+ entrypoint: Tuple[str, ...], detach_run: bool, name: Optional[str],
1264
+ cloud: Optional[str], region: Optional[str], zone: Optional[str],
1265
+ workdir: Optional[str], gpus: Optional[str], ports: Tuple[str],
1266
+ instance_type: Optional[str], num_nodes: Optional[int],
1267
+ use_spot: Optional[bool], image_id: Optional[str],
1268
+ env_file: Optional[Dict[str, str]], env: List[Tuple[str, str]],
1269
+ cpus: Optional[str], memory: Optional[str], disk_size: Optional[int],
1270
+ disk_tier: Optional[str], async_call: bool):
1228
1271
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
1229
1272
  """Execute a task or command on an existing cluster.
1230
1273
 
@@ -1297,11 +1340,6 @@ def exec(
1297
1340
  env = _merge_env_vars(env_file, env)
1298
1341
  controller_utils.check_cluster_name_not_controller(
1299
1342
  cluster, operation_str='Executing task on it')
1300
- handle = global_user_state.get_handle_from_cluster_name(cluster)
1301
- if handle is None:
1302
- raise click.BadParameter(f'Cluster {cluster!r} not found. '
1303
- 'Use `sky launch` to provision first.')
1304
- backend = backend_utils.get_backend_from_handle(handle)
1305
1343
 
1306
1344
  task_or_dag = _make_task_or_dag_from_entrypoint_with_overrides(
1307
1345
  entrypoint=entrypoint,
@@ -1331,21 +1369,21 @@ def exec(
1331
1369
 
1332
1370
  click.secho('Submitting job to cluster: ', fg='cyan', nl=False)
1333
1371
  click.secho(cluster)
1334
- sky.exec(task, backend=backend, cluster_name=cluster, detach_run=detach_run)
1372
+ request_id = sdk.exec(task, cluster_name=cluster)
1373
+ job_id_handle = _async_call_or_wait(request_id, async_call, 'sky.exec')
1374
+ if not async_call and not detach_run:
1375
+ job_id, _ = job_id_handle
1376
+ sdk.tail_logs(cluster, job_id, follow=True)
1335
1377
 
1336
1378
 
1337
- def _get_managed_jobs(
1338
- refresh: bool,
1339
- skip_finished: bool,
1379
+ def _handle_jobs_queue_request(
1380
+ request_id: str,
1340
1381
  show_all: bool,
1341
1382
  limit_num_jobs_to_show: bool = False,
1342
1383
  is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
1343
1384
  """Get the in-progress managed jobs.
1344
1385
 
1345
1386
  Args:
1346
- refresh: Query the latest statuses, restarting the jobs controller if
1347
- stopped.
1348
- skip_finished: Show only in-progress jobs.
1349
1387
  show_all: Show all information of each job (e.g., region, price).
1350
1388
  limit_num_jobs_to_show: If True, limit the number of jobs to show to
1351
1389
  _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS, which is mainly used by
@@ -1359,14 +1397,13 @@ def _get_managed_jobs(
1359
1397
  msg contains the error message. Otherwise, msg contains the formatted
1360
1398
  managed job table.
1361
1399
  """
1400
+ # TODO(SKY-980): remove unnecessary fallbacks on the client side.
1362
1401
  num_in_progress_jobs = None
1402
+ msg = ''
1363
1403
  try:
1364
1404
  if not is_called_by_user:
1365
1405
  usage_lib.messages.usage.set_internal()
1366
- with sky_logging.silent():
1367
- # Make the call silent
1368
- managed_jobs_ = managed_jobs.queue(refresh=refresh,
1369
- skip_finished=skip_finished)
1406
+ managed_jobs_ = sdk.get(request_id)
1370
1407
  num_in_progress_jobs = len(set(job['job_id'] for job in managed_jobs_))
1371
1408
  except exceptions.ClusterNotUpError as e:
1372
1409
  controller_status = e.cluster_status
@@ -1379,16 +1416,18 @@ def _get_managed_jobs(
1379
1416
  msg += (f' (See finished managed jobs: {colorama.Style.BRIGHT}'
1380
1417
  f'sky jobs queue --refresh{colorama.Style.RESET_ALL})')
1381
1418
  except RuntimeError as e:
1382
- msg = ''
1383
1419
  try:
1384
1420
  # Check the controller status again, as the RuntimeError is likely
1385
1421
  # due to the controller being autostopped when querying the jobs.
1386
1422
  controller_type = controller_utils.Controllers.JOBS_CONTROLLER
1387
- record = backend_utils.refresh_cluster_record(
1388
- controller_type.value.cluster_name,
1389
- cluster_status_lock_timeout=0)
1390
- if (record is None or
1391
- record['status'] == status_lib.ClusterStatus.STOPPED):
1423
+ # Query status of the controller cluster. We add a wildcard because
1424
+ # the controller cluster name can have a suffix like
1425
+ # '-remote-<hash>' when using remote API server.
1426
+ records = sdk.get(
1427
+ sdk.status(
1428
+ cluster_names=[controller_type.value.cluster_name + '*']))
1429
+ if (not records or
1430
+ records[0]['status'] == status_lib.ClusterStatus.STOPPED):
1392
1431
  msg = controller_type.value.default_hint_if_non_existent
1393
1432
  except Exception: # pylint: disable=broad-except
1394
1433
  # This is to an best effort to find the latest controller status to
@@ -1402,8 +1441,12 @@ def _get_managed_jobs(
1402
1441
  f'Details: {common_utils.format_exception(e, use_bracket=True)}'
1403
1442
  )
1404
1443
  except Exception as e: # pylint: disable=broad-except
1405
- msg = ('Failed to query managed jobs: '
1406
- f'{common_utils.format_exception(e, use_bracket=True)}')
1444
+ msg = ''
1445
+ if env_options.Options.SHOW_DEBUG_INFO.get():
1446
+ msg += traceback.format_exc()
1447
+ msg += '\n'
1448
+ msg += ('Failed to query managed jobs: '
1449
+ f'{common_utils.format_exception(e, use_bracket=True)}')
1407
1450
  else:
1408
1451
  max_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS
1409
1452
  if limit_num_jobs_to_show else None)
@@ -1413,10 +1456,12 @@ def _get_managed_jobs(
1413
1456
  return num_in_progress_jobs, msg
1414
1457
 
1415
1458
 
1416
- def _get_services(service_names: Optional[List[str]],
1417
- show_all: bool,
1418
- show_endpoint: bool,
1419
- is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
1459
+ def _handle_services_request(
1460
+ request_id: str,
1461
+ service_names: Optional[List[str]],
1462
+ show_all: bool,
1463
+ show_endpoint: bool,
1464
+ is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
1420
1465
  """Get service statuses.
1421
1466
 
1422
1467
  Args:
@@ -1435,12 +1480,8 @@ def _get_services(service_names: Optional[List[str]],
1435
1480
  try:
1436
1481
  if not is_called_by_user:
1437
1482
  usage_lib.messages.usage.set_internal()
1438
- with sky_logging.silent():
1439
- if not service_names:
1440
- # Change empty list to None
1441
- service_names = None
1442
- service_records = serve_lib.status(service_names)
1443
- num_services = len(service_records)
1483
+ service_records = sdk.get(request_id)
1484
+ num_services = len(service_records)
1444
1485
  except exceptions.ClusterNotUpError as e:
1445
1486
  controller_status = e.cluster_status
1446
1487
  msg = str(e)
@@ -1454,11 +1495,14 @@ def _get_services(service_names: Optional[List[str]],
1454
1495
  # due to the controller being autostopped when querying the
1455
1496
  # services.
1456
1497
  controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
1457
- record = backend_utils.refresh_cluster_record(
1458
- controller_type.value.cluster_name,
1459
- cluster_status_lock_timeout=0)
1460
- if (record is None or
1461
- record['status'] == status_lib.ClusterStatus.STOPPED):
1498
+ # Query status of the controller cluster. We add a wildcard because
1499
+ # the controller cluster name can have a suffix like
1500
+ # '-remote-<hash>' when using remote API server.
1501
+ records = sdk.get(
1502
+ sdk.status(
1503
+ cluster_names=[controller_type.value.cluster_name + '*']))
1504
+ if (not records or
1505
+ records[0]['status'] == status_lib.ClusterStatus.STOPPED):
1462
1506
  msg = controller_type.value.default_hint_if_non_existent
1463
1507
  except Exception: # pylint: disable=broad-except
1464
1508
  # This is to an best effort to find the latest controller status to
@@ -1482,7 +1526,8 @@ def _get_services(service_names: Optional[List[str]],
1482
1526
  f'{service_num} service{plural} found. Please specify '
1483
1527
  'an existing service to show its endpoint. Usage: '
1484
1528
  'sky serve status --endpoint <service-name>')
1485
- msg = serve_lib.get_endpoint(service_records[0])
1529
+ endpoint = service_records[0]['endpoint']
1530
+ msg = '-' if endpoint is None else endpoint
1486
1531
  else:
1487
1532
  msg = serve_lib.format_service_table(service_records, show_all)
1488
1533
  service_not_found_msg = ''
@@ -1503,8 +1548,8 @@ def _status_kubernetes(show_all: bool):
1503
1548
  Args:
1504
1549
  show_all (bool): Show all job information (e.g., start time, failures).
1505
1550
  """
1506
- all_clusters, unmanaged_clusters, all_jobs, context = (
1507
- core.status_kubernetes())
1551
+ all_clusters, unmanaged_clusters, all_jobs, context = (sdk.stream_and_get(
1552
+ sdk.status_kubernetes()))
1508
1553
  click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1509
1554
  f'Kubernetes cluster state (context: {context})'
1510
1555
  f'{colorama.Style.RESET_ALL}')
@@ -1524,9 +1569,76 @@ def _status_kubernetes(show_all: bool):
1524
1569
  f'{colorama.Style.RESET_ALL}')
1525
1570
 
1526
1571
 
1572
+ def _show_endpoint(query_clusters: Optional[List[str]],
1573
+ cluster_records: List[Dict[str, Any]], ip: bool,
1574
+ endpoints: bool, endpoint: Optional[int]) -> None:
1575
+ show_endpoints = endpoints or endpoint is not None
1576
+ show_single_endpoint = endpoint is not None
1577
+ if len(cluster_records) != 1:
1578
+ with ux_utils.print_exception_no_traceback():
1579
+ plural = 's' if len(cluster_records) > 1 else ''
1580
+ if cluster_records:
1581
+ cluster_num = str(len(cluster_records))
1582
+ else:
1583
+ cluster_num = (f'{query_clusters[0]!r}'
1584
+ if query_clusters else 'No')
1585
+ verb = 'found' if cluster_records else 'not found'
1586
+ cause = 'a single'
1587
+ if query_clusters and len(query_clusters) > 1:
1588
+ cause = 'an existing'
1589
+ raise ValueError(
1590
+ _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format(
1591
+ cluster_num=cluster_num,
1592
+ plural=plural,
1593
+ verb=verb,
1594
+ cause=cause,
1595
+ property='IP address' if ip else 'endpoint(s)',
1596
+ flag='ip' if ip else
1597
+ ('endpoint port' if show_single_endpoint else 'endpoints')))
1598
+
1599
+ cluster_record = cluster_records[0]
1600
+ if cluster_record['status'] != status_lib.ClusterStatus.UP:
1601
+ with ux_utils.print_exception_no_traceback():
1602
+ raise RuntimeError(f'Cluster {cluster_record["name"]!r} '
1603
+ 'is not in UP status.')
1604
+ handle = cluster_record['handle']
1605
+ if not isinstance(handle, backends.CloudVmRayResourceHandle):
1606
+ with ux_utils.print_exception_no_traceback():
1607
+ raise ValueError('Querying IP address is not supported '
1608
+ 'for local clusters.')
1609
+
1610
+ head_ip = handle.external_ips()[0]
1611
+ # The endpoint request is relatively fast, so we don't add special handling
1612
+ # for keyboard interrupt and abort the request to avoid additional latency.
1613
+ if show_endpoints:
1614
+ if endpoint:
1615
+ request_id = sdk.endpoints(cluster_record['name'], endpoint)
1616
+ cluster_endpoints = sdk.stream_and_get(request_id)
1617
+ cluster_endpoint = cluster_endpoints.get(str(endpoint), None)
1618
+ if not cluster_endpoint:
1619
+ raise click.Abort(f'Endpoint {endpoint} not found for cluster '
1620
+ f'{cluster_record["name"]!r}.')
1621
+ click.echo(cluster_endpoint)
1622
+ else:
1623
+ request_id = sdk.endpoints(cluster_record['name'])
1624
+ cluster_endpoints = sdk.stream_and_get(request_id)
1625
+ assert isinstance(cluster_endpoints, dict)
1626
+ if not cluster_endpoints:
1627
+ raise click.Abort(f'No endpoint found for cluster '
1628
+ f'{cluster_record["name"]!r}.')
1629
+ for port, port_endpoint in cluster_endpoints.items():
1630
+ click.echo(f'{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}'
1631
+ f'{colorama.Style.RESET_ALL}: '
1632
+ f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1633
+ f'{port_endpoint}{colorama.Style.RESET_ALL}')
1634
+ return
1635
+ click.echo(head_ip)
1636
+ return
1637
+
1638
+
1527
1639
  @cli.command()
1528
- @click.option('--all',
1529
- '-a',
1640
+ @click.option('--verbose',
1641
+ '-v',
1530
1642
  default=False,
1531
1643
  is_flag=True,
1532
1644
  required=False,
@@ -1582,11 +1694,19 @@ def _status_kubernetes(show_all: bool):
1582
1694
  type=str,
1583
1695
  nargs=-1,
1584
1696
  **_get_shell_complete_args(_complete_cluster_name))
1697
+ @click.option('--all-users',
1698
+ '-u',
1699
+ default=False,
1700
+ is_flag=True,
1701
+ required=False,
1702
+ help='Show all clusters, including those not owned by the '
1703
+ 'current user.')
1585
1704
  @usage_lib.entrypoint
1586
1705
  # pylint: disable=redefined-builtin
1587
- def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
1706
+ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1588
1707
  endpoint: Optional[int], show_managed_jobs: bool,
1589
- show_services: bool, kubernetes: bool, clusters: List[str]):
1708
+ show_services: bool, kubernetes: bool, clusters: List[str],
1709
+ all_users: bool):
1590
1710
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
1591
1711
  """Show clusters.
1592
1712
 
@@ -1601,11 +1721,15 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
1601
1721
  ``sky status --endpoints mycluster``. To query a single endpoint, you
1602
1722
  can use ``sky status mycluster --endpoint 8888``.
1603
1723
 
1724
+ Running `sky status` will update the ssh config for the clusters locally, so
1725
+ that you can directly ssh into the clusters or connect to the clusters with
1726
+ vscode.
1727
+
1604
1728
  The following fields for each cluster are recorded: cluster name, time
1605
1729
  since last launch, resources, region, zone, hourly price, status, autostop,
1606
1730
  command.
1607
1731
 
1608
- Display all fields using ``sky status -a``.
1732
+ Display all fields using ``sky status -v``.
1609
1733
 
1610
1734
  Each cluster can have one of the following statuses:
1611
1735
 
@@ -1646,245 +1770,160 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
1646
1770
  cluster statuses from the cloud providers.
1647
1771
  """
1648
1772
  if kubernetes:
1649
- _status_kubernetes(all)
1773
+ _status_kubernetes(verbose)
1650
1774
  return
1651
- # Using a pool with 2 worker to run the managed job query and sky serve
1652
- # service query in parallel to speed up. The pool provides a AsyncResult
1653
- # object that can be used as a future.
1654
- with multiprocessing.Pool(2) as pool:
1655
- # Do not show job queue if user specifies clusters, and if user
1656
- # specifies --ip or --endpoint(s).
1657
- show_managed_jobs = show_managed_jobs and not any(
1658
- [clusters, ip, endpoints])
1659
- show_endpoints = endpoints or endpoint is not None
1660
- show_single_endpoint = endpoint is not None
1661
- if show_managed_jobs:
1662
- # Run managed job query in parallel to speed up the status query.
1663
- managed_jobs_future = pool.apply_async(
1664
- _get_managed_jobs,
1665
- kwds=dict(refresh=False,
1666
- skip_finished=True,
1667
- show_all=False,
1668
- limit_num_jobs_to_show=not all,
1669
- is_called_by_user=False))
1670
-
1671
- show_services = show_services and not clusters and not ip
1672
- if show_services:
1673
- # Run the sky serve service query in parallel to speed up the
1674
- # status query.
1675
- services_future = pool.apply_async(_get_services,
1676
- kwds=dict(
1677
- service_names=None,
1678
- show_all=False,
1679
- show_endpoint=False,
1680
- is_called_by_user=False))
1681
- if ip or show_endpoints:
1682
- if refresh:
1683
- raise click.UsageError(
1684
- 'Using --ip or --endpoint(s) with --refresh is not'
1685
- 'supported for now. To fix, refresh first, '
1686
- 'then query the IP or endpoint.')
1775
+ # Do not show job queue if user specifies clusters, and if user
1776
+ # specifies --ip or --endpoint(s).
1777
+ show_managed_jobs = show_managed_jobs and not any([clusters, ip, endpoints])
1778
+ if show_managed_jobs:
1779
+ managed_jobs_queue_request_id = managed_jobs.queue(refresh=False,
1780
+ skip_finished=True)
1781
+ show_endpoints = endpoints or endpoint is not None
1782
+ show_single_endpoint = endpoint is not None
1783
+ show_services = show_services and not any([clusters, ip, endpoints])
1784
+ if show_services:
1785
+ # Run the sky serve service query in parallel to speed up the
1786
+ # status query.
1787
+ service_status_request_id = serve_lib.status(service_names=None)
1788
+
1789
+ if ip or show_endpoints:
1790
+ if refresh:
1791
+ raise click.UsageError(
1792
+ 'Using --ip or --endpoint(s) with --refresh is not'
1793
+ 'supported for now. To fix, refresh first, '
1794
+ 'then query the IP or endpoint.')
1687
1795
 
1688
- if ip and show_endpoints:
1689
- with ux_utils.print_exception_no_traceback():
1690
- raise ValueError(
1691
- 'Cannot specify both --ip and --endpoint(s) '
1692
- 'at the same time.')
1796
+ if ip and show_endpoints:
1797
+ with ux_utils.print_exception_no_traceback():
1798
+ raise ValueError('Cannot specify both --ip and --endpoint(s) '
1799
+ 'at the same time.')
1693
1800
 
1694
- if endpoint is not None and endpoints:
1695
- with ux_utils.print_exception_no_traceback():
1696
- raise ValueError(
1697
- 'Cannot specify both --endpoint and --endpoints '
1698
- 'at the same time.')
1801
+ if endpoint is not None and endpoints:
1802
+ with ux_utils.print_exception_no_traceback():
1803
+ raise ValueError(
1804
+ 'Cannot specify both --endpoint and --endpoints '
1805
+ 'at the same time.')
1699
1806
 
1700
- if len(clusters) != 1:
1701
- with ux_utils.print_exception_no_traceback():
1702
- plural = 's' if len(clusters) > 1 else ''
1703
- cluster_num = (str(len(clusters)) if clusters else 'No')
1704
- cause = 'a single' if len(clusters) > 1 else 'an existing'
1705
- raise ValueError(
1706
- _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format(
1707
- cluster_num=cluster_num,
1708
- plural=plural,
1709
- verb='specified',
1710
- cause=cause,
1711
- property='IP address' if ip else 'endpoint(s)',
1712
- flag='ip' if ip else
1713
- ('endpoint port'
1714
- if show_single_endpoint else 'endpoints')))
1807
+ if len(clusters) != 1:
1808
+ with ux_utils.print_exception_no_traceback():
1809
+ plural = 's' if len(clusters) > 1 else ''
1810
+ cluster_num = (str(len(clusters)) if clusters else 'No')
1811
+ cause = 'a single' if len(clusters) > 1 else 'an existing'
1812
+ raise ValueError(
1813
+ _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format(
1814
+ cluster_num=cluster_num,
1815
+ plural=plural,
1816
+ verb='specified',
1817
+ cause=cause,
1818
+ property='IP address' if ip else 'endpoint(s)',
1819
+ flag='ip' if ip else
1820
+ ('endpoint port'
1821
+ if show_single_endpoint else 'endpoints')))
1822
+ else:
1823
+ click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters'
1824
+ f'{colorama.Style.RESET_ALL}')
1825
+ query_clusters: Optional[List[str]] = None if not clusters else clusters
1826
+ refresh_mode = common.StatusRefreshMode.NONE
1827
+ if refresh:
1828
+ refresh_mode = common.StatusRefreshMode.FORCE
1829
+ cluster_records = _get_cluster_records_and_set_ssh_config(
1830
+ query_clusters, refresh_mode, all_users)
1831
+
1832
+ # TOOD(zhwu): setup the ssh config for status
1833
+ if ip or show_endpoints:
1834
+ _show_endpoint(query_clusters, cluster_records, ip, endpoints, endpoint)
1835
+ return
1836
+ hints = []
1837
+ normal_clusters = []
1838
+ controllers = []
1839
+ for cluster_record in cluster_records:
1840
+ cluster_name = cluster_record['name']
1841
+ controller = controller_utils.Controllers.from_name(cluster_name)
1842
+ if controller is not None:
1843
+ controllers.append(cluster_record)
1715
1844
  else:
1716
- click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters'
1717
- f'{colorama.Style.RESET_ALL}')
1718
- query_clusters: Optional[List[str]] = None
1719
- if clusters:
1720
- query_clusters = _get_glob_clusters(clusters, silent=ip)
1721
- cluster_records = core.status(cluster_names=query_clusters,
1722
- refresh=refresh)
1723
- if ip or show_endpoints:
1724
- if len(cluster_records) != 1:
1725
- with ux_utils.print_exception_no_traceback():
1726
- plural = 's' if len(cluster_records) > 1 else ''
1727
- cluster_num = (str(len(cluster_records))
1728
- if cluster_records else f'{clusters[0]!r}')
1729
- verb = 'found' if cluster_records else 'not found'
1730
- cause = 'a single' if len(clusters) > 1 else 'an existing'
1731
- raise ValueError(
1732
- _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format(
1733
- cluster_num=cluster_num,
1734
- plural=plural,
1735
- verb=verb,
1736
- cause=cause,
1737
- property='IP address' if ip else 'endpoint(s)',
1738
- flag='ip' if ip else
1739
- ('endpoint port'
1740
- if show_single_endpoint else 'endpoints')))
1741
-
1742
- cluster_record = cluster_records[0]
1743
- if cluster_record['status'] != status_lib.ClusterStatus.UP:
1744
- with ux_utils.print_exception_no_traceback():
1745
- raise RuntimeError(f'Cluster {cluster_record["name"]!r} '
1746
- 'is not in UP status.')
1747
- handle = cluster_record['handle']
1748
- if not isinstance(handle, backends.CloudVmRayResourceHandle):
1749
- with ux_utils.print_exception_no_traceback():
1750
- raise ValueError('Querying IP address is not supported '
1751
- 'for local clusters.')
1752
-
1753
- head_ip = handle.external_ips()[0]
1754
- if show_endpoints:
1755
- if endpoint:
1756
- cluster_endpoint = core.endpoints(cluster_record['name'],
1757
- endpoint).get(
1758
- endpoint, None)
1759
- if not cluster_endpoint:
1760
- raise click.Abort(
1761
- f'Endpoint {endpoint} not found for cluster '
1762
- f'{cluster_record["name"]!r}.')
1763
- click.echo(cluster_endpoint)
1764
- else:
1765
- cluster_endpoints = core.endpoints(cluster_record['name'])
1766
- assert isinstance(cluster_endpoints, dict)
1767
- if not cluster_endpoints:
1768
- raise click.Abort(f'No endpoint found for cluster '
1769
- f'{cluster_record["name"]!r}.')
1770
- for port, port_endpoint in cluster_endpoints.items():
1771
- click.echo(
1772
- f'{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}'
1773
- f'{colorama.Style.RESET_ALL}: '
1774
- f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1775
- f'{port_endpoint}{colorama.Style.RESET_ALL}')
1776
- return
1777
- click.echo(head_ip)
1778
- return
1779
- hints = []
1780
- normal_clusters = []
1781
- controllers = []
1782
- for cluster_record in cluster_records:
1783
- cluster_name = cluster_record['name']
1784
- controller = controller_utils.Controllers.from_name(cluster_name)
1785
- if controller is not None:
1786
- controllers.append(cluster_record)
1787
- else:
1788
- normal_clusters.append(cluster_record)
1845
+ normal_clusters.append(cluster_record)
1789
1846
 
1790
- num_pending_autostop = 0
1791
- num_pending_autostop += status_utils.show_status_table(
1792
- normal_clusters + controllers, all)
1847
+ num_pending_autostop = 0
1848
+ num_pending_autostop += status_utils.show_status_table(
1849
+ normal_clusters + controllers, verbose, all_users, query_clusters)
1793
1850
 
1794
- def _try_get_future_result(future) -> Tuple[bool, Any]:
1795
- result = None
1796
- interrupted = False
1851
+ managed_jobs_query_interrupted = False
1852
+ if show_managed_jobs:
1853
+ click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1854
+ f'Managed jobs{colorama.Style.RESET_ALL}')
1855
+ with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
1797
1856
  try:
1798
- result = future.get()
1857
+ num_in_progress_jobs, msg = _handle_jobs_queue_request(
1858
+ managed_jobs_queue_request_id,
1859
+ show_all=False,
1860
+ limit_num_jobs_to_show=not all,
1861
+ is_called_by_user=False)
1799
1862
  except KeyboardInterrupt:
1800
- pool.terminate()
1801
- interrupted = True
1802
- return interrupted, result
1803
-
1804
- managed_jobs_query_interrupted = False
1805
- if show_managed_jobs:
1806
- click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1807
- f'Managed jobs{colorama.Style.RESET_ALL}')
1808
- with rich_utils.safe_status(
1809
- ux_utils.spinner_message('Checking managed jobs')):
1810
- managed_jobs_query_interrupted, result = _try_get_future_result(
1811
- managed_jobs_future)
1812
- if managed_jobs_query_interrupted:
1813
- # Set to -1, so that the controller is not considered
1814
- # down, and the hint for showing sky jobs queue
1815
- # will still be shown.
1816
- num_in_progress_jobs = -1
1817
- msg = 'KeyboardInterrupt'
1818
- else:
1819
- num_in_progress_jobs, msg = result
1820
-
1821
- click.echo(msg)
1822
- if num_in_progress_jobs is not None:
1823
- # jobs controller is UP.
1824
- job_info = ''
1825
- if num_in_progress_jobs > 0:
1826
- plural_and_verb = ' is'
1827
- if num_in_progress_jobs > 1:
1828
- plural_and_verb = 's are'
1829
- job_info = (
1830
- f'{num_in_progress_jobs} managed job{plural_and_verb} '
1831
- 'in progress')
1832
- if (num_in_progress_jobs >
1833
- _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS):
1834
- job_info += (
1835
- f' ({_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS} latest '
1836
- 'ones shown)')
1837
- job_info += '. '
1838
- hints.append(
1839
- controller_utils.Controllers.JOBS_CONTROLLER.value.
1840
- in_progress_hint.format(job_info=job_info))
1841
-
1842
- if show_services:
1843
- click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1844
- f'Services{colorama.Style.RESET_ALL}')
1845
- num_services = None
1846
- if managed_jobs_query_interrupted:
1847
- # The pool is terminated, so we cannot run the service query.
1863
+ sdk.api_cancel(managed_jobs_queue_request_id, silent=True)
1864
+ managed_jobs_query_interrupted = True
1865
+ # Set to -1, so that the controller is not considered
1866
+ # down, and the hint for showing sky jobs queue
1867
+ # will still be shown.
1868
+ num_in_progress_jobs = -1
1848
1869
  msg = 'KeyboardInterrupt'
1849
- else:
1850
- with rich_utils.safe_status(
1851
- ux_utils.spinner_message('Checking services')):
1852
- interrupted, result = _try_get_future_result(
1853
- services_future)
1854
- if interrupted:
1855
- num_services = -1
1856
- msg = 'KeyboardInterrupt'
1857
- else:
1858
- num_services, msg = result
1859
- click.echo(msg)
1860
- if num_services is not None:
1861
- hints.append(controller_utils.Controllers.SKY_SERVE_CONTROLLER.
1862
- value.in_progress_hint)
1863
1870
 
1864
- if show_managed_jobs or show_services:
1865
- try:
1866
- pool.close()
1867
- pool.join()
1868
- except SystemExit as e:
1869
- # This is to avoid a "Exception ignored" problem caused by
1870
- # ray worker setting the sigterm handler to sys.exit(15)
1871
- # (see ray/_private/worker.py).
1872
- # TODO (zhwu): Remove any importing of ray in SkyPilot.
1873
- if e.code != 15:
1874
- raise
1875
-
1876
- if num_pending_autostop > 0 and not refresh:
1877
- # Don't print this hint if there's no pending autostop or user has
1878
- # already passed --refresh.
1879
- plural_and_verb = ' has'
1880
- if num_pending_autostop > 1:
1881
- plural_and_verb = 's have'
1882
- hints.append(f'* {num_pending_autostop} cluster{plural_and_verb} '
1883
- 'auto{stop,down} scheduled. Refresh statuses with: '
1884
- f'{colorama.Style.BRIGHT}sky status --refresh'
1885
- f'{colorama.Style.RESET_ALL}')
1886
- if hints:
1887
- click.echo('\n' + '\n'.join(hints))
1871
+ click.echo(msg)
1872
+ if num_in_progress_jobs is not None:
1873
+ # jobs controller is UP.
1874
+ job_info = ''
1875
+ if num_in_progress_jobs > 0:
1876
+ plural_and_verb = ' is'
1877
+ if num_in_progress_jobs > 1:
1878
+ plural_and_verb = 's are'
1879
+ job_info = (
1880
+ f'{num_in_progress_jobs} managed job{plural_and_verb} '
1881
+ 'in progress')
1882
+ if num_in_progress_jobs > _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS:
1883
+ job_info += (
1884
+ f' ({_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS} latest '
1885
+ 'ones shown)')
1886
+ job_info += '. '
1887
+ hints.append(
1888
+ controller_utils.Controllers.JOBS_CONTROLLER.value.
1889
+ in_progress_hint.format(job_info=job_info))
1890
+
1891
+ if show_services:
1892
+ click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
1893
+ f'Services{colorama.Style.RESET_ALL}')
1894
+ num_services = None
1895
+ if managed_jobs_query_interrupted:
1896
+ msg = 'KeyboardInterrupt'
1897
+ else:
1898
+ with rich_utils.client_status('[cyan]Checking services[/]'):
1899
+ try:
1900
+ num_services, msg = _handle_services_request(
1901
+ service_status_request_id,
1902
+ service_names=None,
1903
+ show_all=False,
1904
+ show_endpoint=False,
1905
+ is_called_by_user=False)
1906
+ except KeyboardInterrupt:
1907
+ sdk.api_cancel(service_status_request_id, silent=True)
1908
+ num_services = -1
1909
+ msg = 'KeyboardInterrupt'
1910
+ click.echo(msg)
1911
+ if num_services is not None:
1912
+ hints.append(controller_utils.Controllers.SKY_SERVE_CONTROLLER.
1913
+ value.in_progress_hint)
1914
+
1915
+ if num_pending_autostop > 0 and not refresh:
1916
+ # Don't print this hint if there's no pending autostop or user has
1917
+ # already passed --refresh.
1918
+ plural_and_verb = ' has'
1919
+ if num_pending_autostop > 1:
1920
+ plural_and_verb = 's have'
1921
+ hints.append(f'* {num_pending_autostop} cluster{plural_and_verb} '
1922
+ 'auto{stop,down} scheduled. Refresh statuses with: '
1923
+ f'{colorama.Style.BRIGHT}sky status --refresh'
1924
+ f'{colorama.Style.RESET_ALL}')
1925
+ if hints:
1926
+ click.echo('\n' + '\n'.join(hints))
1888
1927
 
1889
1928
 
1890
1929
  @cli.command()
@@ -1893,7 +1932,7 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
1893
1932
  default=False,
1894
1933
  is_flag=True,
1895
1934
  required=False,
1896
- help='Show all information in full.')
1935
+ help='Show all cluster information.')
1897
1936
  @usage_lib.entrypoint
1898
1937
  def cost_report(all: bool): # pylint: disable=redefined-builtin
1899
1938
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
@@ -1914,7 +1953,7 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin
1914
1953
 
1915
1954
  - Clusters that were terminated/stopped on the cloud console.
1916
1955
  """
1917
- cluster_records = core.cost_report()
1956
+ cluster_records = sdk.get(sdk.cost_report())
1918
1957
 
1919
1958
  normal_cluster_records = []
1920
1959
  controllers = dict()
@@ -1959,7 +1998,7 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin
1959
1998
 
1960
1999
  @cli.command()
1961
2000
  @click.option('--all-users',
1962
- '-a',
2001
+ '-u',
1963
2002
  default=False,
1964
2003
  is_flag=True,
1965
2004
  required=False,
@@ -1980,17 +2019,20 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
1980
2019
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
1981
2020
  """Show the job queue for cluster(s)."""
1982
2021
  click.secho('Fetching and parsing job queue...', fg='cyan')
1983
- if clusters:
1984
- clusters = _get_glob_clusters(clusters)
1985
- else:
1986
- cluster_infos = global_user_state.get_clusters()
1987
- clusters = [c['name'] for c in cluster_infos]
2022
+ if not clusters:
2023
+ cluster_records = _get_cluster_records_and_set_ssh_config(
2024
+ None, all_users=all_users)
2025
+ clusters = [cluster['name'] for cluster in cluster_records]
1988
2026
 
1989
2027
  unsupported_clusters = []
1990
- for cluster in clusters:
2028
+ logger.info(f'Fetching job queue for {clusters}')
2029
+ job_tables = {}
2030
+
2031
+ def _get_job_queue(cluster):
1991
2032
  try:
1992
- job_table = core.queue(cluster, skip_finished, all_users)
1993
- except (exceptions.CommandError, ValueError,
2033
+ job_table = sdk.stream_and_get(
2034
+ sdk.queue(cluster, skip_finished, all_users))
2035
+ except (RuntimeError, exceptions.CommandError, ValueError,
1994
2036
  exceptions.NotSupportedError, exceptions.ClusterNotUpError,
1995
2037
  exceptions.CloudUserIdentityError,
1996
2038
  exceptions.ClusterOwnerIdentityMismatchError) as e:
@@ -1999,9 +2041,14 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
1999
2041
  click.echo(f'{colorama.Fore.YELLOW}Failed to get the job queue for '
2000
2042
  f'cluster {cluster!r}.{colorama.Style.RESET_ALL}\n'
2001
2043
  f' {common_utils.format_exception(e)}')
2002
- continue
2003
- job_table = job_lib.format_job_queue(job_table)
2004
- click.echo(f'\nJob queue of cluster {cluster}\n{job_table}')
2044
+ return
2045
+ job_tables[cluster] = job_lib.format_job_queue(job_table)
2046
+
2047
+ subprocess_utils.run_in_parallel(_get_job_queue, clusters)
2048
+ user_str = 'all users' if all_users else 'current user'
2049
+ for cluster, job_table in job_tables.items():
2050
+ click.echo(f'\nJob queue of {user_str} on cluster {cluster}\n'
2051
+ f'{job_table}')
2005
2052
 
2006
2053
  if unsupported_clusters:
2007
2054
  click.secho(
@@ -2081,25 +2128,34 @@ def logs(
2081
2128
  job_ids = None if not job_ids else job_ids
2082
2129
 
2083
2130
  if sync_down:
2084
- core.download_logs(cluster, job_ids)
2131
+ with rich_utils.client_status(
2132
+ ux_utils.spinner_message('Downloading logs')):
2133
+ log_local_path_dict = sdk.download_logs(cluster, job_ids)
2134
+ style = colorama.Style
2135
+ fore = colorama.Fore
2136
+ for job, log_local_path in log_local_path_dict.items():
2137
+ logger.info(f'{fore.CYAN}Job {job} logs: {log_local_path}'
2138
+ f'{style.RESET_ALL}')
2085
2139
  return
2086
2140
 
2087
2141
  assert job_ids is None or len(job_ids) <= 1, job_ids
2088
- job_id = None
2142
+ job_id: Optional[int] = None
2089
2143
  job_ids_to_query: Optional[List[int]] = None
2090
2144
  if job_ids:
2091
2145
  # Already check that len(job_ids) <= 1. This variable is used later
2092
- # in core.tail_logs.
2093
- job_id = job_ids[0]
2094
- if not job_id.isdigit():
2095
- raise click.UsageError(f'Invalid job ID {job_id}. '
2146
+ # in sdk.tail_logs.
2147
+ cur_job_id = job_ids[0]
2148
+ if not cur_job_id.isdigit():
2149
+ raise click.UsageError(f'Invalid job ID {cur_job_id}. '
2096
2150
  'Job ID must be integers.')
2097
- job_ids_to_query = [int(job_id)]
2151
+ job_id = int(cur_job_id)
2152
+ job_ids_to_query = [int(job_ids[0])]
2098
2153
  else:
2099
2154
  # job_ids is either None or empty list, so it is safe to cast it here.
2100
2155
  job_ids_to_query = typing.cast(Optional[List[int]], job_ids)
2101
2156
  if status:
2102
- job_statuses = core.job_status(cluster, job_ids_to_query)
2157
+ job_statuses = sdk.stream_and_get(
2158
+ sdk.job_status(cluster, job_ids_to_query))
2103
2159
  job_id = list(job_statuses.keys())[0]
2104
2160
  # If job_ids is None and no job has been submitted to the cluster,
2105
2161
  # it will return {None: None}.
@@ -2117,7 +2173,15 @@ def logs(
2117
2173
  click.secho(f'Job {id_str}not found', fg='red')
2118
2174
  sys.exit(1)
2119
2175
 
2120
- core.tail_logs(cluster, job_id, follow, tail)
2176
+ job_str = f'job {job_id}'
2177
+ if job_id is None:
2178
+ job_str = 'the last job'
2179
+ logger.info(f'{colorama.Fore.YELLOW}'
2180
+ f'Tailing logs of {job_str} on cluster {cluster!r}...'
2181
+ f'{colorama.Style.RESET_ALL}')
2182
+
2183
+ # Stream logs from the server.
2184
+ sdk.tail_logs(cluster, job_id, follow, tail=tail)
2121
2185
 
2122
2186
 
2123
2187
  @cli.command()
@@ -2130,16 +2194,31 @@ def logs(
2130
2194
  default=False,
2131
2195
  is_flag=True,
2132
2196
  required=False,
2133
- help='Cancel all jobs on the specified cluster.')
2197
+ help='Cancel all jobs from current user on the specified cluster.'
2198
+ )
2199
+ @click.option('--all-users',
2200
+ '-u',
2201
+ default=False,
2202
+ is_flag=True,
2203
+ required=False,
2204
+ help='Cancel all jobs on the specified cluster for all users.')
2134
2205
  @click.option('--yes',
2135
2206
  '-y',
2136
2207
  is_flag=True,
2137
2208
  default=False,
2138
2209
  required=False,
2139
2210
  help='Skip confirmation prompt.')
2211
+ @_add_click_options(_COMMON_OPTIONS)
2140
2212
  @click.argument('jobs', required=False, type=int, nargs=-1)
2141
2213
  @usage_lib.entrypoint
2142
- def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disable=redefined-builtin, redefined-outer-name
2214
+ def cancel(
2215
+ cluster: str,
2216
+ all: bool, # pylint: disable=redefined-builtin
2217
+ all_users: bool,
2218
+ jobs: List[int], # pylint: disable=redefined-outer-name
2219
+ yes: bool,
2220
+ async_call: bool,
2221
+ ): # pylint: disable=redefined-builtin
2143
2222
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
2144
2223
  """Cancel job(s).
2145
2224
 
@@ -2152,30 +2231,36 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa
2152
2231
  sky cancel cluster_name 1
2153
2232
  sky cancel cluster_name 1 2 3
2154
2233
  \b
2155
- # Cancel all jobs on a cluster.
2234
+ # Cancel all your jobs on a cluster.
2156
2235
  sky cancel cluster_name -a
2157
2236
  \b
2237
+ # Cancel all users' jobs on a cluster.
2238
+ sky cancel cluster_name -u
2239
+ \b
2158
2240
  # Cancel the latest running job on a cluster.
2159
2241
  sky cancel cluster_name
2160
2242
 
2161
2243
  Job IDs can be looked up by ``sky queue cluster_name``.
2162
2244
  """
2163
- job_identity_str = None
2245
+ job_identity_str = ''
2164
2246
  job_ids_to_cancel = None
2165
- if not jobs and not all:
2166
- click.echo(f'{colorama.Fore.YELLOW}No job IDs or --all provided; '
2167
- 'cancelling the latest running job.'
2168
- f'{colorama.Style.RESET_ALL}')
2247
+ if not jobs and not all and not all_users:
2248
+ click.echo(
2249
+ f'{colorama.Fore.YELLOW}No job IDs or --all/--all-users provided; '
2250
+ 'cancelling the latest running job.'
2251
+ f'{colorama.Style.RESET_ALL}')
2169
2252
  job_identity_str = 'the latest running job'
2253
+ elif all_users:
2254
+ job_identity_str = 'all users\' jobs'
2170
2255
  else:
2171
- # Cancelling specific jobs or --all.
2172
- job_ids = ' '.join(map(str, jobs))
2173
- plural = 's' if len(job_ids) > 1 else ''
2174
- job_identity_str = f'job{plural} {job_ids}'
2175
- job_ids_to_cancel = jobs
2176
2256
  if all:
2177
- job_identity_str = 'all jobs'
2178
- job_ids_to_cancel = None
2257
+ job_identity_str = 'all your jobs'
2258
+ if jobs:
2259
+ jobs_str = ' '.join(map(str, jobs))
2260
+ plural = 's' if len(jobs) > 1 else ''
2261
+ connector = ' and ' if job_identity_str else ''
2262
+ job_identity_str += f'{connector}job{plural} {jobs_str}'
2263
+ job_ids_to_cancel = jobs
2179
2264
  job_identity_str += f' on cluster {cluster!r}'
2180
2265
 
2181
2266
  if not yes:
@@ -2185,7 +2270,11 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa
2185
2270
  show_default=True)
2186
2271
 
2187
2272
  try:
2188
- core.cancel(cluster, all=all, job_ids=job_ids_to_cancel)
2273
+ request_id = sdk.cancel(cluster,
2274
+ all=all,
2275
+ all_users=all_users,
2276
+ job_ids=job_ids_to_cancel)
2277
+ _async_call_or_wait(request_id, async_call, 'sky.cancel')
2189
2278
  except exceptions.NotSupportedError as e:
2190
2279
  controller = controller_utils.Controllers.from_name(cluster)
2191
2280
  assert controller is not None, cluster
@@ -2205,20 +2294,28 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa
2205
2294
  **_get_shell_complete_args(_complete_cluster_name))
2206
2295
  @click.option('--all',
2207
2296
  '-a',
2208
- default=None,
2297
+ default=False,
2209
2298
  is_flag=True,
2210
2299
  help='Stop all existing clusters.')
2300
+ @click.option('--all-users',
2301
+ '-u',
2302
+ default=False,
2303
+ is_flag=True,
2304
+ help='Stop all existing clusters for all users.')
2211
2305
  @click.option('--yes',
2212
2306
  '-y',
2213
2307
  is_flag=True,
2214
2308
  default=False,
2215
2309
  required=False,
2216
2310
  help='Skip confirmation prompt.')
2311
+ @_add_click_options(_COMMON_OPTIONS)
2217
2312
  @usage_lib.entrypoint
2218
2313
  def stop(
2219
2314
  clusters: List[str],
2220
- all: Optional[bool], # pylint: disable=redefined-builtin
2315
+ all: bool, # pylint: disable=redefined-builtin
2316
+ all_users: bool,
2221
2317
  yes: bool,
2318
+ async_call: bool,
2222
2319
  ):
2223
2320
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
2224
2321
  """Stop cluster(s).
@@ -2251,8 +2348,10 @@ def stop(
2251
2348
  """
2252
2349
  _down_or_stop_clusters(clusters,
2253
2350
  apply_to_all=all,
2351
+ all_users=all_users,
2254
2352
  down=False,
2255
- no_confirm=yes)
2353
+ no_confirm=yes,
2354
+ async_call=async_call)
2256
2355
 
2257
2356
 
2258
2357
  @cli.command(cls=_DocumentedCodeCommand)
@@ -2262,9 +2361,14 @@ def stop(
2262
2361
  **_get_shell_complete_args(_complete_cluster_name))
2263
2362
  @click.option('--all',
2264
2363
  '-a',
2265
- default=None,
2364
+ default=False,
2266
2365
  is_flag=True,
2267
- help='Apply this command to all existing clusters.')
2366
+ help='Autostop all existing clusters.')
2367
+ @click.option('--all-users',
2368
+ '-u',
2369
+ default=False,
2370
+ is_flag=True,
2371
+ help='Autostop all existing clusters for all users.')
2268
2372
  @click.option('--idle-minutes',
2269
2373
  '-i',
2270
2374
  type=int,
@@ -2292,14 +2396,17 @@ def stop(
2292
2396
  default=False,
2293
2397
  required=False,
2294
2398
  help='Skip confirmation prompt.')
2399
+ @_add_click_options(_COMMON_OPTIONS)
2295
2400
  @usage_lib.entrypoint
2296
2401
  def autostop(
2297
2402
  clusters: List[str],
2298
- all: Optional[bool], # pylint: disable=redefined-builtin
2403
+ all: bool, # pylint: disable=redefined-builtin
2404
+ all_users: bool,
2299
2405
  idle_minutes: Optional[int],
2300
2406
  cancel: bool, # pylint: disable=redefined-outer-name
2301
2407
  down: bool, # pylint: disable=redefined-outer-name
2302
2408
  yes: bool,
2409
+ async_call: bool,
2303
2410
  ):
2304
2411
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
2305
2412
  """Schedule an autostop or autodown for cluster(s).
@@ -2352,9 +2459,11 @@ def autostop(
2352
2459
  idle_minutes = 5
2353
2460
  _down_or_stop_clusters(clusters,
2354
2461
  apply_to_all=all,
2462
+ all_users=all_users,
2355
2463
  down=down,
2356
2464
  no_confirm=yes,
2357
- idle_minutes_to_autostop=idle_minutes)
2465
+ idle_minutes_to_autostop=idle_minutes,
2466
+ async_call=async_call)
2358
2467
 
2359
2468
 
2360
2469
  @cli.command(cls=_DocumentedCodeCommand)
@@ -2417,16 +2526,19 @@ def autostop(
2417
2526
  required=False,
2418
2527
  help=('Force start the cluster even if it is already UP. Useful for '
2419
2528
  'upgrading the SkyPilot runtime on the cluster.'))
2529
+ @_add_click_options(_COMMON_OPTIONS)
2420
2530
  @usage_lib.entrypoint
2421
2531
  # pylint: disable=redefined-builtin
2422
2532
  def start(
2423
- clusters: List[str],
2424
- all: bool,
2425
- yes: bool,
2426
- idle_minutes_to_autostop: Optional[int],
2427
- down: bool, # pylint: disable=redefined-outer-name
2428
- retry_until_up: bool,
2429
- force: bool):
2533
+ clusters: List[str],
2534
+ all: bool,
2535
+ yes: bool,
2536
+ idle_minutes_to_autostop: Optional[int],
2537
+ down: bool, # pylint: disable=redefined-outer-name
2538
+ retry_until_up: bool,
2539
+ force: bool,
2540
+ async_call: bool,
2541
+ ):
2430
2542
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
2431
2543
  """Restart cluster(s).
2432
2544
 
@@ -2460,12 +2572,14 @@ def start(
2460
2572
  '--idle-minutes-to-autostop must be set if --down is set.')
2461
2573
  to_start = []
2462
2574
 
2575
+ cluster_records = None
2463
2576
  if not clusters and not all:
2464
2577
  # UX: frequently users may have only 1 cluster. In this case, be smart
2465
2578
  # and default to that unique choice.
2466
- all_cluster_names = global_user_state.get_cluster_names_start_with('')
2467
- if len(all_cluster_names) <= 1:
2468
- clusters = all_cluster_names
2579
+ all_clusters = _get_cluster_records_and_set_ssh_config(
2580
+ clusters=None, refresh=common.StatusRefreshMode.AUTO)
2581
+ if len(all_clusters) <= 1:
2582
+ cluster_records = all_clusters
2469
2583
  else:
2470
2584
  raise click.UsageError(
2471
2585
  '`sky start` requires either a cluster name or glob '
@@ -2476,24 +2590,27 @@ def start(
2476
2590
  click.echo('Both --all and cluster(s) specified for sky start. '
2477
2591
  'Letting --all take effect.')
2478
2592
 
2593
+ all_clusters = _get_cluster_records_and_set_ssh_config(
2594
+ clusters=None, refresh=common.StatusRefreshMode.AUTO)
2595
+
2479
2596
  # Get all clusters that are not controllers.
2480
- clusters = [
2481
- cluster['name']
2482
- for cluster in global_user_state.get_clusters()
2597
+ cluster_records = [
2598
+ cluster for cluster in all_clusters
2483
2599
  if controller_utils.Controllers.from_name(cluster['name']) is None
2484
2600
  ]
2601
+ if cluster_records is None:
2602
+ # Get GLOB cluster names
2603
+ cluster_records = _get_cluster_records_and_set_ssh_config(
2604
+ clusters, refresh=common.StatusRefreshMode.AUTO)
2485
2605
 
2486
- if not clusters:
2606
+ if not cluster_records:
2487
2607
  click.echo('Cluster(s) not found (tip: see `sky status`). Do you '
2488
2608
  'mean to use `sky launch` to provision a new cluster?')
2489
2609
  return
2490
2610
  else:
2491
- # Get GLOB cluster names
2492
- clusters = _get_glob_clusters(clusters)
2493
-
2494
- for name in clusters:
2495
- cluster_status, _ = backend_utils.refresh_cluster_status_handle(
2496
- name)
2611
+ for cluster in cluster_records:
2612
+ name = cluster['name']
2613
+ cluster_status = cluster['status']
2497
2614
  # A cluster may have one of the following states:
2498
2615
  #
2499
2616
  # STOPPED - ok to restart
@@ -2573,18 +2690,25 @@ def start(
2573
2690
  abort=True,
2574
2691
  show_default=True)
2575
2692
 
2576
- for name in to_start:
2693
+ request_ids = subprocess_utils.run_in_parallel(
2694
+ lambda name: sdk.start(name,
2695
+ idle_minutes_to_autostop,
2696
+ retry_until_up,
2697
+ down=down,
2698
+ force=force), to_start)
2699
+
2700
+ for name, request_id in zip(to_start, request_ids):
2577
2701
  try:
2578
- core.start(name,
2579
- idle_minutes_to_autostop,
2580
- retry_until_up,
2581
- down=down,
2582
- force=force)
2702
+ _async_call_or_wait(request_id, async_call, 'sky.start')
2703
+ if not async_call:
2704
+ # Add ssh config for the cluster
2705
+ _get_cluster_records_and_set_ssh_config(clusters=[name])
2583
2706
  except (exceptions.NotSupportedError,
2584
2707
  exceptions.ClusterOwnerIdentityMismatchError) as e:
2585
2708
  click.echo(str(e))
2586
2709
  else:
2587
- click.secho(f'Cluster {name} started.', fg='green')
2710
+ if not async_call:
2711
+ click.secho(f'Cluster {name} started.', fg='green')
2588
2712
 
2589
2713
 
2590
2714
  @cli.command(cls=_DocumentedCodeCommand)
@@ -2594,9 +2718,14 @@ def start(
2594
2718
  **_get_shell_complete_args(_complete_cluster_name))
2595
2719
  @click.option('--all',
2596
2720
  '-a',
2597
- default=None,
2721
+ default=False,
2598
2722
  is_flag=True,
2599
2723
  help='Tear down all existing clusters.')
2724
+ @click.option('--all-users',
2725
+ '-u',
2726
+ default=False,
2727
+ is_flag=True,
2728
+ help='Tear down all existing clusters for all users.')
2600
2729
  @click.option('--yes',
2601
2730
  '-y',
2602
2731
  is_flag=True,
@@ -2615,12 +2744,15 @@ def start(
2615
2744
  ' in certain manual troubleshooting scenarios; with it set, it is the'
2616
2745
  ' user\'s responsibility to ensure there are no leaked instances and '
2617
2746
  'related resources.'))
2747
+ @_add_click_options(_COMMON_OPTIONS)
2618
2748
  @usage_lib.entrypoint
2619
2749
  def down(
2620
2750
  clusters: List[str],
2621
- all: Optional[bool], # pylint: disable=redefined-builtin
2751
+ all: bool, # pylint: disable=redefined-builtin
2752
+ all_users: bool, # pylint: disable=redefined-builtin
2622
2753
  yes: bool,
2623
2754
  purge: bool,
2755
+ async_call: bool,
2624
2756
  ):
2625
2757
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
2626
2758
  """Tear down cluster(s).
@@ -2652,12 +2784,15 @@ def down(
2652
2784
  """
2653
2785
  _down_or_stop_clusters(clusters,
2654
2786
  apply_to_all=all,
2787
+ all_users=all_users,
2655
2788
  down=True,
2656
2789
  no_confirm=yes,
2657
- purge=purge)
2790
+ purge=purge,
2791
+ async_call=async_call)
2658
2792
 
2659
2793
 
2660
- def _hint_or_raise_for_down_jobs_controller(controller_name: str):
2794
+ def _hint_or_raise_for_down_jobs_controller(controller_name: str,
2795
+ purge: bool) -> None:
2661
2796
  """Helper function to check job controller status before tearing it down.
2662
2797
 
2663
2798
  Raises helpful exceptions and errors if the controller is not in a safe
@@ -2669,14 +2804,19 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str):
2669
2804
  to be torn down (e.g., because it has jobs running or
2670
2805
  it is in init state)
2671
2806
  """
2807
+ if not common.is_current_user_controller(controller_name):
2808
+ with ux_utils.print_exception_no_traceback():
2809
+ raise exceptions.NotSupportedError(
2810
+ f'Tearing down other user\'s managed job controller '
2811
+ f'{controller_name!r} is not allowed.')
2672
2812
  controller = controller_utils.Controllers.from_name(controller_name)
2673
2813
  assert controller is not None, controller_name
2674
2814
 
2675
- with rich_utils.safe_status(
2676
- ux_utils.spinner_message('Checking for in-progress managed jobs')):
2815
+ with rich_utils.client_status(
2816
+ '[bold cyan]Checking for in-progress managed jobs[/]'):
2677
2817
  try:
2678
- managed_jobs_ = managed_jobs.queue(refresh=False,
2679
- skip_finished=True)
2818
+ request_id = managed_jobs.queue(refresh=False, skip_finished=True)
2819
+ managed_jobs_ = sdk.stream_and_get(request_id)
2680
2820
  except exceptions.ClusterNotUpError as e:
2681
2821
  if controller.value.connection_error_hint in str(e):
2682
2822
  with ux_utils.print_exception_no_traceback():
@@ -2704,14 +2844,19 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str):
2704
2844
  # Add prefix to each line to align with the bullet point.
2705
2845
  msg += '\n'.join(
2706
2846
  [' ' + line for line in job_table.split('\n') if line != ''])
2707
- with ux_utils.print_exception_no_traceback():
2708
- raise exceptions.NotSupportedError(msg)
2847
+ if purge:
2848
+ logger.warning('--purge is set, ignoring the in-progress managed '
2849
+ 'jobs. This could cause leaked clusters!')
2850
+ else:
2851
+ with ux_utils.print_exception_no_traceback():
2852
+ raise exceptions.NotSupportedError(msg)
2709
2853
  else:
2710
2854
  click.echo(' * No in-progress managed jobs found. It should be safe to '
2711
2855
  'terminate (see caveats above).')
2712
2856
 
2713
2857
 
2714
- def _hint_or_raise_for_down_sky_serve_controller(controller_name: str):
2858
+ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
2859
+ purge: bool) -> None:
2715
2860
  """Helper function to check serve controller status before tearing it down.
2716
2861
 
2717
2862
  Raises helpful exceptions and errors if the controller is not in a safe
@@ -2723,12 +2868,18 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str):
2723
2868
  to be torn down (e.g., because it has services running or
2724
2869
  it is in init state)
2725
2870
  """
2871
+ # TODO(zhwu): Move this check to the sdk or even API server side.
2872
+ if not common.is_current_user_controller(controller_name):
2873
+ with ux_utils.print_exception_no_traceback():
2874
+ raise exceptions.NotSupportedError(
2875
+ f'Tearing down other user\'s sky serve controller '
2876
+ f'{controller_name!r} is not allowed.')
2726
2877
  controller = controller_utils.Controllers.from_name(controller_name)
2727
2878
  assert controller is not None, controller_name
2728
- with rich_utils.safe_status(
2729
- ux_utils.spinner_message('Checking for live services')):
2879
+ with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
2730
2880
  try:
2731
- services = serve_lib.status()
2881
+ request_id = serve_lib.status(service_names=None)
2882
+ services = sdk.stream_and_get(request_id)
2732
2883
  except exceptions.ClusterNotUpError as e:
2733
2884
  if controller.value.connection_error_hint in str(e):
2734
2885
  with ux_utils.print_exception_no_traceback():
@@ -2745,35 +2896,52 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str):
2745
2896
 
2746
2897
  if services:
2747
2898
  service_names = [service['name'] for service in services]
2748
- with ux_utils.print_exception_no_traceback():
2749
- msg = (
2750
- controller.value.decline_down_for_dirty_controller_hint.format(
2751
- service_names=', '.join(service_names)))
2752
- raise exceptions.NotSupportedError(msg)
2899
+ if purge:
2900
+ logger.warning('--purge is set, ignoring the in-progress services. '
2901
+ 'This could cause leaked clusters!')
2902
+ else:
2903
+ with ux_utils.print_exception_no_traceback():
2904
+ msg = (controller.value.decline_down_for_dirty_controller_hint.
2905
+ format(service_names=', '.join(service_names)))
2906
+ raise exceptions.NotSupportedError(msg)
2753
2907
  # Do nothing for STOPPED state, as it is safe to terminate the cluster.
2754
2908
  click.echo(f'Terminate sky serve controller: {controller_name}.')
2755
2909
 
2756
2910
 
2757
- _CONTROLLER_TO_HINT_OR_RAISE = {
2758
- controller_utils.Controllers.JOBS_CONTROLLER:
2759
- (_hint_or_raise_for_down_jobs_controller),
2760
- controller_utils.Controllers.SKY_SERVE_CONTROLLER:
2761
- (_hint_or_raise_for_down_sky_serve_controller),
2762
- }
2911
+ def _controller_to_hint_or_raise(
2912
+ controller: controller_utils.Controllers
2913
+ ) -> Callable[[str, bool], None]:
2914
+ if controller == controller_utils.Controllers.JOBS_CONTROLLER:
2915
+ return _hint_or_raise_for_down_jobs_controller
2916
+ return _hint_or_raise_for_down_sky_serve_controller
2763
2917
 
2764
2918
 
2765
2919
  def _down_or_stop_clusters(
2766
2920
  names: List[str],
2767
- apply_to_all: Optional[bool],
2768
- down: bool, # pylint: disable=redefined-outer-name
2769
- no_confirm: bool,
2921
+ apply_to_all: bool = False,
2922
+ all_users: bool = False,
2923
+ down: bool = False, # pylint: disable=redefined-outer-name
2924
+ no_confirm: bool = True,
2770
2925
  purge: bool = False,
2771
- idle_minutes_to_autostop: Optional[int] = None) -> None:
2926
+ idle_minutes_to_autostop: Optional[int] = None,
2927
+ async_call: bool = False) -> None:
2772
2928
  """Tears down or (auto-)stops a cluster (or all clusters).
2773
2929
 
2774
2930
  Controllers (jobs controller and sky serve controller) can only be
2775
2931
  terminated if the cluster name is explicitly and uniquely specified (not
2776
2932
  via glob).
2933
+
2934
+ Args:
2935
+ names: The names of the clusters to tear down or stop. If empty,
2936
+ apply_to_all or all_users must be set.
2937
+ apply_to_all: If True, apply the operation to all clusters.
2938
+ all_users: If True, apply the operation to all clusters for all users.
2939
+ down: If True, tear down the clusters.
2940
+ no_confirm: If True, skip the confirmation prompt.
2941
+ purge: If True, forcefully remove the clusters from the cluster table.
2942
+ idle_minutes_to_autostop: The number of minutes to wait before
2943
+ automatically stopping the cluster.
2944
+ async_call: If True, send the request asynchronously.
2777
2945
  """
2778
2946
  if down:
2779
2947
  command = 'down'
@@ -2781,17 +2949,12 @@ def _down_or_stop_clusters(
2781
2949
  command = 'autostop'
2782
2950
  else:
2783
2951
  command = 'stop'
2784
- if not names and apply_to_all is None:
2785
- # UX: frequently users may have only 1 cluster. In this case, 'sky
2786
- # stop/down' without args should be smart and default to that unique
2787
- # choice.
2788
- all_cluster_names = global_user_state.get_cluster_names_start_with('')
2789
- if len(all_cluster_names) <= 1:
2790
- names = all_cluster_names
2791
- else:
2792
- raise click.UsageError(
2793
- f'`sky {command}` requires either a cluster name or glob '
2794
- '(see `sky status`), or the -a/--all flag.')
2952
+ if not names and not apply_to_all and not all_users:
2953
+ raise click.UsageError(
2954
+ f'`sky {command}` requires either a cluster name or glob '
2955
+ '(see `sky status`), or the -a/--all flag for all your '
2956
+ 'clusters, or the -u/--all-users flag for all clusters in '
2957
+ 'your team.')
2795
2958
 
2796
2959
  operation = 'Terminating' if down else 'Stopping'
2797
2960
  if idle_minutes_to_autostop is not None:
@@ -2802,6 +2965,7 @@ def _down_or_stop_clusters(
2802
2965
  option_str = '{stop,down}'
2803
2966
  operation = f'{verb} auto{option_str} on'
2804
2967
 
2968
+ names = list(names)
2805
2969
  if names:
2806
2970
  controllers = [
2807
2971
  name for name in names
@@ -2809,8 +2973,9 @@ def _down_or_stop_clusters(
2809
2973
  ]
2810
2974
  controllers_str = ', '.join(map(repr, controllers))
2811
2975
  names = [
2812
- name for name in _get_glob_clusters(names)
2813
- if controller_utils.Controllers.from_name(name) is None
2976
+ cluster['name']
2977
+ for cluster in _get_cluster_records_and_set_ssh_config(names)
2978
+ if controller_utils.Controllers.from_name(cluster['name']) is None
2814
2979
  ]
2815
2980
 
2816
2981
  # Make sure the controllers are explicitly specified without other
@@ -2837,7 +3002,7 @@ def _down_or_stop_clusters(
2837
3002
  controller = controller_utils.Controllers.from_name(
2838
3003
  controller_name)
2839
3004
  assert controller is not None
2840
- hint_or_raise = _CONTROLLER_TO_HINT_OR_RAISE[controller]
3005
+ hint_or_raise = _controller_to_hint_or_raise(controller)
2841
3006
  try:
2842
3007
  # TODO(zhwu): This hint or raise is not transactional, which
2843
3008
  # means even if it passed the check with no in-progress spot
@@ -2846,7 +3011,7 @@ def _down_or_stop_clusters(
2846
3011
  # `sky serve up` before typing the delete, causing a leaked
2847
3012
  # managed job or service. We should make this check atomic
2848
3013
  # with the termination.
2849
- hint_or_raise(controller_name)
3014
+ hint_or_raise(controller_name, purge)
2850
3015
  except (exceptions.ClusterOwnerIdentityMismatchError,
2851
3016
  RuntimeError) as e:
2852
3017
  if purge:
@@ -2867,8 +3032,9 @@ def _down_or_stop_clusters(
2867
3032
  no_confirm = True
2868
3033
  names += controllers
2869
3034
 
2870
- if apply_to_all:
2871
- all_clusters = global_user_state.get_clusters()
3035
+ if apply_to_all or all_users:
3036
+ all_clusters = _get_cluster_records_and_set_ssh_config(
3037
+ clusters=None, all_users=all_users)
2872
3038
  if names:
2873
3039
  click.echo(
2874
3040
  f'Both --all and cluster(s) specified for `sky {command}`. '
@@ -2881,15 +3047,7 @@ def _down_or_stop_clusters(
2881
3047
  if controller_utils.Controllers.from_name(record['name']) is None
2882
3048
  ]
2883
3049
 
2884
- clusters = []
2885
- for name in names:
2886
- handle = global_user_state.get_handle_from_cluster_name(name)
2887
- if handle is None:
2888
- # This codepath is used for 'sky down -p <controller>' when the
2889
- # controller is not in 'sky status'. Cluster-not-found message
2890
- # should've been printed by _get_glob_clusters() above.
2891
- continue
2892
- clusters.append(name)
3050
+ clusters = names
2893
3051
  usage_lib.record_cluster_name_for_current_operation(clusters)
2894
3052
 
2895
3053
  if not clusters:
@@ -2910,15 +3068,21 @@ def _down_or_stop_clusters(
2910
3068
  progress = rich_progress.Progress(transient=True,
2911
3069
  redirect_stdout=False,
2912
3070
  redirect_stderr=False)
2913
- task = progress.add_task(ux_utils.spinner_message(
2914
- f'{operation} {len(clusters)} cluster{plural}'),
2915
- total=len(clusters))
3071
+ task = progress.add_task(
3072
+ f'[bold cyan]{operation} {len(clusters)} cluster{plural}[/]',
3073
+ total=len(clusters))
3074
+
3075
+ request_ids = []
2916
3076
 
2917
3077
  def _down_or_stop(name: str):
2918
3078
  success_progress = False
2919
3079
  if idle_minutes_to_autostop is not None:
2920
3080
  try:
2921
- core.autostop(name, idle_minutes_to_autostop, down)
3081
+ request_id = sdk.autostop(name, idle_minutes_to_autostop, down)
3082
+ request_ids.append(request_id)
3083
+ _async_call_or_wait(
3084
+ request_id, async_call,
3085
+ server_constants.REQUEST_NAME_PREFIX + operation)
2922
3086
  except (exceptions.NotSupportedError,
2923
3087
  exceptions.ClusterNotUpError) as e:
2924
3088
  message = str(e)
@@ -2941,9 +3105,17 @@ def _down_or_stop_clusters(
2941
3105
  else:
2942
3106
  try:
2943
3107
  if down:
2944
- core.down(name, purge=purge)
3108
+ request_id = sdk.down(name, purge=purge)
2945
3109
  else:
2946
- core.stop(name, purge=purge)
3110
+ request_id = sdk.stop(name, purge=purge)
3111
+ request_ids.append(request_id)
3112
+ _async_call_or_wait(
3113
+ request_id, async_call,
3114
+ server_constants.REQUEST_NAME_PREFIX + operation)
3115
+ if not async_call:
3116
+ # Remove the cluster from the SSH config file as soon as it
3117
+ # is stopped or downed.
3118
+ cluster_utils.SSHConfigHelper.remove_cluster(name)
2947
3119
  except RuntimeError as e:
2948
3120
  message = (
2949
3121
  f'{colorama.Fore.RED}{operation} cluster {name}...failed. '
@@ -2974,6 +3146,10 @@ def _down_or_stop_clusters(
2974
3146
  # Make sure the progress bar not mess up the terminal.
2975
3147
  progress.refresh()
2976
3148
 
3149
+ if async_call:
3150
+ click.secho(f'{operation} requests are sent. Check the requests\' '
3151
+ 'status with `sky request get <request_id>`.')
3152
+
2977
3153
 
2978
3154
  @cli.command(cls=_DocumentedCodeCommand)
2979
3155
  @click.argument('clouds', required=False, type=str, nargs=-1)
@@ -2983,6 +3159,7 @@ def _down_or_stop_clusters(
2983
3159
  default=False,
2984
3160
  help='Show the activated account for each cloud.')
2985
3161
  @usage_lib.entrypoint
3162
+ # pylint: disable=redefined-outer-name
2986
3163
  def check(clouds: Tuple[str], verbose: bool):
2987
3164
  """Check which clouds are available to use.
2988
3165
 
@@ -3005,8 +3182,13 @@ def check(clouds: Tuple[str], verbose: bool):
3005
3182
  # Check only specific clouds - AWS and GCP.
3006
3183
  sky check aws gcp
3007
3184
  """
3008
- clouds_arg = clouds if clouds else None
3009
- sky_check.check(verbose=verbose, clouds=clouds_arg)
3185
+ clouds_arg = clouds if len(clouds) > 0 else None
3186
+ request_id = sdk.check(clouds=clouds_arg, verbose=verbose)
3187
+ sdk.stream_and_get(request_id)
3188
+ api_server_url = server_common.get_server_url()
3189
+ click.echo()
3190
+ click.echo(
3191
+ click.style(f'Using SkyPilot API server: {api_server_url}', fg='green'))
3010
3192
 
3011
3193
 
3012
3194
  @cli.command()
@@ -3099,23 +3281,27 @@ def show_gpus(
3099
3281
  '--all-regions and --region flags cannot be used simultaneously.')
3100
3282
 
3101
3283
  # This will validate 'cloud' and raise if not found.
3102
- cloud_obj = sky_clouds.CLOUD_REGISTRY.from_str(cloud)
3103
- cloud_name = cloud_obj.canonical_name() if cloud_obj is not None else None
3104
- service_catalog.validate_region_zone(region, None, clouds=cloud_name)
3284
+ cloud_obj = registry.CLOUD_REGISTRY.from_str(cloud)
3285
+ cloud_name = str(cloud_obj).lower() if cloud is not None else None
3105
3286
  show_all = all
3106
3287
  if show_all and accelerator_str is not None:
3107
3288
  raise click.UsageError('--all is only allowed without a GPU name.')
3108
3289
 
3109
3290
  # Kubernetes specific bools
3110
- cloud_is_kubernetes = isinstance(cloud_obj, sky_clouds.Kubernetes)
3291
+ enabled_clouds = sdk.get(sdk.enabled_clouds())
3292
+ cloud_is_kubernetes = isinstance(cloud_obj, clouds.Kubernetes)
3293
+ # TODO(romilb): We should move this to the backend.
3111
3294
  kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None
3112
- kubernetes_is_enabled = sky_clouds.cloud_in_iterable(
3113
- sky_clouds.Kubernetes(), global_user_state.get_cached_enabled_clouds())
3114
- no_permissions_str = '<no permissions>'
3295
+ kubernetes_is_enabled = clouds.cloud_in_iterable(
3296
+ clouds.Kubernetes(),
3297
+ enabled_clouds,
3298
+ )
3115
3299
 
3116
3300
  def _list_to_str(lst):
3117
3301
  return ', '.join([str(e) for e in lst])
3118
3302
 
3303
+ # TODO(zhwu,romilb): We should move most of these kubernetes related
3304
+ # queries into the backend, especially behind the server.
3119
3305
  def _get_kubernetes_realtime_gpu_table(
3120
3306
  context: Optional[str] = None,
3121
3307
  name_filter: Optional[str] = None,
@@ -3128,19 +3314,12 @@ def show_gpus(
3128
3314
  free_header = 'TOTAL_FREE_GPUS'
3129
3315
  realtime_gpu_table = log_utils.create_table(
3130
3316
  ['GPU', qty_header, 'TOTAL_GPUS', free_header])
3131
- counts, capacity, available = service_catalog.list_accelerator_realtime(
3132
- gpus_only=True,
3133
- clouds='kubernetes',
3134
- name_filter=name_filter,
3135
- region_filter=context,
3136
- quantity_filter=quantity_filter,
3137
- case_sensitive=False)
3138
- assert (set(counts.keys()) == set(capacity.keys()) == set(
3139
- available.keys())), (f'Keys of counts ({list(counts.keys())}), '
3140
- f'capacity ({list(capacity.keys())}), '
3141
- f'and available ({list(available.keys())}) '
3142
- 'must be same.')
3143
- if not counts:
3317
+ realtime_gpu_availability_list = sdk.stream_and_get(
3318
+ sdk.realtime_kubernetes_gpu_availability(
3319
+ context=context,
3320
+ name_filter=name_filter,
3321
+ quantity_filter=quantity_filter))
3322
+ if not realtime_gpu_availability_list:
3144
3323
  err_msg = 'No GPUs found in Kubernetes cluster. '
3145
3324
  debug_msg = 'To further debug, run: sky check '
3146
3325
  if name_filter is not None:
@@ -3152,24 +3331,32 @@ def show_gpus(
3152
3331
  'in Kubernetes cluster. ')
3153
3332
  debug_msg = ('To show available accelerators on kubernetes,'
3154
3333
  ' run: sky show-gpus --cloud kubernetes ')
3155
- full_err_msg = (err_msg +
3156
- kubernetes_utils.NO_ACCELERATOR_HELP_MESSAGE +
3334
+ full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
3157
3335
  debug_msg)
3158
3336
  raise ValueError(full_err_msg)
3159
- for gpu, _ in sorted(counts.items()):
3160
- available_qty = available[gpu] if available[gpu] != -1 else (
3161
- no_permissions_str)
3337
+ no_permissions_str = '<no permissions>'
3338
+ for realtime_gpu_availability in sorted(realtime_gpu_availability_list):
3339
+ gpu_availability = models.RealtimeGpuAvailability(
3340
+ *realtime_gpu_availability)
3341
+ available_qty = (gpu_availability.available
3342
+ if gpu_availability.available != -1 else
3343
+ no_permissions_str)
3162
3344
  realtime_gpu_table.add_row([
3163
- gpu,
3164
- _list_to_str(counts.pop(gpu)), capacity[gpu], available_qty
3345
+ gpu_availability.gpu,
3346
+ _list_to_str(gpu_availability.counts),
3347
+ gpu_availability.capacity,
3348
+ available_qty,
3165
3349
  ])
3166
3350
  return realtime_gpu_table
3167
3351
 
3352
+ # TODO(zhwu): this needs to run on remote server.
3168
3353
  def _get_kubernetes_node_info_table(context: Optional[str]):
3169
3354
  node_table = log_utils.create_table(
3170
3355
  ['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS'])
3171
3356
 
3172
- node_info_dict = kubernetes_utils.get_kubernetes_node_info(context)
3357
+ no_permissions_str = '<no permissions>'
3358
+ node_info_dict = sdk.stream_and_get(
3359
+ sdk.kubernetes_node_info(context=context))
3173
3360
  for node_name, node_info in node_info_dict.items():
3174
3361
  available = node_info.free[
3175
3362
  'accelerators_available'] if node_info.free[
@@ -3180,7 +3367,7 @@ def show_gpus(
3180
3367
  ])
3181
3368
  return node_table
3182
3369
 
3183
- def _output():
3370
+ def _output() -> Generator[str, None, None]:
3184
3371
  gpu_table = log_utils.create_table(
3185
3372
  ['COMMON_GPU', 'AVAILABLE_QUANTITIES'])
3186
3373
  tpu_table = log_utils.create_table(
@@ -3193,7 +3380,7 @@ def show_gpus(
3193
3380
  # Optimization - do not poll for Kubernetes API for fetching
3194
3381
  # common GPUs because that will be fetched later for the table after
3195
3382
  # common GPUs.
3196
- clouds_to_list = cloud_name
3383
+ clouds_to_list: Union[Optional[str], List[str]] = cloud_name
3197
3384
  if cloud_name is None:
3198
3385
  clouds_to_list = [
3199
3386
  c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes'
@@ -3206,12 +3393,8 @@ def show_gpus(
3206
3393
  # If cloud is kubernetes, we want to show real-time capacity
3207
3394
  if kubernetes_is_enabled and (cloud_name is None or
3208
3395
  cloud_is_kubernetes):
3209
- if region:
3210
- context = region
3211
- else:
3212
- # If region is not specified, we use the current context
3213
- context = (
3214
- kubernetes_utils.get_current_kube_config_context_name())
3396
+ context = region
3397
+
3215
3398
  try:
3216
3399
  # If --cloud kubernetes is not specified, we want to catch
3217
3400
  # the case where no GPUs are available on the cluster and
@@ -3225,8 +3408,9 @@ def show_gpus(
3225
3408
  k8s_messages += str(e)
3226
3409
  else:
3227
3410
  print_section_titles = True
3411
+ context_str = f'(Context: {context})' if context else ''
3228
3412
  yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3229
- f'Kubernetes GPUs (context: {context})'
3413
+ f'Kubernetes GPUs {context_str}'
3230
3414
  f'{colorama.Style.RESET_ALL}\n')
3231
3415
  yield from k8s_realtime_table.get_string()
3232
3416
  k8s_node_table = _get_kubernetes_node_info_table(context)
@@ -3262,11 +3446,14 @@ def show_gpus(
3262
3446
  yield k8s_messages
3263
3447
  yield '\n\n'
3264
3448
 
3265
- result = service_catalog.list_accelerator_counts(
3266
- gpus_only=True,
3267
- clouds=clouds_to_list,
3268
- region_filter=region,
3269
- )
3449
+ result = sdk.stream_and_get(
3450
+ sdk.list_accelerator_counts(
3451
+ gpus_only=True,
3452
+ clouds=clouds_to_list,
3453
+ region_filter=region,
3454
+ ))
3455
+ # TODO(zhwu): handle the case where no accelerators are found,
3456
+ # especially when --region specified a non-existent region.
3270
3457
 
3271
3458
  if print_section_titles:
3272
3459
  # If section titles were printed above, print again here
@@ -3354,16 +3541,17 @@ def show_gpus(
3354
3541
 
3355
3542
  # For clouds other than Kubernetes, get the accelerator details
3356
3543
  # Case-sensitive
3357
- result = service_catalog.list_accelerators(gpus_only=True,
3358
- name_filter=name,
3359
- quantity_filter=quantity,
3360
- region_filter=region,
3361
- clouds=clouds_to_list,
3362
- case_sensitive=False,
3363
- all_regions=all_regions)
3544
+ result = sdk.stream_and_get(
3545
+ sdk.list_accelerators(gpus_only=True,
3546
+ name_filter=name,
3547
+ quantity_filter=quantity,
3548
+ region_filter=region,
3549
+ clouds=clouds_to_list,
3550
+ case_sensitive=False,
3551
+ all_regions=all_regions))
3364
3552
  # Import here to save module load speed.
3365
3553
  # pylint: disable=import-outside-toplevel,line-too-long
3366
- from sky.clouds.service_catalog import common
3554
+ from sky.clouds.service_catalog import common as catalog_common
3367
3555
 
3368
3556
  # For each gpu name (count not included):
3369
3557
  # - Group by cloud
@@ -3384,7 +3572,7 @@ def show_gpus(
3384
3572
  df = df.sort_values(by=['min_price', 'min_spot_price'])
3385
3573
  df = df.drop(columns=['min_price', 'min_spot_price'])
3386
3574
  sorted_dataclasses = [
3387
- common.InstanceTypeInfo(*row)
3575
+ catalog_common.InstanceTypeInfo(*row)
3388
3576
  for row in df.to_records(index=False)
3389
3577
  ]
3390
3578
  new_result[gpu] = sorted_dataclasses
@@ -3459,10 +3647,11 @@ def show_gpus(
3459
3647
  yield '\n\n'
3460
3648
  yield from accelerator_table.get_string()
3461
3649
 
3650
+ outputs = _output()
3462
3651
  if show_all:
3463
- click.echo_via_pager(_output())
3652
+ click.echo_via_pager(outputs)
3464
3653
  else:
3465
- for out in _output():
3654
+ for out in outputs:
3466
3655
  click.echo(out, nl=False)
3467
3656
  click.echo()
3468
3657
 
@@ -3474,18 +3663,20 @@ def storage():
3474
3663
 
3475
3664
 
3476
3665
  @storage.command('ls', cls=_DocumentedCodeCommand)
3477
- @click.option('--all',
3478
- '-a',
3666
+ @click.option('--verbose',
3667
+ '-v',
3479
3668
  default=False,
3480
3669
  is_flag=True,
3481
3670
  required=False,
3482
3671
  help='Show all information in full.')
3483
3672
  @usage_lib.entrypoint
3484
3673
  # pylint: disable=redefined-builtin
3485
- def storage_ls(all: bool):
3674
+ def storage_ls(verbose: bool):
3486
3675
  """List storage objects managed by SkyPilot."""
3487
- storages = sky.storage_ls()
3488
- storage_table = storage_utils.format_storage_table(storages, show_all=all)
3676
+ request_id = sdk.storage_ls()
3677
+ storages = sdk.stream_and_get(request_id)
3678
+ storage_table = storage_utils.format_storage_table(storages,
3679
+ show_all=verbose)
3489
3680
  click.echo(storage_table)
3490
3681
 
3491
3682
 
@@ -3507,8 +3698,9 @@ def storage_ls(all: bool):
3507
3698
  is_flag=True,
3508
3699
  required=False,
3509
3700
  help='Skip confirmation prompt.')
3701
+ @_add_click_options(_COMMON_OPTIONS)
3510
3702
  @usage_lib.entrypoint
3511
- def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=redefined-builtin
3703
+ def storage_delete(names: List[str], all: bool, yes: bool, async_call: bool): # pylint: disable=redefined-builtin
3512
3704
  """Delete storage objects.
3513
3705
 
3514
3706
  Examples:
@@ -3527,9 +3719,8 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
3527
3719
  if sum([bool(names), all]) != 1:
3528
3720
  raise click.UsageError('Either --all or a name must be specified.')
3529
3721
  if all:
3530
- # Use '*' to get all storages.
3531
- names = global_user_state.get_glob_storage_name(storage_name='*')
3532
- if not names:
3722
+ storages = sdk.get(sdk.storage_ls())
3723
+ if not storages:
3533
3724
  click.echo('No storage(s) to delete.')
3534
3725
  return
3535
3726
  else:
@@ -3545,19 +3736,25 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
3545
3736
  abort=True,
3546
3737
  show_default=True)
3547
3738
 
3548
- def delete_storage(name: str) -> None:
3739
+ request_ids = {}
3740
+ # TODO(zhwu): Support all flag for the underlying SDK and API server to
3741
+ # avoid multiple requests.
3742
+ for name in names:
3743
+ request_ids[name] = sdk.storage_delete(name)
3744
+
3745
+ for name, request_id in request_ids.items():
3549
3746
  try:
3550
- sky.storage_delete(name)
3747
+ _async_call_or_wait(request_id, async_call, 'sky.storage')
3551
3748
  except Exception as e: # pylint: disable=broad-except
3552
- click.secho(f'Error deleting storage {name}: {e}', fg='red')
3553
-
3554
- subprocess_utils.run_in_parallel(delete_storage, names)
3749
+ logger.error(f'{colorama.Fore.RED}Error deleting storage {name}: '
3750
+ f'{common_utils.format_exception(e, use_bracket=True)}'
3751
+ f'{colorama.Style.RESET_ALL}')
3555
3752
 
3556
3753
 
3557
- @cli.group(cls=_NaturalOrderGroup)
3754
+ @cli.group(cls=_NaturalOrderGroup, hidden=True)
3558
3755
  def bench():
3559
3756
  """SkyPilot Benchmark CLI."""
3560
- pass
3757
+ raise click.UsageError('The benchmark CLI is currently disabled.')
3561
3758
 
3562
3759
 
3563
3760
  @cli.group(cls=_NaturalOrderGroup)
@@ -3573,7 +3770,8 @@ def jobs():
3573
3770
  nargs=-1,
3574
3771
  **_get_shell_complete_args(_complete_file_name))
3575
3772
  # TODO(zhwu): Add --dryrun option to test the launch command.
3576
- @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS)
3773
+ @_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS +
3774
+ _COMMON_OPTIONS)
3577
3775
  @click.option('--cluster',
3578
3776
  '-c',
3579
3777
  default=None,
@@ -3622,6 +3820,7 @@ def jobs_launch(
3622
3820
  ports: Tuple[str],
3623
3821
  detach_run: bool,
3624
3822
  yes: bool,
3823
+ async_call: bool,
3625
3824
  ):
3626
3825
  """Launch a managed job from a YAML or a command.
3627
3826
 
@@ -3678,36 +3877,25 @@ def jobs_launch(
3678
3877
  dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
3679
3878
  dag_utils.fill_default_config_in_dag_for_job_launch(dag)
3680
3879
 
3681
- dag, _ = admin_policy_utils.apply(
3682
- dag, use_mutated_config_in_current_request=False)
3683
-
3684
- if yes:
3685
- # Skip resource preview if -y is set, since we are probably running in
3686
- # a script and the user won't have a chance to review it anyway.
3687
- # This can save a couple of seconds.
3688
- click.secho(
3689
- f'Resources for managed job {dag.name!r} will be computed on the '
3690
- 'managed jobs controller, since --yes is set.',
3691
- fg='cyan')
3692
-
3693
- else:
3694
- click.secho(
3695
- f'Managed job {dag.name!r} will be launched on (estimated):',
3696
- fg='cyan')
3697
- dag = sky.optimize(dag)
3698
-
3699
- prompt = f'Launching a managed job {dag.name!r}. Proceed?'
3700
- if prompt is not None:
3701
- click.confirm(prompt, default=True, abort=True, show_default=True)
3702
-
3703
3880
  common_utils.check_cluster_name_is_valid(name)
3704
3881
 
3705
- managed_jobs.launch(dag, name, detach_run=detach_run)
3882
+ click.secho(f'Managed job {dag.name!r} will be launched on (estimated):',
3883
+ fg='yellow')
3884
+
3885
+ request_id = managed_jobs.launch(dag, name, _need_confirmation=not yes)
3886
+ job_id_handle = _async_call_or_wait(request_id, async_call,
3887
+ 'sky.jobs.launch')
3888
+ if not async_call and not detach_run:
3889
+ job_id = job_id_handle[0]
3890
+ managed_jobs.tail_logs(name=None,
3891
+ job_id=job_id,
3892
+ follow=True,
3893
+ controller=False)
3706
3894
 
3707
3895
 
3708
3896
  @jobs.command('queue', cls=_DocumentedCodeCommand)
3709
- @click.option('--all',
3710
- '-a',
3897
+ @click.option('--verbose',
3898
+ '-v',
3711
3899
  default=False,
3712
3900
  is_flag=True,
3713
3901
  required=False,
@@ -3728,7 +3916,7 @@ def jobs_launch(
3728
3916
  help='Show only pending/running jobs\' information.')
3729
3917
  @usage_lib.entrypoint
3730
3918
  # pylint: disable=redefined-builtin
3731
- def jobs_queue(all: bool, refresh: bool, skip_finished: bool):
3919
+ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool):
3732
3920
  """Show statuses of managed jobs.
3733
3921
 
3734
3922
  Each managed jobs can have one of the following statuses:
@@ -3782,13 +3970,13 @@ def jobs_queue(all: bool, refresh: bool, skip_finished: bool):
3782
3970
  watch -n60 sky jobs queue
3783
3971
 
3784
3972
  """
3785
- click.secho('Fetching managed jobs...', fg='cyan')
3786
- with rich_utils.safe_status(
3787
- ux_utils.spinner_message('Checking managed jobs')):
3788
- _, msg = _get_managed_jobs(refresh=refresh,
3789
- skip_finished=skip_finished,
3790
- show_all=all,
3791
- is_called_by_user=True)
3973
+ click.secho('Fetching managed job statuses...', fg='cyan')
3974
+ with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
3975
+ managed_jobs_request_id = managed_jobs.queue(
3976
+ refresh=refresh, skip_finished=skip_finished)
3977
+ _, msg = _handle_jobs_queue_request(managed_jobs_request_id,
3978
+ show_all=verbose,
3979
+ is_called_by_user=True)
3792
3980
  if not skip_finished:
3793
3981
  in_progress_only_hint = ''
3794
3982
  else:
@@ -3835,13 +4023,6 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
3835
4023
  # Cancel managed jobs with IDs 1, 2, 3
3836
4024
  $ sky jobs cancel 1 2 3
3837
4025
  """
3838
- with rich_utils.safe_status(
3839
- ux_utils.spinner_message('Checking managed jobs')):
3840
- backend_utils.is_controller_accessible(
3841
- controller=controller_utils.Controllers.JOBS_CONTROLLER,
3842
- stopped_message='All managed jobs should have finished.',
3843
- exit_if_not_accessible=True)
3844
-
3845
4026
  job_id_str = ','.join(map(str, job_ids))
3846
4027
  if sum([bool(job_ids), name is not None, all]) != 1:
3847
4028
  argument_str = f'--job-ids {job_id_str}' if job_ids else ''
@@ -3861,7 +4042,7 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
3861
4042
  abort=True,
3862
4043
  show_default=True)
3863
4044
 
3864
- managed_jobs.cancel(job_ids=job_ids, name=name, all=all)
4045
+ sdk.stream_and_get(managed_jobs.cancel(job_ids=job_ids, name=name, all=all))
3865
4046
 
3866
4047
 
3867
4048
  @jobs.command('logs', cls=_DocumentedCodeCommand)
@@ -3903,10 +4084,19 @@ def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
3903
4084
  """Tail or sync down the log of a managed job."""
3904
4085
  try:
3905
4086
  if sync_down:
3906
- managed_jobs.sync_down_logs(name=name,
3907
- job_id=job_id,
3908
- controller=controller,
3909
- refresh=refresh)
4087
+ with rich_utils.client_status(
4088
+ ux_utils.spinner_message('Downloading jobs logs')):
4089
+ log_local_path_dict = managed_jobs.download_logs(
4090
+ name=name,
4091
+ job_id=job_id,
4092
+ controller=controller,
4093
+ refresh=refresh)
4094
+ style = colorama.Style
4095
+ fore = colorama.Fore
4096
+ controller_str = ' (controller)' if controller else ''
4097
+ for job, log_local_path in log_local_path_dict.items():
4098
+ logger.info(f'{fore.CYAN}Job {job} logs{controller_str}: '
4099
+ f'{log_local_path}{style.RESET_ALL}')
3910
4100
  else:
3911
4101
  managed_jobs.tail_logs(name=name,
3912
4102
  job_id=job_id,
@@ -3919,62 +4109,10 @@ def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
3919
4109
 
3920
4110
 
3921
4111
  @jobs.command('dashboard', cls=_DocumentedCodeCommand)
3922
- @click.option(
3923
- '--port',
3924
- '-p',
3925
- default=None,
3926
- type=int,
3927
- required=False,
3928
- help=('Local port to use for the dashboard. If None, a free port is '
3929
- 'automatically chosen.'))
3930
4112
  @usage_lib.entrypoint
3931
- def jobs_dashboard(port: Optional[int]):
3932
- """Opens a dashboard for managed jobs (needs controller to be UP)."""
3933
- # TODO(zongheng): ideally, the controller/dashboard server should expose the
3934
- # API perhaps via REST. Then here we would (1) not have to use SSH to try to
3935
- # see if the controller is UP first, which is slow; (2) not have to run SSH
3936
- # port forwarding first (we'd just launch a local dashboard which would make
3937
- # REST API calls to the controller dashboard server).
3938
- click.secho('Checking if jobs controller is up...', fg='cyan')
3939
- hint = ('Dashboard is not available if jobs controller is not up. Run a '
3940
- 'managed job first.')
3941
- backend_utils.is_controller_accessible(
3942
- controller=controller_utils.Controllers.JOBS_CONTROLLER,
3943
- stopped_message=hint,
3944
- non_existent_message=hint,
3945
- exit_if_not_accessible=True)
3946
-
3947
- # SSH forward a free local port to remote's dashboard port.
3948
- remote_port = constants.SPOT_DASHBOARD_REMOTE_PORT
3949
- if port is None:
3950
- free_port = common_utils.find_free_port(remote_port)
3951
- else:
3952
- free_port = port
3953
- ssh_command = (
3954
- f'ssh -qNL {free_port}:localhost:{remote_port} '
3955
- f'{controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name}')
3956
- click.echo('Forwarding port: ', nl=False)
3957
- click.secho(f'{ssh_command}', dim=True)
3958
-
3959
- with subprocess.Popen(ssh_command, shell=True,
3960
- start_new_session=True) as ssh_process:
3961
- time.sleep(3) # Added delay for ssh_command to initialize.
3962
- webbrowser.open(f'http://localhost:{free_port}')
3963
- click.secho(
3964
- f'Dashboard is now available at: http://127.0.0.1:{free_port}',
3965
- fg='green')
3966
- try:
3967
- ssh_process.wait()
3968
- except KeyboardInterrupt:
3969
- # When user presses Ctrl-C in terminal, exits the previous ssh
3970
- # command so that <free local port> is freed up.
3971
- try:
3972
- os.killpg(os.getpgid(ssh_process.pid), signal.SIGTERM)
3973
- except ProcessLookupError:
3974
- # This happens if jobs controller is auto-stopped.
3975
- pass
3976
- finally:
3977
- click.echo('Exiting.')
4113
+ def jobs_dashboard():
4114
+ """Opens a dashboard for managed jobs."""
4115
+ managed_jobs.dashboard()
3978
4116
 
3979
4117
 
3980
4118
  @cli.group(cls=_NaturalOrderGroup)
@@ -4111,7 +4249,7 @@ def _generate_task_with_service(
4111
4249
  type=str,
4112
4250
  help='A service name. Unique for each service. If not provided, '
4113
4251
  'a unique name is autogenerated.')
4114
- @_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS)
4252
+ @_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS + _COMMON_OPTIONS)
4115
4253
  @click.option('--yes',
4116
4254
  '-y',
4117
4255
  is_flag=True,
@@ -4140,6 +4278,7 @@ def serve_up(
4140
4278
  disk_size: Optional[int],
4141
4279
  disk_tier: Optional[str],
4142
4280
  yes: bool,
4281
+ async_call: bool,
4143
4282
  ):
4144
4283
  """Launch a SkyServe service.
4145
4284
 
@@ -4200,16 +4339,9 @@ def serve_up(
4200
4339
  fg='cyan')
4201
4340
  with sky.Dag() as dag:
4202
4341
  dag.add(task)
4203
- dag, _ = admin_policy_utils.apply(
4204
- dag, use_mutated_config_in_current_request=False)
4205
- sky.optimize(dag)
4206
-
4207
- if not yes:
4208
- prompt = f'Launching a new service {service_name!r}. Proceed?'
4209
- if prompt is not None:
4210
- click.confirm(prompt, default=True, abort=True, show_default=True)
4211
4342
 
4212
- serve_lib.up(task, service_name)
4343
+ request_id = serve_lib.up(task, service_name, _need_confirmation=not yes)
4344
+ _async_call_or_wait(request_id, async_call, 'sky.serve.up')
4213
4345
 
4214
4346
 
4215
4347
  # TODO(MaoZiming): Update Doc.
@@ -4222,7 +4354,7 @@ def serve_up(
4222
4354
  type=str,
4223
4355
  nargs=-1,
4224
4356
  **_get_shell_complete_args(_complete_file_name))
4225
- @_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS)
4357
+ @_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS + _COMMON_OPTIONS)
4226
4358
  @click.option('--mode',
4227
4359
  default=serve_lib.DEFAULT_UPDATE_MODE.value,
4228
4360
  type=click.Choice([m.value for m in serve_lib.UpdateMode],
@@ -4239,28 +4371,16 @@ def serve_up(
4239
4371
  help='Skip confirmation prompt.')
4240
4372
  @timeline.event
4241
4373
  @usage_lib.entrypoint
4242
- def serve_update(
4243
- service_name: str,
4244
- service_yaml: Tuple[str, ...],
4245
- workdir: Optional[str],
4246
- cloud: Optional[str],
4247
- region: Optional[str],
4248
- zone: Optional[str],
4249
- num_nodes: Optional[int],
4250
- use_spot: Optional[bool],
4251
- image_id: Optional[str],
4252
- env_file: Optional[Dict[str, str]],
4253
- env: List[Tuple[str, str]],
4254
- gpus: Optional[str],
4255
- instance_type: Optional[str],
4256
- ports: Tuple[str],
4257
- cpus: Optional[str],
4258
- memory: Optional[str],
4259
- disk_size: Optional[int],
4260
- disk_tier: Optional[str],
4261
- mode: str,
4262
- yes: bool,
4263
- ):
4374
+ def serve_update(service_name: str, service_yaml: Tuple[str, ...],
4375
+ workdir: Optional[str], cloud: Optional[str],
4376
+ region: Optional[str], zone: Optional[str],
4377
+ num_nodes: Optional[int], use_spot: Optional[bool],
4378
+ image_id: Optional[str], env_file: Optional[Dict[str, str]],
4379
+ env: List[Tuple[str, str]], gpus: Optional[str],
4380
+ instance_type: Optional[str], ports: Tuple[str],
4381
+ cpus: Optional[str], memory: Optional[str],
4382
+ disk_size: Optional[int], disk_tier: Optional[str], mode: str,
4383
+ yes: bool, async_call: bool):
4264
4384
  """Update a SkyServe service.
4265
4385
 
4266
4386
  service_yaml must point to a valid YAML file.
@@ -4318,22 +4438,17 @@ def serve_update(
4318
4438
  fg='cyan')
4319
4439
  with sky.Dag() as dag:
4320
4440
  dag.add(task)
4321
- dag, _ = admin_policy_utils.apply(
4322
- dag, use_mutated_config_in_current_request=False)
4323
- sky.optimize(dag)
4324
4441
 
4325
- if not yes:
4326
- click.confirm(f'Updating service {service_name!r}. Proceed?',
4327
- default=True,
4328
- abort=True,
4329
- show_default=True)
4330
-
4331
- serve_lib.update(task, service_name, mode=serve_lib.UpdateMode(mode))
4442
+ request_id = serve_lib.update(task,
4443
+ service_name,
4444
+ mode=serve_lib.UpdateMode(mode),
4445
+ _need_confirmation=not yes)
4446
+ _async_call_or_wait(request_id, async_call, 'sky.serve.update')
4332
4447
 
4333
4448
 
4334
4449
  @serve.command('status', cls=_DocumentedCodeCommand)
4335
- @click.option('--all',
4336
- '-a',
4450
+ @click.option('--verbose',
4451
+ '-v',
4337
4452
  default=False,
4338
4453
  is_flag=True,
4339
4454
  required=False,
@@ -4346,7 +4461,7 @@ def serve_update(
4346
4461
  @click.argument('service_names', required=False, type=str, nargs=-1)
4347
4462
  @usage_lib.entrypoint
4348
4463
  # pylint: disable=redefined-builtin
4349
- def serve_status(all: bool, endpoint: bool, service_names: List[str]):
4464
+ def serve_status(verbose: bool, endpoint: bool, service_names: List[str]):
4350
4465
  """Show statuses of SkyServe services.
4351
4466
 
4352
4467
  Show detailed statuses of one or more services. If SERVICE_NAME is not
@@ -4433,17 +4548,22 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]):
4433
4548
  sky serve status
4434
4549
  \b
4435
4550
  # Show detailed status for all services
4436
- sky serve status -a
4551
+ sky serve status -v
4437
4552
  \b
4438
4553
  # Only show status of my-service
4439
4554
  sky serve status my-service
4440
4555
  """
4556
+ service_names_to_query: Optional[List[str]] = service_names
4557
+ if not service_names:
4558
+ service_names_to_query = None
4441
4559
  # This won't pollute the output of --endpoint.
4442
- with rich_utils.safe_status(ux_utils.spinner_message('Checking services')):
4443
- _, msg = _get_services(service_names,
4444
- show_all=all,
4445
- show_endpoint=endpoint,
4446
- is_called_by_user=True)
4560
+ with rich_utils.client_status('[cyan]Checking services[/]'):
4561
+ service_status_request_id = serve_lib.status(service_names_to_query)
4562
+ _, msg = _handle_services_request(service_status_request_id,
4563
+ service_names=service_names_to_query,
4564
+ show_all=verbose,
4565
+ show_endpoint=endpoint,
4566
+ is_called_by_user=True)
4447
4567
 
4448
4568
  if not endpoint:
4449
4569
  click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
@@ -4473,10 +4593,17 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]):
4473
4593
  default=None,
4474
4594
  type=int,
4475
4595
  help='Tear down a given replica')
4596
+ @_add_click_options(_COMMON_OPTIONS)
4476
4597
  # pylint: disable=redefined-builtin
4477
- def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool,
4478
- replica_id: Optional[int]):
4479
- """Teardown service(s) or a replica.
4598
+ def serve_down(
4599
+ service_names: List[str],
4600
+ all: bool,
4601
+ purge: bool,
4602
+ yes: bool,
4603
+ replica_id: Optional[int],
4604
+ async_call: bool,
4605
+ ) -> None:
4606
+ """Teardown service(s).
4480
4607
 
4481
4608
  SERVICE_NAMES is the name of the service (or glob pattern) to tear down. If
4482
4609
  both SERVICE_NAMES and ``--all`` are supplied, the latter takes precedence.
@@ -4527,12 +4654,6 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool,
4527
4654
  if all:
4528
4655
  raise click.UsageError('The --replica-id option cannot be used '
4529
4656
  'with the --all option.')
4530
-
4531
- backend_utils.is_controller_accessible(
4532
- controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
4533
- stopped_message='All services should have been terminated.',
4534
- exit_if_not_accessible=True)
4535
-
4536
4657
  if not yes:
4537
4658
  if replica_id_is_defined:
4538
4659
  click.confirm(
@@ -4543,8 +4664,8 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool,
4543
4664
  show_default=True)
4544
4665
  else:
4545
4666
  quoted_service_names = [f'{name!r}' for name in service_names]
4546
- service_identity_str = (f'service(s) '
4547
- f'{", ".join(quoted_service_names)}')
4667
+ list_service_str = ', '.join(quoted_service_names)
4668
+ service_identity_str = f'service(s) {list_service_str}'
4548
4669
  if all:
4549
4670
  service_identity_str = 'all services'
4550
4671
  click.confirm(f'Terminating {service_identity_str}. Proceed?',
@@ -4553,9 +4674,13 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool,
4553
4674
  show_default=True)
4554
4675
 
4555
4676
  if replica_id_is_defined:
4556
- serve_lib.terminate_replica(service_names[0], replica_id, purge)
4677
+ request_id = serve_lib.terminate_replica(service_names[0], replica_id,
4678
+ purge)
4557
4679
  else:
4558
- serve_lib.down(service_names=service_names, all=all, purge=purge)
4680
+ request_id = serve_lib.down(service_names=service_names,
4681
+ all=all,
4682
+ purge=purge)
4683
+ _async_call_or_wait(request_id, async_call, 'sky.serve.down')
4559
4684
 
4560
4685
 
4561
4686
  @serve.command('logs', cls=_DocumentedCodeCommand)
@@ -4682,7 +4807,7 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]:
4682
4807
  required=True,
4683
4808
  type=str,
4684
4809
  help='Benchmark name.')
4685
- @_add_click_options(_TASK_OPTIONS_WITH_NAME)
4810
+ @_add_click_options(_TASK_OPTIONS_WITH_NAME + _COMMON_OPTIONS)
4686
4811
  @click.option('--gpus',
4687
4812
  required=False,
4688
4813
  type=str,
@@ -4717,26 +4842,27 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]:
4717
4842
  help='Skip confirmation prompt.')
4718
4843
  @usage_lib.entrypoint
4719
4844
  def benchmark_launch(
4720
- entrypoint: str,
4721
- benchmark: str,
4722
- name: Optional[str],
4723
- workdir: Optional[str],
4724
- cloud: Optional[str],
4725
- region: Optional[str],
4726
- zone: Optional[str],
4727
- gpus: Optional[str],
4728
- num_nodes: Optional[int],
4729
- use_spot: Optional[bool],
4730
- image_id: Optional[str],
4731
- env_file: Optional[Dict[str, str]],
4732
- env: List[Tuple[str, str]],
4733
- cpus: Optional[str],
4734
- memory: Optional[str],
4735
- disk_size: Optional[int],
4736
- disk_tier: Optional[str],
4737
- ports: Tuple[str],
4738
- idle_minutes_to_autostop: Optional[int],
4739
- yes: bool,
4845
+ entrypoint: str,
4846
+ benchmark: str,
4847
+ name: Optional[str],
4848
+ workdir: Optional[str],
4849
+ cloud: Optional[str],
4850
+ region: Optional[str],
4851
+ zone: Optional[str],
4852
+ gpus: Optional[str],
4853
+ num_nodes: Optional[int],
4854
+ use_spot: Optional[bool],
4855
+ image_id: Optional[str],
4856
+ env_file: Optional[Dict[str, str]],
4857
+ env: List[Tuple[str, str]],
4858
+ cpus: Optional[str],
4859
+ memory: Optional[str],
4860
+ disk_size: Optional[int],
4861
+ disk_tier: Optional[str],
4862
+ ports: Tuple[str],
4863
+ idle_minutes_to_autostop: Optional[int],
4864
+ yes: bool,
4865
+ async_call: bool, # pylint: disable=unused-argument
4740
4866
  ) -> None:
4741
4867
  """Benchmark a task on different resources.
4742
4868
 
@@ -4745,6 +4871,7 @@ def benchmark_launch(
4745
4871
  Alternatively, specify the benchmarking resources in your YAML (see doc),
4746
4872
  which allows benchmarking on many more resource fields.
4747
4873
  """
4874
+ # TODO(zhwu): move benchmark to SkyPilot API server
4748
4875
  env = _merge_env_vars(env_file, env)
4749
4876
  record = benchmark_state.get_benchmark_from_name(benchmark)
4750
4877
  if record is not None:
@@ -5135,10 +5262,7 @@ def benchmark_down(
5135
5262
  continue
5136
5263
  to_stop.append(cluster)
5137
5264
 
5138
- _down_or_stop_clusters(to_stop,
5139
- apply_to_all=False,
5140
- down=True,
5141
- no_confirm=yes)
5265
+ _down_or_stop_clusters(to_stop, down=True, no_confirm=yes)
5142
5266
 
5143
5267
 
5144
5268
  @bench.command('delete', cls=_DocumentedCodeCommand)
@@ -5192,9 +5316,9 @@ def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
5192
5316
  progress = rich_progress.Progress(transient=True,
5193
5317
  redirect_stdout=False,
5194
5318
  redirect_stderr=False)
5195
- task = progress.add_task(ux_utils.spinner_message(
5196
- f'Deleting {len(to_delete)} benchmark{plural}'),
5197
- total=len(to_delete))
5319
+ task = progress.add_task(
5320
+ f'[bold cyan]Deleting {len(to_delete)} benchmark{plural}: ',
5321
+ total=len(to_delete))
5198
5322
 
5199
5323
  def _delete_benchmark(benchmark: str) -> None:
5200
5324
  clusters = benchmark_state.get_benchmark_clusters(benchmark)
@@ -5244,196 +5368,6 @@ def local():
5244
5368
  pass
5245
5369
 
5246
5370
 
5247
- def _deploy_local_cluster(gpus: bool):
5248
- cluster_created = False
5249
-
5250
- # Check if GPUs are available on the host
5251
- local_gpus_available = backend_utils.check_local_gpus()
5252
- gpus = gpus and local_gpus_available
5253
-
5254
- # Check if ~/.kube/config exists:
5255
- if os.path.exists(os.path.expanduser('~/.kube/config')):
5256
- curr_context = kubernetes_utils.get_current_kube_config_context_name()
5257
- skypilot_context = 'kind-skypilot'
5258
- if curr_context is not None and curr_context != skypilot_context:
5259
- click.echo(
5260
- f'Current context in kube config: {curr_context}'
5261
- '\nWill automatically switch to kind-skypilot after the local '
5262
- 'cluster is created.')
5263
- message_str = 'Creating local cluster{}...'
5264
- message_str = message_str.format((' with GPU support (this may take up '
5265
- 'to 15 minutes)') if gpus else '')
5266
- path_to_package = os.path.dirname(os.path.dirname(__file__))
5267
- up_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes',
5268
- 'create_cluster.sh')
5269
-
5270
- # Get directory of script and run it from there
5271
- cwd = os.path.dirname(os.path.abspath(up_script_path))
5272
- run_command = up_script_path + f' {common_utils.get_user_hash()}'
5273
- run_command = run_command + ' --gpus' if gpus else run_command
5274
- run_command = shlex.split(run_command)
5275
-
5276
- # Setup logging paths
5277
- run_timestamp = sky_logging.get_run_timestamp()
5278
- log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
5279
- 'local_up.log')
5280
- tail_cmd = 'tail -n100 -f ' + log_path
5281
-
5282
- click.echo(message_str)
5283
- style = colorama.Style
5284
- click.echo('To view detailed progress: '
5285
- f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
5286
-
5287
- returncode, _, stderr = log_lib.run_with_log(
5288
- cmd=run_command,
5289
- log_path=log_path,
5290
- require_outputs=True,
5291
- stream_logs=False,
5292
- line_processor=log_utils.SkyLocalUpLineProcessor(),
5293
- cwd=cwd)
5294
-
5295
- # Kind always writes to stderr even if it succeeds.
5296
- # If the failure happens after the cluster is created, we need
5297
- # to strip all stderr of "No kind clusters found.", which is
5298
- # printed when querying with kind get clusters.
5299
- stderr = stderr.replace('No kind clusters found.\n', '')
5300
-
5301
- if returncode == 0:
5302
- cluster_created = True
5303
- elif returncode == 100:
5304
- click.echo(f'{colorama.Fore.GREEN}Local cluster already '
5305
- f'exists.{style.RESET_ALL}\n'
5306
- 'If you want to delete it instead, run: sky local down')
5307
- else:
5308
- with ux_utils.print_exception_no_traceback():
5309
- raise RuntimeError(
5310
- 'Failed to create local cluster. '
5311
- f'Full log: {log_path}'
5312
- f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
5313
- # Run sky check
5314
- with rich_utils.safe_status(ux_utils.spinner_message('Running sky check')):
5315
- sky_check.check(clouds=['kubernetes'], quiet=True)
5316
- if cluster_created:
5317
- # Prepare completion message which shows CPU and GPU count
5318
- # Get number of CPUs
5319
- p = subprocess_utils.run(
5320
- 'kubectl get nodes -o jsonpath=\'{.items[0].status.capacity.cpu}\'',
5321
- capture_output=True)
5322
- num_cpus = int(p.stdout.decode('utf-8'))
5323
-
5324
- # GPU count/type parsing
5325
- gpu_message = ''
5326
- gpu_hint = ''
5327
- if gpus:
5328
- # Get GPU model by querying the node labels
5329
- label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
5330
- gpu_type_cmd = f'kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
5331
- try:
5332
- # Run the command and capture the output
5333
- gpu_count_output = subprocess.check_output(gpu_type_cmd,
5334
- shell=True,
5335
- text=True)
5336
- gpu_type_str = gpu_count_output.strip() + ' '
5337
- except subprocess.CalledProcessError as e:
5338
- output = str(e.output.decode('utf-8'))
5339
- logger.warning(f'Failed to get GPU type: {output}')
5340
- gpu_type_str = ''
5341
-
5342
- # Get number of GPUs (sum of nvidia.com/gpu resources)
5343
- gpu_count_command = 'kubectl get nodes -o=jsonpath=\'{range .items[*]}{.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}\' | awk \'{sum += $1} END {print sum}\'' # pylint: disable=line-too-long
5344
- try:
5345
- # Run the command and capture the output
5346
- gpu_count_output = subprocess.check_output(gpu_count_command,
5347
- shell=True,
5348
- text=True)
5349
- gpu_count = gpu_count_output.strip(
5350
- ) # Remove any extra whitespace
5351
- gpu_message = f' and {gpu_count} {gpu_type_str}GPUs'
5352
- except subprocess.CalledProcessError as e:
5353
- output = str(e.output.decode('utf-8'))
5354
- logger.warning(f'Failed to get GPU count: {output}')
5355
- gpu_message = f' with {gpu_type_str}GPU support'
5356
-
5357
- gpu_hint = (
5358
- '\nHint: To see the list of GPUs in the cluster, '
5359
- 'run \'sky show-gpus --cloud kubernetes\'') if gpus else ''
5360
-
5361
- if num_cpus < 2:
5362
- click.echo('Warning: Local cluster has less than 2 CPUs. '
5363
- 'This may cause issues with running tasks.')
5364
- click.echo(
5365
- f'\n{colorama.Fore.GREEN}Local Kubernetes cluster created '
5366
- 'successfully with '
5367
- f'{num_cpus} CPUs{gpu_message}.{style.RESET_ALL}\n`sky launch` can '
5368
- 'now run tasks locally.'
5369
- '\nHint: To change the number of CPUs, change your docker '
5370
- 'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.' # pylint: disable=line-too-long
5371
- f'{gpu_hint}')
5372
-
5373
-
5374
- def _deploy_remote_cluster(ip_file: str, ssh_user: str, ssh_key_path: str,
5375
- cleanup: bool):
5376
- success = False
5377
- path_to_package = os.path.dirname(os.path.dirname(__file__))
5378
- up_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes',
5379
- 'deploy_remote_cluster.sh')
5380
- # Get directory of script and run it from there
5381
- cwd = os.path.dirname(os.path.abspath(up_script_path))
5382
-
5383
- deploy_command = f'{up_script_path} {ip_file} {ssh_user} {ssh_key_path}'
5384
- if cleanup:
5385
- deploy_command += ' --cleanup'
5386
-
5387
- # Convert the command to a format suitable for subprocess
5388
- deploy_command = shlex.split(deploy_command)
5389
-
5390
- # Setup logging paths
5391
- run_timestamp = sky_logging.get_run_timestamp()
5392
- log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
5393
- 'local_up.log')
5394
- tail_cmd = 'tail -n100 -f ' + log_path
5395
-
5396
- # Check if ~/.kube/config exists:
5397
- if os.path.exists(os.path.expanduser('~/.kube/config')):
5398
- click.echo('Found existing kube config. '
5399
- 'It will be backed up to ~/.kube/config.bak.')
5400
- style = colorama.Style
5401
- click.echo('To view detailed progress: '
5402
- f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
5403
- if cleanup:
5404
- msg_str = 'Cleaning up remote cluster...'
5405
- else:
5406
- msg_str = 'Deploying remote cluster...'
5407
- with rich_utils.safe_status(f'[bold cyan]{msg_str}'):
5408
- returncode, _, stderr = log_lib.run_with_log(
5409
- cmd=deploy_command,
5410
- log_path=log_path,
5411
- require_outputs=True,
5412
- stream_logs=False,
5413
- line_processor=log_utils.SkyRemoteUpLineProcessor(),
5414
- cwd=cwd)
5415
- if returncode == 0:
5416
- success = True
5417
- else:
5418
- with ux_utils.print_exception_no_traceback():
5419
- raise RuntimeError(
5420
- 'Failed to deploy remote cluster. '
5421
- f'Full log: {log_path}'
5422
- f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
5423
-
5424
- if success:
5425
- if cleanup:
5426
- click.echo(f'{colorama.Fore.GREEN}'
5427
- '🎉 Remote cluster cleaned up successfully.'
5428
- f'{style.RESET_ALL}')
5429
- else:
5430
- click.echo('Cluster deployment done. You can now run tasks on '
5431
- 'this cluster.\nE.g., run a task with: '
5432
- 'sky launch --cloud kubernetes -- echo hello world.'
5433
- f'\n{colorama.Fore.GREEN}🎉 Remote cluster deployed '
5434
- f'successfully. {style.RESET_ALL}')
5435
-
5436
-
5437
5371
  @click.option('--gpus/--no-gpus',
5438
5372
  default=True,
5439
5373
  is_flag=True,
@@ -5456,9 +5390,10 @@ def _deploy_remote_cluster(ip_file: str, ssh_user: str, ssh_key_path: str,
5456
5390
  is_flag=True,
5457
5391
  help='Clean up the remote cluster instead of deploying it.')
5458
5392
  @local.command('up', cls=_DocumentedCodeCommand)
5393
+ @_add_click_options(_COMMON_OPTIONS)
5459
5394
  @usage_lib.entrypoint
5460
5395
  def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
5461
- cleanup: bool):
5396
+ cleanup: bool, async_call: bool):
5462
5397
  """Creates a local or remote cluster."""
5463
5398
 
5464
5399
  def _validate_args(ips, ssh_user, ssh_key_path, cleanup):
@@ -5479,64 +5414,226 @@ def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
5479
5414
  _validate_args(ips, ssh_user, ssh_key_path, cleanup)
5480
5415
 
5481
5416
  # If remote deployment arguments are specified, run remote up script
5417
+ ip_list = None
5418
+ ssh_key = None
5482
5419
  if ips and ssh_user and ssh_key_path:
5483
- # Convert ips and ssh_key_path to absolute paths
5484
- ips = os.path.abspath(ips)
5485
- ssh_key_path = os.path.abspath(ssh_key_path)
5486
- _deploy_remote_cluster(ips, ssh_user, ssh_key_path, cleanup)
5487
- else:
5488
- # Run local deployment (kind) if no remote args are specified
5489
- _deploy_local_cluster(gpus)
5420
+ # Read and validate IP file
5421
+ try:
5422
+ with open(os.path.expanduser(ips), 'r', encoding='utf-8') as f:
5423
+ ip_list = f.read().strip().splitlines()
5424
+ if not ip_list:
5425
+ raise click.BadParameter(f'IP file is empty: {ips}')
5426
+ except (IOError, OSError) as e:
5427
+ raise click.BadParameter(f'Failed to read IP file {ips}: {str(e)}')
5428
+
5429
+ # Read and validate SSH key file
5430
+ try:
5431
+ with open(os.path.expanduser(ssh_key_path), 'r',
5432
+ encoding='utf-8') as f:
5433
+ ssh_key = f.read()
5434
+ if not ssh_key:
5435
+ raise click.BadParameter(
5436
+ f'SSH key file is empty: {ssh_key_path}')
5437
+ except (IOError, OSError) as e:
5438
+ raise click.BadParameter(
5439
+ f'Failed to read SSH key file {ssh_key_path}: {str(e)}')
5440
+
5441
+ request_id = sdk.local_up(gpus, ip_list, ssh_user, ssh_key, cleanup)
5442
+ _async_call_or_wait(request_id, async_call, request_name='local up')
5490
5443
 
5491
5444
 
5492
5445
  @local.command('down', cls=_DocumentedCodeCommand)
5446
+ @_add_click_options(_COMMON_OPTIONS)
5493
5447
  @usage_lib.entrypoint
5494
- def local_down():
5448
+ def local_down(async_call: bool):
5495
5449
  """Deletes a local cluster."""
5496
- cluster_removed = False
5450
+ request_id = sdk.local_down()
5451
+ _async_call_or_wait(request_id, async_call, request_name='sky.local.down')
5497
5452
 
5498
- path_to_package = os.path.dirname(os.path.dirname(__file__))
5499
- down_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes',
5500
- 'delete_cluster.sh')
5501
5453
 
5502
- cwd = os.path.dirname(os.path.abspath(down_script_path))
5503
- run_command = shlex.split(down_script_path)
5454
+ @cli.group(cls=_NaturalOrderGroup)
5455
+ def api():
5456
+ """SkyPilot API server commands."""
5457
+ pass
5504
5458
 
5505
- # Setup logging paths
5506
- run_timestamp = sky_logging.get_run_timestamp()
5507
- log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
5508
- 'local_down.log')
5509
- tail_cmd = 'tail -n100 -f ' + log_path
5510
5459
 
5511
- with rich_utils.safe_status(
5512
- ux_utils.spinner_message('Removing local cluster')):
5513
- style = colorama.Style
5514
- click.echo('To view detailed progress: '
5515
- f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
5516
- returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
5517
- log_path=log_path,
5518
- require_outputs=True,
5519
- stream_logs=False,
5520
- cwd=cwd)
5521
- stderr = stderr.replace('No kind clusters found.\n', '')
5522
-
5523
- if returncode == 0:
5524
- cluster_removed = True
5525
- elif returncode == 100:
5526
- click.echo('\nLocal cluster does not exist.')
5527
- else:
5528
- with ux_utils.print_exception_no_traceback():
5529
- raise RuntimeError(
5530
- 'Failed to create local cluster. '
5531
- f'Stdout: {stdout}'
5532
- f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
5533
- if cluster_removed:
5534
- # Run sky check
5535
- with rich_utils.safe_status(
5536
- ux_utils.spinner_message('Running sky check')):
5537
- sky_check.check(clouds=['kubernetes'], quiet=True)
5538
- click.echo(
5539
- f'{colorama.Fore.GREEN}Local cluster removed.{style.RESET_ALL}')
5460
+ @api.command('start', cls=_DocumentedCodeCommand)
5461
+ @click.option('--deploy',
5462
+ type=bool,
5463
+ is_flag=True,
5464
+ default=False,
5465
+ required=False,
5466
+ help=('Deploy the SkyPilot API server. When set to True, '
5467
+ 'SkyPilot API server will use all resources on the host '
5468
+ 'machine assuming the machine is dedicated to SkyPilot API '
5469
+ 'server; host will also be set to 0.0.0.0 to allow remote '
5470
+ 'access.'))
5471
+ @click.option('--host',
5472
+ default='127.0.0.1',
5473
+ type=click.Choice(server_common.AVAILBLE_LOCAL_API_SERVER_HOSTS),
5474
+ required=False,
5475
+ help=('The host to deploy the SkyPilot API server. To allow '
5476
+ 'remote access, set this to 0.0.0.0'))
5477
+ @usage_lib.entrypoint
5478
+ def api_start(deploy: bool, host: Optional[str]):
5479
+ """Starts the SkyPilot API server locally."""
5480
+ sdk.api_start(deploy=deploy, host=host)
5481
+
5482
+
5483
+ @api.command('stop', cls=_DocumentedCodeCommand)
5484
+ @usage_lib.entrypoint
5485
+ def api_stop():
5486
+ """Stops the SkyPilot API server locally."""
5487
+ sdk.api_stop()
5488
+
5489
+
5490
+ @api.command('logs', cls=_DocumentedCodeCommand)
5491
+ @click.argument('request_id', required=False, type=str)
5492
+ @click.option('--server-logs',
5493
+ is_flag=True,
5494
+ default=False,
5495
+ required=False,
5496
+ help='Stream the server logs.')
5497
+ @click.option('--log-path',
5498
+ '-l',
5499
+ required=False,
5500
+ type=str,
5501
+ help='The path to the log file to stream.')
5502
+ @click.option('--tail',
5503
+ required=False,
5504
+ type=int,
5505
+ help=('Number of lines to show from the end of the logs. '
5506
+ '(default: None)'))
5507
+ @click.option('--follow/--no-follow',
5508
+ is_flag=True,
5509
+ default=True,
5510
+ required=False,
5511
+ help='Follow the logs.')
5512
+ @usage_lib.entrypoint
5513
+ def api_logs(request_id: Optional[str], server_logs: bool,
5514
+ log_path: Optional[str], tail: Optional[int], follow: bool):
5515
+ """Stream the logs of a request running on SkyPilot API server."""
5516
+ if not server_logs and request_id is None and log_path is None:
5517
+ # TODO(zhwu): get the latest request ID.
5518
+ raise click.BadParameter('Please provide the request ID or log path.')
5519
+ if server_logs:
5520
+ sdk.api_server_logs(follow=follow, tail=tail)
5521
+ return
5522
+
5523
+ if request_id is not None and log_path is not None:
5524
+ raise click.BadParameter(
5525
+ 'Only one of request ID and log path can be provided.')
5526
+ sdk.stream_and_get(request_id, log_path, tail)
5527
+
5528
+
5529
+ @api.command('cancel', cls=_DocumentedCodeCommand)
5530
+ @click.argument('request_ids', required=False, type=str, nargs=-1)
5531
+ @click.option('--all',
5532
+ '-a',
5533
+ is_flag=True,
5534
+ default=False,
5535
+ required=False,
5536
+ help='Cancel all your requests.')
5537
+ @click.option('--all-users',
5538
+ '-u',
5539
+ is_flag=True,
5540
+ default=False,
5541
+ required=False,
5542
+ help='Cancel all requests from all users.')
5543
+ @usage_lib.entrypoint
5544
+ # pylint: disable=redefined-builtin
5545
+ def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
5546
+ """Cancel a request running on SkyPilot API server."""
5547
+ if all or all_users:
5548
+ keyword = 'ALL USERS\'' if all_users else 'YOUR'
5549
+ user_input = click.prompt(
5550
+ f'This will cancel all {keyword} requests.\n'
5551
+ f'To proceed, please type {colorama.Style.BRIGHT}'
5552
+ f'\'cancel all requests\'{colorama.Style.RESET_ALL}',
5553
+ type=str)
5554
+ if user_input != 'cancel all requests':
5555
+ raise click.Abort()
5556
+ if all:
5557
+ request_ids = None
5558
+ cancelled_request_ids = sdk.get(
5559
+ sdk.api_cancel(request_ids=request_ids, all_users=all_users))
5560
+ if not cancelled_request_ids:
5561
+ click.secho('No requests need to be cancelled.', fg='green')
5562
+ elif len(cancelled_request_ids) == 1:
5563
+ click.secho(f'Cancelled 1 request: {cancelled_request_ids[0]}',
5564
+ fg='green')
5565
+ else:
5566
+ click.secho(f'Cancelled {len(cancelled_request_ids)} requests.',
5567
+ fg='green')
5568
+
5569
+
5570
+ @api.command('status', cls=_DocumentedCodeCommand)
5571
+ @click.argument('request_ids', required=False, type=str, nargs=-1)
5572
+ @click.option('--all-status',
5573
+ '-a',
5574
+ is_flag=True,
5575
+ default=False,
5576
+ required=False,
5577
+ help='Show requests of all statuses.')
5578
+ @click.option('--verbose',
5579
+ '-v',
5580
+ is_flag=True,
5581
+ default=False,
5582
+ required=False,
5583
+ help='Show more details.')
5584
+ @usage_lib.entrypoint
5585
+ # pylint: disable=redefined-builtin
5586
+ def api_status(request_ids: Optional[List[str]], all_status: bool,
5587
+ verbose: bool):
5588
+ """List requests on SkyPilot API server."""
5589
+ if not request_ids:
5590
+ request_ids = None
5591
+ request_list = sdk.api_status(request_ids, all_status)
5592
+ columns = ['ID', 'User', 'Name']
5593
+ if verbose:
5594
+ columns.append('Cluster')
5595
+ columns.extend(['Created', 'Status'])
5596
+ table = log_utils.create_table(columns)
5597
+ for request in request_list:
5598
+ r_id = request.request_id
5599
+ if not verbose:
5600
+ r_id = common_utils.truncate_long_string(r_id, 36)
5601
+ req_status = requests.RequestStatus(request.status)
5602
+ row = [r_id, request.user_name, request.name]
5603
+ if verbose:
5604
+ row.append(request.cluster_name)
5605
+ row.extend([
5606
+ log_utils.readable_time_duration(request.created_at),
5607
+ req_status.colored_str()
5608
+ ])
5609
+ table.add_row(row)
5610
+ click.echo(table)
5611
+
5612
+
5613
+ @api.command('login', cls=_DocumentedCodeCommand)
5614
+ @click.option('--endpoint',
5615
+ '-e',
5616
+ required=False,
5617
+ help='The SkyPilot API server endpoint.')
5618
+ @usage_lib.entrypoint
5619
+ def api_login(endpoint: Optional[str]):
5620
+ """Logs into a SkyPilot API server."""
5621
+ sdk.api_login(endpoint)
5622
+
5623
+
5624
+ @api.command('info', cls=_DocumentedCodeCommand)
5625
+ @usage_lib.entrypoint
5626
+ def api_info():
5627
+ """Shows the SkyPilot API server URL."""
5628
+ url = server_common.get_server_url()
5629
+ api_server_info = sdk.api_info()
5630
+ user_name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
5631
+ user_hash = common_utils.get_user_hash()
5632
+ click.echo(f'Using SkyPilot API server: {url}\n'
5633
+ f'{ux_utils.INDENT_SYMBOL}Status: {api_server_info["status"]}, '
5634
+ f'commit: {api_server_info["commit"]}, '
5635
+ f'version: {api_server_info["version"]}\n'
5636
+ f'{ux_utils.INDENT_LAST_SYMBOL}User: {user_name} ({user_hash})')
5540
5637
 
5541
5638
 
5542
5639
  def main():