skypilot-nightly 1.0.0.dev20250215__py3-none-any.whl → 1.0.0.dev20250217__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +48 -22
- sky/adaptors/aws.py +2 -1
- sky/adaptors/azure.py +4 -4
- sky/adaptors/cloudflare.py +4 -4
- sky/adaptors/kubernetes.py +8 -8
- sky/authentication.py +42 -45
- sky/backends/backend.py +2 -2
- sky/backends/backend_utils.py +108 -221
- sky/backends/cloud_vm_ray_backend.py +283 -282
- sky/benchmark/benchmark_utils.py +6 -2
- sky/check.py +40 -28
- sky/cli.py +1213 -1116
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5644 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1757 -0
- sky/cloud_stores.py +12 -6
- sky/clouds/__init__.py +0 -2
- sky/clouds/aws.py +20 -13
- sky/clouds/azure.py +5 -3
- sky/clouds/cloud.py +1 -1
- sky/clouds/cudo.py +2 -1
- sky/clouds/do.py +2 -1
- sky/clouds/fluidstack.py +3 -2
- sky/clouds/gcp.py +10 -8
- sky/clouds/ibm.py +8 -7
- sky/clouds/kubernetes.py +7 -6
- sky/clouds/lambda_cloud.py +8 -7
- sky/clouds/oci.py +4 -3
- sky/clouds/paperspace.py +2 -1
- sky/clouds/runpod.py +2 -1
- sky/clouds/scp.py +8 -7
- sky/clouds/service_catalog/__init__.py +3 -3
- sky/clouds/service_catalog/aws_catalog.py +7 -1
- sky/clouds/service_catalog/common.py +4 -2
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +2 -2
- sky/clouds/utils/oci_utils.py +1 -1
- sky/clouds/vast.py +2 -1
- sky/clouds/vsphere.py +2 -1
- sky/core.py +263 -99
- sky/dag.py +4 -0
- sky/data/mounting_utils.py +2 -1
- sky/data/storage.py +97 -35
- sky/data/storage_utils.py +69 -9
- sky/exceptions.py +138 -5
- sky/execution.py +47 -50
- sky/global_user_state.py +105 -22
- sky/jobs/__init__.py +12 -14
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +296 -0
- sky/jobs/constants.py +30 -1
- sky/jobs/controller.py +12 -6
- sky/jobs/dashboard/dashboard.py +2 -6
- sky/jobs/recovery_strategy.py +22 -29
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/{core.py → server/core.py} +101 -34
- sky/jobs/server/dashboard_utils.py +64 -0
- sky/jobs/server/server.py +182 -0
- sky/jobs/utils.py +32 -23
- sky/models.py +27 -0
- sky/optimizer.py +9 -11
- sky/provision/__init__.py +6 -3
- sky/provision/aws/config.py +2 -2
- sky/provision/aws/instance.py +1 -1
- sky/provision/azure/instance.py +1 -1
- sky/provision/cudo/instance.py +1 -1
- sky/provision/do/instance.py +1 -1
- sky/provision/do/utils.py +0 -5
- sky/provision/fluidstack/fluidstack_utils.py +4 -3
- sky/provision/fluidstack/instance.py +4 -2
- sky/provision/gcp/instance.py +1 -1
- sky/provision/instance_setup.py +2 -2
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +1 -1
- sky/provision/kubernetes/utils.py +67 -76
- sky/provision/lambda_cloud/instance.py +3 -15
- sky/provision/logging.py +1 -1
- sky/provision/oci/instance.py +7 -4
- sky/provision/paperspace/instance.py +1 -1
- sky/provision/provisioner.py +3 -2
- sky/provision/runpod/instance.py +1 -1
- sky/provision/vast/instance.py +1 -1
- sky/provision/vast/utils.py +2 -1
- sky/provision/vsphere/instance.py +2 -11
- sky/resources.py +55 -40
- sky/serve/__init__.py +6 -10
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +3 -0
- sky/serve/replica_managers.py +10 -10
- sky/serve/serve_utils.py +56 -36
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +37 -17
- sky/serve/server/server.py +117 -0
- sky/serve/service.py +8 -1
- sky/server/__init__.py +1 -0
- sky/server/common.py +441 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +462 -0
- sky/server/requests/payloads.py +481 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1095 -0
- sky/server/stream_utils.py +144 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/setup_files/dependencies.py +12 -4
- sky/setup_files/setup.py +1 -1
- sky/sky_logging.py +9 -13
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +46 -12
- sky/skylet/events.py +5 -6
- sky/skylet/job_lib.py +78 -66
- sky/skylet/log_lib.py +17 -11
- sky/skypilot_config.py +79 -94
- sky/task.py +119 -73
- sky/templates/aws-ray.yml.j2 +4 -4
- sky/templates/azure-ray.yml.j2 +3 -2
- sky/templates/cudo-ray.yml.j2 +3 -2
- sky/templates/fluidstack-ray.yml.j2 +3 -2
- sky/templates/gcp-ray.yml.j2 +3 -2
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +1 -12
- sky/templates/kubernetes-ray.yml.j2 +3 -2
- sky/templates/lambda-ray.yml.j2 +3 -2
- sky/templates/oci-ray.yml.j2 +3 -2
- sky/templates/paperspace-ray.yml.j2 +3 -2
- sky/templates/runpod-ray.yml.j2 +3 -2
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vsphere-ray.yml.j2 +4 -2
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +8 -0
- sky/usage/usage_lib.py +45 -11
- sky/utils/accelerator_registry.py +33 -53
- sky/utils/admin_policy_utils.py +2 -1
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +33 -3
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +69 -14
- sky/utils/common.py +74 -0
- sky/utils/common_utils.py +133 -93
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +2 -3
- sky/utils/controller_utils.py +133 -147
- sky/utils/dag_utils.py +72 -24
- sky/utils/kubernetes/deploy_remote_cluster.sh +2 -2
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/log_utils.py +83 -23
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +2 -2
- sky/utils/rich_utils.py +213 -34
- sky/utils/schemas.py +19 -2
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +51 -35
- sky/utils/timeline.py +7 -2
- sky/utils/ux_utils.py +95 -25
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/METADATA +8 -3
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/RECORD +170 -132
- sky/clouds/cloud_registry.py +0 -76
- sky/utils/cluster_yaml_utils.py +0 -24
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250215.dist-info → skypilot_nightly-1.0.0.dev20250217.dist-info}/top_level.txt +0 -0
sky/cli.py
CHANGED
@@ -26,62 +26,66 @@ each other.
|
|
26
26
|
import copy
|
27
27
|
import datetime
|
28
28
|
import functools
|
29
|
-
import
|
29
|
+
import getpass
|
30
30
|
import os
|
31
31
|
import shlex
|
32
32
|
import shutil
|
33
|
-
import signal
|
34
33
|
import subprocess
|
35
34
|
import sys
|
36
35
|
import textwrap
|
37
|
-
import
|
36
|
+
import traceback
|
38
37
|
import typing
|
39
|
-
from typing import Any, Dict, List, Optional, Tuple, Union
|
40
|
-
import webbrowser
|
38
|
+
from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
|
41
39
|
|
42
40
|
import click
|
43
41
|
import colorama
|
44
42
|
import dotenv
|
43
|
+
import requests as requests_lib
|
45
44
|
from rich import progress as rich_progress
|
46
45
|
import yaml
|
47
46
|
|
48
47
|
import sky
|
49
|
-
from sky import admin_policy
|
50
48
|
from sky import backends
|
51
|
-
from sky import
|
52
|
-
from sky import clouds as sky_clouds
|
53
|
-
from sky import core
|
49
|
+
from sky import clouds
|
54
50
|
from sky import exceptions
|
55
51
|
from sky import global_user_state
|
56
52
|
from sky import jobs as managed_jobs
|
53
|
+
from sky import models
|
57
54
|
from sky import serve as serve_lib
|
58
55
|
from sky import sky_logging
|
59
|
-
from sky import status_lib
|
60
56
|
from sky.adaptors import common as adaptors_common
|
61
|
-
from sky.backends import backend_utils
|
62
57
|
from sky.benchmark import benchmark_state
|
63
58
|
from sky.benchmark import benchmark_utils
|
59
|
+
from sky.client import sdk
|
64
60
|
from sky.clouds import service_catalog
|
65
61
|
from sky.data import storage_utils
|
62
|
+
from sky.provision.kubernetes import constants as kubernetes_constants
|
66
63
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
64
|
+
from sky.server import common as server_common
|
65
|
+
from sky.server import constants as server_constants
|
66
|
+
from sky.server.requests import requests
|
67
67
|
from sky.skylet import constants
|
68
68
|
from sky.skylet import job_lib
|
69
|
-
from sky.skylet import log_lib
|
70
69
|
from sky.usage import usage_lib
|
71
|
-
from sky.utils import
|
70
|
+
from sky.utils import annotations
|
71
|
+
from sky.utils import cluster_utils
|
72
|
+
from sky.utils import common
|
72
73
|
from sky.utils import common_utils
|
73
74
|
from sky.utils import controller_utils
|
74
75
|
from sky.utils import dag_utils
|
76
|
+
from sky.utils import env_options
|
75
77
|
from sky.utils import log_utils
|
78
|
+
from sky.utils import registry
|
76
79
|
from sky.utils import resources_utils
|
77
80
|
from sky.utils import rich_utils
|
81
|
+
from sky.utils import status_lib
|
78
82
|
from sky.utils import subprocess_utils
|
79
83
|
from sky.utils import timeline
|
80
84
|
from sky.utils import ux_utils
|
81
85
|
from sky.utils.cli_utils import status_utils
|
82
86
|
|
83
87
|
if typing.TYPE_CHECKING:
|
84
|
-
|
88
|
+
import types
|
85
89
|
|
86
90
|
pd = adaptors_common.LazyImport('pandas')
|
87
91
|
logger = sky_logging.init_logger(__name__)
|
@@ -101,23 +105,92 @@ _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = (
|
|
101
105
|
'{cluster_num} cluster{plural} {verb}. Please specify {cause} '
|
102
106
|
'cluster to show its {property}.\nUsage: `sky status --{flag} <cluster>`')
|
103
107
|
|
104
|
-
_ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, '
|
105
|
-
'please retry after a while.')
|
106
|
-
|
107
108
|
_DAG_NOT_SUPPORTED_MESSAGE = ('YAML specifies a DAG which is only supported by '
|
108
109
|
'`sky jobs launch`. `{command}` supports a '
|
109
110
|
'single task only.')
|
110
111
|
|
111
112
|
|
112
|
-
def
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
|
113
|
+
def _get_cluster_records_and_set_ssh_config(
|
114
|
+
clusters: Optional[List[str]],
|
115
|
+
refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
|
116
|
+
all_users: bool = False,
|
117
|
+
) -> List[dict]:
|
118
|
+
"""Returns a list of clusters that match the glob pattern.
|
119
|
+
|
120
|
+
Args:
|
121
|
+
clusters: A list of cluster names to query. If None, query all clusters.
|
122
|
+
refresh: The refresh mode for the status command.
|
123
|
+
all_users: Whether to query clusters from all users.
|
124
|
+
If clusters is not None, this field is ignored because cluster list
|
125
|
+
can include other users' clusters.
|
126
|
+
"""
|
127
|
+
# TODO(zhwu): we should move this function into SDK.
|
128
|
+
# TODO(zhwu): this additional RTT makes CLIs slow. We should optimize this.
|
129
|
+
if clusters is not None:
|
130
|
+
all_users = True
|
131
|
+
request_id = sdk.status(clusters, refresh=refresh, all_users=all_users)
|
132
|
+
cluster_records = sdk.stream_and_get(request_id)
|
133
|
+
# Update the SSH config for all clusters
|
134
|
+
for record in cluster_records:
|
135
|
+
handle = record['handle']
|
136
|
+
if handle is not None and handle.cached_external_ips is not None:
|
137
|
+
credentials = record['credentials']
|
138
|
+
if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
|
139
|
+
# Replace the proxy command to proxy through the SkyPilot API
|
140
|
+
# server with websocket.
|
141
|
+
key_path = (
|
142
|
+
cluster_utils.SSHConfigHelper.generate_local_key_file(
|
143
|
+
handle.cluster_name, credentials))
|
144
|
+
# Instead of directly use websocket_proxy.py, we add an
|
145
|
+
# additional proxy, so that ssh can use the head pod in the
|
146
|
+
# cluster to jump to worker pods.
|
147
|
+
proxy_command = (
|
148
|
+
f'ssh -tt -i {key_path} '
|
149
|
+
'-o StrictHostKeyChecking=no '
|
150
|
+
'-o UserKnownHostsFile=/dev/null '
|
151
|
+
'-o IdentitiesOnly=yes '
|
152
|
+
'-W %h:%p '
|
153
|
+
f'{handle.ssh_user}@127.0.0.1 '
|
154
|
+
'-o ProxyCommand='
|
155
|
+
# TODO(zhwu): write the template to a temp file, don't use
|
156
|
+
# the one in skypilot repo, to avoid changing the file when
|
157
|
+
# updating skypilot.
|
158
|
+
f'\'{sys.executable} {sky.__root_dir__}/templates/'
|
159
|
+
f'websocket_proxy.py '
|
160
|
+
f'{server_common.get_server_url().split("://")[1]} '
|
161
|
+
f'{handle.cluster_name}\'')
|
162
|
+
credentials['ssh_proxy_command'] = proxy_command
|
163
|
+
cluster_utils.SSHConfigHelper.add_cluster(
|
164
|
+
handle.cluster_name,
|
165
|
+
handle.cached_external_ips,
|
166
|
+
credentials,
|
167
|
+
handle.cached_external_ssh_ports,
|
168
|
+
handle.docker_user,
|
169
|
+
handle.ssh_user,
|
170
|
+
)
|
171
|
+
else:
|
172
|
+
# If the cluster is not UP or does not have IPs, we need to remove
|
173
|
+
# the cluster from the SSH config.
|
174
|
+
cluster_utils.SSHConfigHelper.remove_cluster(handle.cluster_name)
|
175
|
+
|
176
|
+
# Clean up SSH configs for clusters that do not exist.
|
177
|
+
#
|
178
|
+
# We do this in a conservative way: only when a query is made for all users
|
179
|
+
# or specific clusters. Without those, the table returned only contains the
|
180
|
+
# current user's clusters, and the information is not enough for
|
181
|
+
# removing clusters, because SkyPilot has no idea whether to remove
|
182
|
+
# ssh config of a cluster from another user.
|
183
|
+
clusters_exists = set(record['name'] for record in cluster_records)
|
184
|
+
if clusters is not None:
|
185
|
+
for cluster in clusters:
|
186
|
+
if cluster not in clusters_exists:
|
187
|
+
cluster_utils.SSHConfigHelper.remove_cluster(cluster)
|
188
|
+
elif all_users:
|
189
|
+
for cluster_name in cluster_utils.SSHConfigHelper.list_cluster_names():
|
190
|
+
if cluster_name not in clusters_exists:
|
191
|
+
cluster_utils.SSHConfigHelper.remove_cluster(cluster_name)
|
192
|
+
|
193
|
+
return cluster_records
|
121
194
|
|
122
195
|
|
123
196
|
def _get_glob_storages(storages: List[str]) -> List[str]:
|
@@ -147,6 +220,44 @@ def _parse_env_var(env_var: str) -> Tuple[str, str]:
|
|
147
220
|
return ret[0], ret[1]
|
148
221
|
|
149
222
|
|
223
|
+
def _async_call_or_wait(request_id: str, async_call: bool,
|
224
|
+
request_name: str) -> Any:
|
225
|
+
short_request_id = request_id[:8]
|
226
|
+
if not async_call:
|
227
|
+
try:
|
228
|
+
return sdk.stream_and_get(request_id)
|
229
|
+
except KeyboardInterrupt:
|
230
|
+
logger.info(
|
231
|
+
ux_utils.starting_message('Request will continue running '
|
232
|
+
'asynchronously.') +
|
233
|
+
f'\n{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}View logs: '
|
234
|
+
f'{ux_utils.BOLD}sky api logs {short_request_id}'
|
235
|
+
f'{colorama.Style.RESET_ALL}'
|
236
|
+
f'\n{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}Or, '
|
237
|
+
'visit: '
|
238
|
+
f'{server_common.get_server_url()}/api/stream?'
|
239
|
+
f'request_id={short_request_id}'
|
240
|
+
f'\n{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}To cancel '
|
241
|
+
'the request, run: '
|
242
|
+
f'{ux_utils.BOLD}sky api cancel {short_request_id}'
|
243
|
+
f'{colorama.Style.RESET_ALL}'
|
244
|
+
f'\n{colorama.Style.RESET_ALL}')
|
245
|
+
raise
|
246
|
+
else:
|
247
|
+
click.secho(f'Submitted {request_name} request: {request_id}',
|
248
|
+
fg='green')
|
249
|
+
click.echo(
|
250
|
+
f'{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}Check logs with: '
|
251
|
+
f'sky api logs {short_request_id}{colorama.Style.RESET_ALL}\n'
|
252
|
+
f'{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}Or, visit: '
|
253
|
+
f'{server_common.get_server_url()}/api/stream?'
|
254
|
+
f'request_id={short_request_id}'
|
255
|
+
f'\n{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}To cancel '
|
256
|
+
'the request, run: '
|
257
|
+
f'{ux_utils.BOLD}sky api cancel {short_request_id}'
|
258
|
+
f'{colorama.Style.RESET_ALL}\n')
|
259
|
+
|
260
|
+
|
150
261
|
def _merge_env_vars(env_dict: Optional[Dict[str, str]],
|
151
262
|
env_list: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
|
152
263
|
"""Merges all values from env_list into env_dict."""
|
@@ -157,6 +268,15 @@ def _merge_env_vars(env_dict: Optional[Dict[str, str]],
|
|
157
268
|
return list(env_dict.items())
|
158
269
|
|
159
270
|
|
271
|
+
_COMMON_OPTIONS = [
|
272
|
+
click.option('--async/--no-async',
|
273
|
+
'async_call',
|
274
|
+
required=False,
|
275
|
+
is_flag=True,
|
276
|
+
default=False,
|
277
|
+
help=('Run the command asynchronously.'))
|
278
|
+
]
|
279
|
+
|
160
280
|
_TASK_OPTIONS = [
|
161
281
|
click.option(
|
162
282
|
'--workdir',
|
@@ -308,14 +428,28 @@ def _complete_cluster_name(ctx: click.Context, param: click.Parameter,
|
|
308
428
|
incomplete: str) -> List[str]:
|
309
429
|
"""Handle shell completion for cluster names."""
|
310
430
|
del ctx, param # Unused.
|
311
|
-
|
431
|
+
# TODO(zhwu): we send requests to API server for completion, which can cause
|
432
|
+
# large latency. We should investigate caching mechanism if needed.
|
433
|
+
response = requests_lib.get(
|
434
|
+
f'{server_common.get_server_url()}'
|
435
|
+
f'/api/completion/cluster_name?incomplete={incomplete}',
|
436
|
+
timeout=2.0,
|
437
|
+
)
|
438
|
+
response.raise_for_status()
|
439
|
+
return response.json()
|
312
440
|
|
313
441
|
|
314
442
|
def _complete_storage_name(ctx: click.Context, param: click.Parameter,
|
315
443
|
incomplete: str) -> List[str]:
|
316
444
|
"""Handle shell completion for storage names."""
|
317
445
|
del ctx, param # Unused.
|
318
|
-
|
446
|
+
response = requests_lib.get(
|
447
|
+
f'{server_common.get_server_url()}'
|
448
|
+
f'/api/completion/storage_name?incomplete={incomplete}',
|
449
|
+
timeout=2.0,
|
450
|
+
)
|
451
|
+
response.raise_for_status()
|
452
|
+
return response.json()
|
319
453
|
|
320
454
|
|
321
455
|
def _complete_file_name(ctx: click.Context, param: click.Parameter,
|
@@ -493,7 +627,7 @@ def _parse_override_params(
|
|
493
627
|
if cloud.lower() == 'none':
|
494
628
|
override_params['cloud'] = None
|
495
629
|
else:
|
496
|
-
override_params['cloud'] =
|
630
|
+
override_params['cloud'] = registry.CLOUD_REGISTRY.from_str(cloud)
|
497
631
|
if region is not None:
|
498
632
|
if region.lower() == 'none':
|
499
633
|
override_params['region'] = None
|
@@ -550,99 +684,6 @@ def _parse_override_params(
|
|
550
684
|
return override_params
|
551
685
|
|
552
686
|
|
553
|
-
def _launch_with_confirm(
|
554
|
-
task: sky.Task,
|
555
|
-
backend: backends.Backend,
|
556
|
-
cluster: Optional[str],
|
557
|
-
*,
|
558
|
-
dryrun: bool,
|
559
|
-
detach_run: bool,
|
560
|
-
detach_setup: bool = False,
|
561
|
-
no_confirm: bool = False,
|
562
|
-
idle_minutes_to_autostop: Optional[int] = None,
|
563
|
-
down: bool = False, # pylint: disable=redefined-outer-name
|
564
|
-
retry_until_up: bool = False,
|
565
|
-
no_setup: bool = False,
|
566
|
-
clone_disk_from: Optional[str] = None,
|
567
|
-
fast: bool = False,
|
568
|
-
):
|
569
|
-
"""Launch a cluster with a Task."""
|
570
|
-
if cluster is None:
|
571
|
-
cluster = backend_utils.generate_cluster_name()
|
572
|
-
|
573
|
-
clone_source_str = ''
|
574
|
-
if clone_disk_from is not None:
|
575
|
-
clone_source_str = f' from the disk of {clone_disk_from!r}'
|
576
|
-
task, _ = backend_utils.check_can_clone_disk_and_override_task(
|
577
|
-
clone_disk_from, cluster, task)
|
578
|
-
|
579
|
-
with sky.Dag() as dag:
|
580
|
-
dag.add(task)
|
581
|
-
|
582
|
-
maybe_status, handle = backend_utils.refresh_cluster_status_handle(cluster)
|
583
|
-
if maybe_status is None:
|
584
|
-
# Show the optimize log before the prompt if the cluster does not exist.
|
585
|
-
try:
|
586
|
-
sky_check.get_cached_enabled_clouds_or_refresh(
|
587
|
-
raise_if_no_cloud_access=True)
|
588
|
-
except exceptions.NoCloudAccessError as e:
|
589
|
-
# Catch the exception where the public cloud is not enabled, and
|
590
|
-
# make it yellow for better visibility.
|
591
|
-
with ux_utils.print_exception_no_traceback():
|
592
|
-
raise RuntimeError(f'{colorama.Fore.YELLOW}{e}'
|
593
|
-
f'{colorama.Style.RESET_ALL}') from e
|
594
|
-
dag, _ = admin_policy_utils.apply(
|
595
|
-
dag,
|
596
|
-
request_options=admin_policy.RequestOptions(
|
597
|
-
cluster_name=cluster,
|
598
|
-
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
599
|
-
down=down,
|
600
|
-
dryrun=dryrun,
|
601
|
-
),
|
602
|
-
)
|
603
|
-
dag = sky.optimize(dag)
|
604
|
-
task = dag.tasks[0]
|
605
|
-
|
606
|
-
if handle is not None:
|
607
|
-
backend.check_resources_fit_cluster(handle, task)
|
608
|
-
|
609
|
-
confirm_shown = False
|
610
|
-
if not no_confirm:
|
611
|
-
# Prompt if (1) --cluster is None, or (2) cluster doesn't exist, or (3)
|
612
|
-
# it exists but is STOPPED.
|
613
|
-
prompt = None
|
614
|
-
if maybe_status is None:
|
615
|
-
cluster_str = '' if cluster is None else f' {cluster!r}'
|
616
|
-
prompt = (
|
617
|
-
f'Launching a new cluster{cluster_str}{clone_source_str}. '
|
618
|
-
'Proceed?')
|
619
|
-
elif maybe_status == status_lib.ClusterStatus.STOPPED:
|
620
|
-
prompt = f'Restarting the stopped cluster {cluster!r}. Proceed?'
|
621
|
-
if prompt is not None:
|
622
|
-
confirm_shown = True
|
623
|
-
click.confirm(prompt, default=True, abort=True, show_default=True)
|
624
|
-
|
625
|
-
if not confirm_shown:
|
626
|
-
click.secho('Running on cluster: ', fg='cyan', nl=False)
|
627
|
-
click.secho(cluster)
|
628
|
-
|
629
|
-
sky.launch(
|
630
|
-
dag,
|
631
|
-
dryrun=dryrun,
|
632
|
-
stream_logs=True,
|
633
|
-
cluster_name=cluster,
|
634
|
-
detach_setup=detach_setup,
|
635
|
-
detach_run=detach_run,
|
636
|
-
backend=backend,
|
637
|
-
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
638
|
-
down=down,
|
639
|
-
retry_until_up=retry_until_up,
|
640
|
-
no_setup=no_setup,
|
641
|
-
clone_disk_from=clone_disk_from,
|
642
|
-
fast=fast,
|
643
|
-
)
|
644
|
-
|
645
|
-
|
646
687
|
def _check_yaml(entrypoint: str) -> Tuple[bool, Optional[Dict[str, Any]]]:
|
647
688
|
"""Checks if entrypoint is a readable YAML file.
|
648
689
|
|
@@ -953,6 +994,7 @@ def _deprecate_and_hide_command(group, command_to_deprecate,
|
|
953
994
|
prog_name='skypilot',
|
954
995
|
message='%(prog)s, commit %(version)s',
|
955
996
|
help='Show the commit hash and exit')
|
997
|
+
@annotations.client_api
|
956
998
|
def cli():
|
957
999
|
pass
|
958
1000
|
|
@@ -973,20 +1015,9 @@ def cli():
|
|
973
1015
|
default=False,
|
974
1016
|
is_flag=True,
|
975
1017
|
help='If True, do not actually run the job.')
|
976
|
-
@click.option(
|
977
|
-
'--detach-setup',
|
978
|
-
'-s',
|
979
|
-
default=False,
|
980
|
-
is_flag=True,
|
981
|
-
help=
|
982
|
-
('If True, run setup in non-interactive mode as part of the job itself. '
|
983
|
-
'You can safely ctrl-c to detach from logging, and it will not interrupt '
|
984
|
-
'the setup process. To see the logs again after detaching, use `sky logs`.'
|
985
|
-
' To cancel setup, cancel the job via `sky cancel`. Useful for long-'
|
986
|
-
'running setup commands.'))
|
987
1018
|
@click.option(
|
988
1019
|
'--detach-run',
|
989
|
-
'-d',
|
1020
|
+
'-d/-no-d',
|
990
1021
|
default=False,
|
991
1022
|
is_flag=True,
|
992
1023
|
help=('If True, as soon as a job is submitted, return from this call '
|
@@ -994,11 +1025,13 @@ def cli():
|
|
994
1025
|
@click.option('--docker',
|
995
1026
|
'backend_name',
|
996
1027
|
flag_value=backends.LocalDockerBackend.NAME,
|
1028
|
+
default=False,
|
997
1029
|
hidden=True,
|
998
1030
|
help=('(Deprecated) Local docker support is deprecated. '
|
999
1031
|
'To run locally, create a local Kubernetes cluster with '
|
1000
1032
|
'``sky local up``.'))
|
1001
|
-
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS
|
1033
|
+
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS +
|
1034
|
+
_COMMON_OPTIONS)
|
1002
1035
|
@click.option(
|
1003
1036
|
'--idle-minutes-to-autostop',
|
1004
1037
|
'-i',
|
@@ -1067,37 +1100,36 @@ def cli():
|
|
1067
1100
|
'provisioning and setup steps.'))
|
1068
1101
|
@usage_lib.entrypoint
|
1069
1102
|
def launch(
|
1070
|
-
|
1071
|
-
|
1072
|
-
|
1073
|
-
|
1074
|
-
|
1075
|
-
|
1076
|
-
|
1077
|
-
|
1078
|
-
|
1079
|
-
|
1080
|
-
|
1081
|
-
|
1082
|
-
|
1083
|
-
|
1084
|
-
|
1085
|
-
|
1086
|
-
|
1087
|
-
|
1088
|
-
|
1089
|
-
|
1090
|
-
|
1091
|
-
|
1092
|
-
|
1093
|
-
|
1094
|
-
|
1095
|
-
|
1096
|
-
|
1097
|
-
|
1098
|
-
|
1099
|
-
|
1100
|
-
):
|
1103
|
+
entrypoint: Tuple[str, ...],
|
1104
|
+
cluster: Optional[str],
|
1105
|
+
dryrun: bool,
|
1106
|
+
detach_run: bool,
|
1107
|
+
backend_name: Optional[str],
|
1108
|
+
name: Optional[str],
|
1109
|
+
workdir: Optional[str],
|
1110
|
+
cloud: Optional[str],
|
1111
|
+
region: Optional[str],
|
1112
|
+
zone: Optional[str],
|
1113
|
+
gpus: Optional[str],
|
1114
|
+
cpus: Optional[str],
|
1115
|
+
memory: Optional[str],
|
1116
|
+
instance_type: Optional[str],
|
1117
|
+
num_nodes: Optional[int],
|
1118
|
+
use_spot: Optional[bool],
|
1119
|
+
image_id: Optional[str],
|
1120
|
+
env_file: Optional[Dict[str, str]],
|
1121
|
+
env: List[Tuple[str, str]],
|
1122
|
+
disk_size: Optional[int],
|
1123
|
+
disk_tier: Optional[str],
|
1124
|
+
ports: Tuple[str, ...],
|
1125
|
+
idle_minutes_to_autostop: Optional[int],
|
1126
|
+
down: bool, # pylint: disable=redefined-outer-name
|
1127
|
+
retry_until_up: bool,
|
1128
|
+
yes: bool,
|
1129
|
+
no_setup: bool,
|
1130
|
+
clone_disk_from: Optional[str],
|
1131
|
+
fast: bool,
|
1132
|
+
async_call: bool):
|
1101
1133
|
"""Launch a cluster or task.
|
1102
1134
|
|
1103
1135
|
If ENTRYPOINT points to a valid YAML file, it is read in as the task
|
@@ -1107,6 +1139,14 @@ def launch(
|
|
1107
1139
|
and they undergo job queue scheduling.
|
1108
1140
|
"""
|
1109
1141
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
1142
|
+
# TODO(zhwu): the current --async is a bit inconsistent with the direct
|
1143
|
+
# sky launch, as `sky api logs` does not contain the logs for the actual job
|
1144
|
+
# submitted, while the synchronous way of `sky launch` does. We should
|
1145
|
+
# consider having the job logs available in `sky api logs` as well.
|
1146
|
+
# Reason for not doing it right now: immediately tailing the logs for the
|
1147
|
+
# job can take up resources on the API server. When there are a lot of
|
1148
|
+
# `launch` submitted asynchronously, the log tailing may overwhelm the API
|
1149
|
+
# server, if the jobs are long running.
|
1110
1150
|
env = _merge_env_vars(env_file, env)
|
1111
1151
|
controller_utils.check_cluster_name_not_controller(
|
1112
1152
|
cluster, operation_str='Launching tasks on it')
|
@@ -1159,19 +1199,35 @@ def launch(
|
|
1159
1199
|
f'{colorama.Style.RESET_ALL}{colorama.Style.BRIGHT}sky serve up'
|
1160
1200
|
f'{colorama.Style.RESET_ALL}')
|
1161
1201
|
|
1162
|
-
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1166
|
-
|
1167
|
-
|
1168
|
-
|
1169
|
-
|
1170
|
-
|
1171
|
-
|
1172
|
-
|
1173
|
-
|
1174
|
-
|
1202
|
+
request_id = sdk.launch(
|
1203
|
+
task,
|
1204
|
+
dryrun=dryrun,
|
1205
|
+
cluster_name=cluster,
|
1206
|
+
backend=backend,
|
1207
|
+
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
1208
|
+
down=down,
|
1209
|
+
retry_until_up=retry_until_up,
|
1210
|
+
no_setup=no_setup,
|
1211
|
+
clone_disk_from=clone_disk_from,
|
1212
|
+
fast=fast,
|
1213
|
+
_need_confirmation=not yes,
|
1214
|
+
)
|
1215
|
+
job_id_handle = _async_call_or_wait(request_id, async_call, 'sky.launch')
|
1216
|
+
if not async_call:
|
1217
|
+
job_id, handle = job_id_handle
|
1218
|
+
if not handle:
|
1219
|
+
assert dryrun, 'handle should only be None when dryrun is true'
|
1220
|
+
return
|
1221
|
+
# Add ssh config for the cluster
|
1222
|
+
_get_cluster_records_and_set_ssh_config(
|
1223
|
+
clusters=[handle.get_cluster_name()])
|
1224
|
+
# job_id will be None if no job was submitted (e.g. no entrypoint
|
1225
|
+
# provided)
|
1226
|
+
if not detach_run and job_id is not None:
|
1227
|
+
sdk.tail_logs(handle.get_cluster_name(), job_id, follow=True)
|
1228
|
+
click.secho(
|
1229
|
+
ux_utils.command_hint_messages(ux_utils.CommandHintType.CLUSTER_JOB,
|
1230
|
+
job_id, handle.get_cluster_name()))
|
1175
1231
|
|
1176
1232
|
|
1177
1233
|
@cli.command(cls=_DocumentedCodeCommand)
|
@@ -1199,32 +1255,19 @@ def launch(
|
|
1199
1255
|
is_flag=True,
|
1200
1256
|
help=('If True, as soon as a job is submitted, return from this call '
|
1201
1257
|
'and do not stream execution logs.'))
|
1202
|
-
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS
|
1258
|
+
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS +
|
1259
|
+
_COMMON_OPTIONS)
|
1203
1260
|
@usage_lib.entrypoint
|
1204
1261
|
# pylint: disable=redefined-builtin
|
1205
|
-
def exec(
|
1206
|
-
|
1207
|
-
|
1208
|
-
|
1209
|
-
|
1210
|
-
|
1211
|
-
|
1212
|
-
|
1213
|
-
|
1214
|
-
workdir: Optional[str],
|
1215
|
-
gpus: Optional[str],
|
1216
|
-
ports: Tuple[str],
|
1217
|
-
instance_type: Optional[str],
|
1218
|
-
num_nodes: Optional[int],
|
1219
|
-
use_spot: Optional[bool],
|
1220
|
-
image_id: Optional[str],
|
1221
|
-
env_file: Optional[Dict[str, str]],
|
1222
|
-
env: List[Tuple[str, str]],
|
1223
|
-
cpus: Optional[str],
|
1224
|
-
memory: Optional[str],
|
1225
|
-
disk_size: Optional[int],
|
1226
|
-
disk_tier: Optional[str],
|
1227
|
-
):
|
1262
|
+
def exec(cluster: Optional[str], cluster_option: Optional[str],
|
1263
|
+
entrypoint: Tuple[str, ...], detach_run: bool, name: Optional[str],
|
1264
|
+
cloud: Optional[str], region: Optional[str], zone: Optional[str],
|
1265
|
+
workdir: Optional[str], gpus: Optional[str], ports: Tuple[str],
|
1266
|
+
instance_type: Optional[str], num_nodes: Optional[int],
|
1267
|
+
use_spot: Optional[bool], image_id: Optional[str],
|
1268
|
+
env_file: Optional[Dict[str, str]], env: List[Tuple[str, str]],
|
1269
|
+
cpus: Optional[str], memory: Optional[str], disk_size: Optional[int],
|
1270
|
+
disk_tier: Optional[str], async_call: bool):
|
1228
1271
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
1229
1272
|
"""Execute a task or command on an existing cluster.
|
1230
1273
|
|
@@ -1297,11 +1340,6 @@ def exec(
|
|
1297
1340
|
env = _merge_env_vars(env_file, env)
|
1298
1341
|
controller_utils.check_cluster_name_not_controller(
|
1299
1342
|
cluster, operation_str='Executing task on it')
|
1300
|
-
handle = global_user_state.get_handle_from_cluster_name(cluster)
|
1301
|
-
if handle is None:
|
1302
|
-
raise click.BadParameter(f'Cluster {cluster!r} not found. '
|
1303
|
-
'Use `sky launch` to provision first.')
|
1304
|
-
backend = backend_utils.get_backend_from_handle(handle)
|
1305
1343
|
|
1306
1344
|
task_or_dag = _make_task_or_dag_from_entrypoint_with_overrides(
|
1307
1345
|
entrypoint=entrypoint,
|
@@ -1331,21 +1369,21 @@ def exec(
|
|
1331
1369
|
|
1332
1370
|
click.secho('Submitting job to cluster: ', fg='cyan', nl=False)
|
1333
1371
|
click.secho(cluster)
|
1334
|
-
|
1372
|
+
request_id = sdk.exec(task, cluster_name=cluster)
|
1373
|
+
job_id_handle = _async_call_or_wait(request_id, async_call, 'sky.exec')
|
1374
|
+
if not async_call and not detach_run:
|
1375
|
+
job_id, _ = job_id_handle
|
1376
|
+
sdk.tail_logs(cluster, job_id, follow=True)
|
1335
1377
|
|
1336
1378
|
|
1337
|
-
def
|
1338
|
-
|
1339
|
-
skip_finished: bool,
|
1379
|
+
def _handle_jobs_queue_request(
|
1380
|
+
request_id: str,
|
1340
1381
|
show_all: bool,
|
1341
1382
|
limit_num_jobs_to_show: bool = False,
|
1342
1383
|
is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
|
1343
1384
|
"""Get the in-progress managed jobs.
|
1344
1385
|
|
1345
1386
|
Args:
|
1346
|
-
refresh: Query the latest statuses, restarting the jobs controller if
|
1347
|
-
stopped.
|
1348
|
-
skip_finished: Show only in-progress jobs.
|
1349
1387
|
show_all: Show all information of each job (e.g., region, price).
|
1350
1388
|
limit_num_jobs_to_show: If True, limit the number of jobs to show to
|
1351
1389
|
_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS, which is mainly used by
|
@@ -1359,14 +1397,13 @@ def _get_managed_jobs(
|
|
1359
1397
|
msg contains the error message. Otherwise, msg contains the formatted
|
1360
1398
|
managed job table.
|
1361
1399
|
"""
|
1400
|
+
# TODO(SKY-980): remove unnecessary fallbacks on the client side.
|
1362
1401
|
num_in_progress_jobs = None
|
1402
|
+
msg = ''
|
1363
1403
|
try:
|
1364
1404
|
if not is_called_by_user:
|
1365
1405
|
usage_lib.messages.usage.set_internal()
|
1366
|
-
|
1367
|
-
# Make the call silent
|
1368
|
-
managed_jobs_ = managed_jobs.queue(refresh=refresh,
|
1369
|
-
skip_finished=skip_finished)
|
1406
|
+
managed_jobs_ = sdk.get(request_id)
|
1370
1407
|
num_in_progress_jobs = len(set(job['job_id'] for job in managed_jobs_))
|
1371
1408
|
except exceptions.ClusterNotUpError as e:
|
1372
1409
|
controller_status = e.cluster_status
|
@@ -1379,16 +1416,18 @@ def _get_managed_jobs(
|
|
1379
1416
|
msg += (f' (See finished managed jobs: {colorama.Style.BRIGHT}'
|
1380
1417
|
f'sky jobs queue --refresh{colorama.Style.RESET_ALL})')
|
1381
1418
|
except RuntimeError as e:
|
1382
|
-
msg = ''
|
1383
1419
|
try:
|
1384
1420
|
# Check the controller status again, as the RuntimeError is likely
|
1385
1421
|
# due to the controller being autostopped when querying the jobs.
|
1386
1422
|
controller_type = controller_utils.Controllers.JOBS_CONTROLLER
|
1387
|
-
|
1388
|
-
|
1389
|
-
|
1390
|
-
|
1391
|
-
|
1423
|
+
# Query status of the controller cluster. We add a wildcard because
|
1424
|
+
# the controller cluster name can have a suffix like
|
1425
|
+
# '-remote-<hash>' when using remote API server.
|
1426
|
+
records = sdk.get(
|
1427
|
+
sdk.status(
|
1428
|
+
cluster_names=[controller_type.value.cluster_name + '*']))
|
1429
|
+
if (not records or
|
1430
|
+
records[0]['status'] == status_lib.ClusterStatus.STOPPED):
|
1392
1431
|
msg = controller_type.value.default_hint_if_non_existent
|
1393
1432
|
except Exception: # pylint: disable=broad-except
|
1394
1433
|
# This is to an best effort to find the latest controller status to
|
@@ -1402,8 +1441,12 @@ def _get_managed_jobs(
|
|
1402
1441
|
f'Details: {common_utils.format_exception(e, use_bracket=True)}'
|
1403
1442
|
)
|
1404
1443
|
except Exception as e: # pylint: disable=broad-except
|
1405
|
-
msg =
|
1406
|
-
|
1444
|
+
msg = ''
|
1445
|
+
if env_options.Options.SHOW_DEBUG_INFO.get():
|
1446
|
+
msg += traceback.format_exc()
|
1447
|
+
msg += '\n'
|
1448
|
+
msg += ('Failed to query managed jobs: '
|
1449
|
+
f'{common_utils.format_exception(e, use_bracket=True)}')
|
1407
1450
|
else:
|
1408
1451
|
max_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS
|
1409
1452
|
if limit_num_jobs_to_show else None)
|
@@ -1413,10 +1456,12 @@ def _get_managed_jobs(
|
|
1413
1456
|
return num_in_progress_jobs, msg
|
1414
1457
|
|
1415
1458
|
|
1416
|
-
def
|
1417
|
-
|
1418
|
-
|
1419
|
-
|
1459
|
+
def _handle_services_request(
|
1460
|
+
request_id: str,
|
1461
|
+
service_names: Optional[List[str]],
|
1462
|
+
show_all: bool,
|
1463
|
+
show_endpoint: bool,
|
1464
|
+
is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
|
1420
1465
|
"""Get service statuses.
|
1421
1466
|
|
1422
1467
|
Args:
|
@@ -1435,12 +1480,8 @@ def _get_services(service_names: Optional[List[str]],
|
|
1435
1480
|
try:
|
1436
1481
|
if not is_called_by_user:
|
1437
1482
|
usage_lib.messages.usage.set_internal()
|
1438
|
-
|
1439
|
-
|
1440
|
-
# Change empty list to None
|
1441
|
-
service_names = None
|
1442
|
-
service_records = serve_lib.status(service_names)
|
1443
|
-
num_services = len(service_records)
|
1483
|
+
service_records = sdk.get(request_id)
|
1484
|
+
num_services = len(service_records)
|
1444
1485
|
except exceptions.ClusterNotUpError as e:
|
1445
1486
|
controller_status = e.cluster_status
|
1446
1487
|
msg = str(e)
|
@@ -1454,11 +1495,14 @@ def _get_services(service_names: Optional[List[str]],
|
|
1454
1495
|
# due to the controller being autostopped when querying the
|
1455
1496
|
# services.
|
1456
1497
|
controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
|
1457
|
-
|
1458
|
-
|
1459
|
-
|
1460
|
-
|
1461
|
-
|
1498
|
+
# Query status of the controller cluster. We add a wildcard because
|
1499
|
+
# the controller cluster name can have a suffix like
|
1500
|
+
# '-remote-<hash>' when using remote API server.
|
1501
|
+
records = sdk.get(
|
1502
|
+
sdk.status(
|
1503
|
+
cluster_names=[controller_type.value.cluster_name + '*']))
|
1504
|
+
if (not records or
|
1505
|
+
records[0]['status'] == status_lib.ClusterStatus.STOPPED):
|
1462
1506
|
msg = controller_type.value.default_hint_if_non_existent
|
1463
1507
|
except Exception: # pylint: disable=broad-except
|
1464
1508
|
# This is to an best effort to find the latest controller status to
|
@@ -1482,7 +1526,8 @@ def _get_services(service_names: Optional[List[str]],
|
|
1482
1526
|
f'{service_num} service{plural} found. Please specify '
|
1483
1527
|
'an existing service to show its endpoint. Usage: '
|
1484
1528
|
'sky serve status --endpoint <service-name>')
|
1485
|
-
|
1529
|
+
endpoint = service_records[0]['endpoint']
|
1530
|
+
msg = '-' if endpoint is None else endpoint
|
1486
1531
|
else:
|
1487
1532
|
msg = serve_lib.format_service_table(service_records, show_all)
|
1488
1533
|
service_not_found_msg = ''
|
@@ -1503,8 +1548,8 @@ def _status_kubernetes(show_all: bool):
|
|
1503
1548
|
Args:
|
1504
1549
|
show_all (bool): Show all job information (e.g., start time, failures).
|
1505
1550
|
"""
|
1506
|
-
all_clusters, unmanaged_clusters, all_jobs, context = (
|
1507
|
-
|
1551
|
+
all_clusters, unmanaged_clusters, all_jobs, context = (sdk.stream_and_get(
|
1552
|
+
sdk.status_kubernetes()))
|
1508
1553
|
click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
1509
1554
|
f'Kubernetes cluster state (context: {context})'
|
1510
1555
|
f'{colorama.Style.RESET_ALL}')
|
@@ -1524,9 +1569,76 @@ def _status_kubernetes(show_all: bool):
|
|
1524
1569
|
f'{colorama.Style.RESET_ALL}')
|
1525
1570
|
|
1526
1571
|
|
1572
|
+
def _show_endpoint(query_clusters: Optional[List[str]],
|
1573
|
+
cluster_records: List[Dict[str, Any]], ip: bool,
|
1574
|
+
endpoints: bool, endpoint: Optional[int]) -> None:
|
1575
|
+
show_endpoints = endpoints or endpoint is not None
|
1576
|
+
show_single_endpoint = endpoint is not None
|
1577
|
+
if len(cluster_records) != 1:
|
1578
|
+
with ux_utils.print_exception_no_traceback():
|
1579
|
+
plural = 's' if len(cluster_records) > 1 else ''
|
1580
|
+
if cluster_records:
|
1581
|
+
cluster_num = str(len(cluster_records))
|
1582
|
+
else:
|
1583
|
+
cluster_num = (f'{query_clusters[0]!r}'
|
1584
|
+
if query_clusters else 'No')
|
1585
|
+
verb = 'found' if cluster_records else 'not found'
|
1586
|
+
cause = 'a single'
|
1587
|
+
if query_clusters and len(query_clusters) > 1:
|
1588
|
+
cause = 'an existing'
|
1589
|
+
raise ValueError(
|
1590
|
+
_STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format(
|
1591
|
+
cluster_num=cluster_num,
|
1592
|
+
plural=plural,
|
1593
|
+
verb=verb,
|
1594
|
+
cause=cause,
|
1595
|
+
property='IP address' if ip else 'endpoint(s)',
|
1596
|
+
flag='ip' if ip else
|
1597
|
+
('endpoint port' if show_single_endpoint else 'endpoints')))
|
1598
|
+
|
1599
|
+
cluster_record = cluster_records[0]
|
1600
|
+
if cluster_record['status'] != status_lib.ClusterStatus.UP:
|
1601
|
+
with ux_utils.print_exception_no_traceback():
|
1602
|
+
raise RuntimeError(f'Cluster {cluster_record["name"]!r} '
|
1603
|
+
'is not in UP status.')
|
1604
|
+
handle = cluster_record['handle']
|
1605
|
+
if not isinstance(handle, backends.CloudVmRayResourceHandle):
|
1606
|
+
with ux_utils.print_exception_no_traceback():
|
1607
|
+
raise ValueError('Querying IP address is not supported '
|
1608
|
+
'for local clusters.')
|
1609
|
+
|
1610
|
+
head_ip = handle.external_ips()[0]
|
1611
|
+
# The endpoint request is relatively fast, so we don't add special handling
|
1612
|
+
# for keyboard interrupt and abort the request to avoid additional latency.
|
1613
|
+
if show_endpoints:
|
1614
|
+
if endpoint:
|
1615
|
+
request_id = sdk.endpoints(cluster_record['name'], endpoint)
|
1616
|
+
cluster_endpoints = sdk.stream_and_get(request_id)
|
1617
|
+
cluster_endpoint = cluster_endpoints.get(str(endpoint), None)
|
1618
|
+
if not cluster_endpoint:
|
1619
|
+
raise click.Abort(f'Endpoint {endpoint} not found for cluster '
|
1620
|
+
f'{cluster_record["name"]!r}.')
|
1621
|
+
click.echo(cluster_endpoint)
|
1622
|
+
else:
|
1623
|
+
request_id = sdk.endpoints(cluster_record['name'])
|
1624
|
+
cluster_endpoints = sdk.stream_and_get(request_id)
|
1625
|
+
assert isinstance(cluster_endpoints, dict)
|
1626
|
+
if not cluster_endpoints:
|
1627
|
+
raise click.Abort(f'No endpoint found for cluster '
|
1628
|
+
f'{cluster_record["name"]!r}.')
|
1629
|
+
for port, port_endpoint in cluster_endpoints.items():
|
1630
|
+
click.echo(f'{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}'
|
1631
|
+
f'{colorama.Style.RESET_ALL}: '
|
1632
|
+
f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
1633
|
+
f'{port_endpoint}{colorama.Style.RESET_ALL}')
|
1634
|
+
return
|
1635
|
+
click.echo(head_ip)
|
1636
|
+
return
|
1637
|
+
|
1638
|
+
|
1527
1639
|
@cli.command()
|
1528
|
-
@click.option('--
|
1529
|
-
'-
|
1640
|
+
@click.option('--verbose',
|
1641
|
+
'-v',
|
1530
1642
|
default=False,
|
1531
1643
|
is_flag=True,
|
1532
1644
|
required=False,
|
@@ -1582,11 +1694,19 @@ def _status_kubernetes(show_all: bool):
|
|
1582
1694
|
type=str,
|
1583
1695
|
nargs=-1,
|
1584
1696
|
**_get_shell_complete_args(_complete_cluster_name))
|
1697
|
+
@click.option('--all-users',
|
1698
|
+
'-u',
|
1699
|
+
default=False,
|
1700
|
+
is_flag=True,
|
1701
|
+
required=False,
|
1702
|
+
help='Show all clusters, including those not owned by the '
|
1703
|
+
'current user.')
|
1585
1704
|
@usage_lib.entrypoint
|
1586
1705
|
# pylint: disable=redefined-builtin
|
1587
|
-
def status(
|
1706
|
+
def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
1588
1707
|
endpoint: Optional[int], show_managed_jobs: bool,
|
1589
|
-
show_services: bool, kubernetes: bool, clusters: List[str]
|
1708
|
+
show_services: bool, kubernetes: bool, clusters: List[str],
|
1709
|
+
all_users: bool):
|
1590
1710
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
1591
1711
|
"""Show clusters.
|
1592
1712
|
|
@@ -1601,11 +1721,15 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1601
1721
|
``sky status --endpoints mycluster``. To query a single endpoint, you
|
1602
1722
|
can use ``sky status mycluster --endpoint 8888``.
|
1603
1723
|
|
1724
|
+
Running `sky status` will update the ssh config for the clusters locally, so
|
1725
|
+
that you can directly ssh into the clusters or connect to the clusters with
|
1726
|
+
vscode.
|
1727
|
+
|
1604
1728
|
The following fields for each cluster are recorded: cluster name, time
|
1605
1729
|
since last launch, resources, region, zone, hourly price, status, autostop,
|
1606
1730
|
command.
|
1607
1731
|
|
1608
|
-
Display all fields using ``sky status -
|
1732
|
+
Display all fields using ``sky status -v``.
|
1609
1733
|
|
1610
1734
|
Each cluster can have one of the following statuses:
|
1611
1735
|
|
@@ -1646,245 +1770,160 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1646
1770
|
cluster statuses from the cloud providers.
|
1647
1771
|
"""
|
1648
1772
|
if kubernetes:
|
1649
|
-
_status_kubernetes(
|
1773
|
+
_status_kubernetes(verbose)
|
1650
1774
|
return
|
1651
|
-
#
|
1652
|
-
#
|
1653
|
-
|
1654
|
-
|
1655
|
-
|
1656
|
-
|
1657
|
-
|
1658
|
-
|
1659
|
-
|
1660
|
-
|
1661
|
-
|
1662
|
-
|
1663
|
-
|
1664
|
-
|
1665
|
-
|
1666
|
-
|
1667
|
-
|
1668
|
-
|
1669
|
-
|
1670
|
-
|
1671
|
-
show_services = show_services and not clusters and not ip
|
1672
|
-
if show_services:
|
1673
|
-
# Run the sky serve service query in parallel to speed up the
|
1674
|
-
# status query.
|
1675
|
-
services_future = pool.apply_async(_get_services,
|
1676
|
-
kwds=dict(
|
1677
|
-
service_names=None,
|
1678
|
-
show_all=False,
|
1679
|
-
show_endpoint=False,
|
1680
|
-
is_called_by_user=False))
|
1681
|
-
if ip or show_endpoints:
|
1682
|
-
if refresh:
|
1683
|
-
raise click.UsageError(
|
1684
|
-
'Using --ip or --endpoint(s) with --refresh is not'
|
1685
|
-
'supported for now. To fix, refresh first, '
|
1686
|
-
'then query the IP or endpoint.')
|
1775
|
+
# Do not show job queue if user specifies clusters, and if user
|
1776
|
+
# specifies --ip or --endpoint(s).
|
1777
|
+
show_managed_jobs = show_managed_jobs and not any([clusters, ip, endpoints])
|
1778
|
+
if show_managed_jobs:
|
1779
|
+
managed_jobs_queue_request_id = managed_jobs.queue(refresh=False,
|
1780
|
+
skip_finished=True)
|
1781
|
+
show_endpoints = endpoints or endpoint is not None
|
1782
|
+
show_single_endpoint = endpoint is not None
|
1783
|
+
show_services = show_services and not any([clusters, ip, endpoints])
|
1784
|
+
if show_services:
|
1785
|
+
# Run the sky serve service query in parallel to speed up the
|
1786
|
+
# status query.
|
1787
|
+
service_status_request_id = serve_lib.status(service_names=None)
|
1788
|
+
|
1789
|
+
if ip or show_endpoints:
|
1790
|
+
if refresh:
|
1791
|
+
raise click.UsageError(
|
1792
|
+
'Using --ip or --endpoint(s) with --refresh is not'
|
1793
|
+
'supported for now. To fix, refresh first, '
|
1794
|
+
'then query the IP or endpoint.')
|
1687
1795
|
|
1688
|
-
|
1689
|
-
|
1690
|
-
|
1691
|
-
|
1692
|
-
'at the same time.')
|
1796
|
+
if ip and show_endpoints:
|
1797
|
+
with ux_utils.print_exception_no_traceback():
|
1798
|
+
raise ValueError('Cannot specify both --ip and --endpoint(s) '
|
1799
|
+
'at the same time.')
|
1693
1800
|
|
1694
|
-
|
1695
|
-
|
1696
|
-
|
1697
|
-
|
1698
|
-
|
1801
|
+
if endpoint is not None and endpoints:
|
1802
|
+
with ux_utils.print_exception_no_traceback():
|
1803
|
+
raise ValueError(
|
1804
|
+
'Cannot specify both --endpoint and --endpoints '
|
1805
|
+
'at the same time.')
|
1699
1806
|
|
1700
|
-
|
1701
|
-
|
1702
|
-
|
1703
|
-
|
1704
|
-
|
1705
|
-
|
1706
|
-
|
1707
|
-
|
1708
|
-
|
1709
|
-
|
1710
|
-
|
1711
|
-
|
1712
|
-
|
1713
|
-
|
1714
|
-
|
1807
|
+
if len(clusters) != 1:
|
1808
|
+
with ux_utils.print_exception_no_traceback():
|
1809
|
+
plural = 's' if len(clusters) > 1 else ''
|
1810
|
+
cluster_num = (str(len(clusters)) if clusters else 'No')
|
1811
|
+
cause = 'a single' if len(clusters) > 1 else 'an existing'
|
1812
|
+
raise ValueError(
|
1813
|
+
_STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format(
|
1814
|
+
cluster_num=cluster_num,
|
1815
|
+
plural=plural,
|
1816
|
+
verb='specified',
|
1817
|
+
cause=cause,
|
1818
|
+
property='IP address' if ip else 'endpoint(s)',
|
1819
|
+
flag='ip' if ip else
|
1820
|
+
('endpoint port'
|
1821
|
+
if show_single_endpoint else 'endpoints')))
|
1822
|
+
else:
|
1823
|
+
click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters'
|
1824
|
+
f'{colorama.Style.RESET_ALL}')
|
1825
|
+
query_clusters: Optional[List[str]] = None if not clusters else clusters
|
1826
|
+
refresh_mode = common.StatusRefreshMode.NONE
|
1827
|
+
if refresh:
|
1828
|
+
refresh_mode = common.StatusRefreshMode.FORCE
|
1829
|
+
cluster_records = _get_cluster_records_and_set_ssh_config(
|
1830
|
+
query_clusters, refresh_mode, all_users)
|
1831
|
+
|
1832
|
+
# TOOD(zhwu): setup the ssh config for status
|
1833
|
+
if ip or show_endpoints:
|
1834
|
+
_show_endpoint(query_clusters, cluster_records, ip, endpoints, endpoint)
|
1835
|
+
return
|
1836
|
+
hints = []
|
1837
|
+
normal_clusters = []
|
1838
|
+
controllers = []
|
1839
|
+
for cluster_record in cluster_records:
|
1840
|
+
cluster_name = cluster_record['name']
|
1841
|
+
controller = controller_utils.Controllers.from_name(cluster_name)
|
1842
|
+
if controller is not None:
|
1843
|
+
controllers.append(cluster_record)
|
1715
1844
|
else:
|
1716
|
-
|
1717
|
-
f'{colorama.Style.RESET_ALL}')
|
1718
|
-
query_clusters: Optional[List[str]] = None
|
1719
|
-
if clusters:
|
1720
|
-
query_clusters = _get_glob_clusters(clusters, silent=ip)
|
1721
|
-
cluster_records = core.status(cluster_names=query_clusters,
|
1722
|
-
refresh=refresh)
|
1723
|
-
if ip or show_endpoints:
|
1724
|
-
if len(cluster_records) != 1:
|
1725
|
-
with ux_utils.print_exception_no_traceback():
|
1726
|
-
plural = 's' if len(cluster_records) > 1 else ''
|
1727
|
-
cluster_num = (str(len(cluster_records))
|
1728
|
-
if cluster_records else f'{clusters[0]!r}')
|
1729
|
-
verb = 'found' if cluster_records else 'not found'
|
1730
|
-
cause = 'a single' if len(clusters) > 1 else 'an existing'
|
1731
|
-
raise ValueError(
|
1732
|
-
_STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format(
|
1733
|
-
cluster_num=cluster_num,
|
1734
|
-
plural=plural,
|
1735
|
-
verb=verb,
|
1736
|
-
cause=cause,
|
1737
|
-
property='IP address' if ip else 'endpoint(s)',
|
1738
|
-
flag='ip' if ip else
|
1739
|
-
('endpoint port'
|
1740
|
-
if show_single_endpoint else 'endpoints')))
|
1741
|
-
|
1742
|
-
cluster_record = cluster_records[0]
|
1743
|
-
if cluster_record['status'] != status_lib.ClusterStatus.UP:
|
1744
|
-
with ux_utils.print_exception_no_traceback():
|
1745
|
-
raise RuntimeError(f'Cluster {cluster_record["name"]!r} '
|
1746
|
-
'is not in UP status.')
|
1747
|
-
handle = cluster_record['handle']
|
1748
|
-
if not isinstance(handle, backends.CloudVmRayResourceHandle):
|
1749
|
-
with ux_utils.print_exception_no_traceback():
|
1750
|
-
raise ValueError('Querying IP address is not supported '
|
1751
|
-
'for local clusters.')
|
1752
|
-
|
1753
|
-
head_ip = handle.external_ips()[0]
|
1754
|
-
if show_endpoints:
|
1755
|
-
if endpoint:
|
1756
|
-
cluster_endpoint = core.endpoints(cluster_record['name'],
|
1757
|
-
endpoint).get(
|
1758
|
-
endpoint, None)
|
1759
|
-
if not cluster_endpoint:
|
1760
|
-
raise click.Abort(
|
1761
|
-
f'Endpoint {endpoint} not found for cluster '
|
1762
|
-
f'{cluster_record["name"]!r}.')
|
1763
|
-
click.echo(cluster_endpoint)
|
1764
|
-
else:
|
1765
|
-
cluster_endpoints = core.endpoints(cluster_record['name'])
|
1766
|
-
assert isinstance(cluster_endpoints, dict)
|
1767
|
-
if not cluster_endpoints:
|
1768
|
-
raise click.Abort(f'No endpoint found for cluster '
|
1769
|
-
f'{cluster_record["name"]!r}.')
|
1770
|
-
for port, port_endpoint in cluster_endpoints.items():
|
1771
|
-
click.echo(
|
1772
|
-
f'{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}'
|
1773
|
-
f'{colorama.Style.RESET_ALL}: '
|
1774
|
-
f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
1775
|
-
f'{port_endpoint}{colorama.Style.RESET_ALL}')
|
1776
|
-
return
|
1777
|
-
click.echo(head_ip)
|
1778
|
-
return
|
1779
|
-
hints = []
|
1780
|
-
normal_clusters = []
|
1781
|
-
controllers = []
|
1782
|
-
for cluster_record in cluster_records:
|
1783
|
-
cluster_name = cluster_record['name']
|
1784
|
-
controller = controller_utils.Controllers.from_name(cluster_name)
|
1785
|
-
if controller is not None:
|
1786
|
-
controllers.append(cluster_record)
|
1787
|
-
else:
|
1788
|
-
normal_clusters.append(cluster_record)
|
1845
|
+
normal_clusters.append(cluster_record)
|
1789
1846
|
|
1790
|
-
|
1791
|
-
|
1792
|
-
|
1847
|
+
num_pending_autostop = 0
|
1848
|
+
num_pending_autostop += status_utils.show_status_table(
|
1849
|
+
normal_clusters + controllers, verbose, all_users, query_clusters)
|
1793
1850
|
|
1794
|
-
|
1795
|
-
|
1796
|
-
|
1851
|
+
managed_jobs_query_interrupted = False
|
1852
|
+
if show_managed_jobs:
|
1853
|
+
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
1854
|
+
f'Managed jobs{colorama.Style.RESET_ALL}')
|
1855
|
+
with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
|
1797
1856
|
try:
|
1798
|
-
|
1857
|
+
num_in_progress_jobs, msg = _handle_jobs_queue_request(
|
1858
|
+
managed_jobs_queue_request_id,
|
1859
|
+
show_all=False,
|
1860
|
+
limit_num_jobs_to_show=not all,
|
1861
|
+
is_called_by_user=False)
|
1799
1862
|
except KeyboardInterrupt:
|
1800
|
-
|
1801
|
-
|
1802
|
-
|
1803
|
-
|
1804
|
-
|
1805
|
-
|
1806
|
-
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
1807
|
-
f'Managed jobs{colorama.Style.RESET_ALL}')
|
1808
|
-
with rich_utils.safe_status(
|
1809
|
-
ux_utils.spinner_message('Checking managed jobs')):
|
1810
|
-
managed_jobs_query_interrupted, result = _try_get_future_result(
|
1811
|
-
managed_jobs_future)
|
1812
|
-
if managed_jobs_query_interrupted:
|
1813
|
-
# Set to -1, so that the controller is not considered
|
1814
|
-
# down, and the hint for showing sky jobs queue
|
1815
|
-
# will still be shown.
|
1816
|
-
num_in_progress_jobs = -1
|
1817
|
-
msg = 'KeyboardInterrupt'
|
1818
|
-
else:
|
1819
|
-
num_in_progress_jobs, msg = result
|
1820
|
-
|
1821
|
-
click.echo(msg)
|
1822
|
-
if num_in_progress_jobs is not None:
|
1823
|
-
# jobs controller is UP.
|
1824
|
-
job_info = ''
|
1825
|
-
if num_in_progress_jobs > 0:
|
1826
|
-
plural_and_verb = ' is'
|
1827
|
-
if num_in_progress_jobs > 1:
|
1828
|
-
plural_and_verb = 's are'
|
1829
|
-
job_info = (
|
1830
|
-
f'{num_in_progress_jobs} managed job{plural_and_verb} '
|
1831
|
-
'in progress')
|
1832
|
-
if (num_in_progress_jobs >
|
1833
|
-
_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS):
|
1834
|
-
job_info += (
|
1835
|
-
f' ({_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS} latest '
|
1836
|
-
'ones shown)')
|
1837
|
-
job_info += '. '
|
1838
|
-
hints.append(
|
1839
|
-
controller_utils.Controllers.JOBS_CONTROLLER.value.
|
1840
|
-
in_progress_hint.format(job_info=job_info))
|
1841
|
-
|
1842
|
-
if show_services:
|
1843
|
-
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
1844
|
-
f'Services{colorama.Style.RESET_ALL}')
|
1845
|
-
num_services = None
|
1846
|
-
if managed_jobs_query_interrupted:
|
1847
|
-
# The pool is terminated, so we cannot run the service query.
|
1863
|
+
sdk.api_cancel(managed_jobs_queue_request_id, silent=True)
|
1864
|
+
managed_jobs_query_interrupted = True
|
1865
|
+
# Set to -1, so that the controller is not considered
|
1866
|
+
# down, and the hint for showing sky jobs queue
|
1867
|
+
# will still be shown.
|
1868
|
+
num_in_progress_jobs = -1
|
1848
1869
|
msg = 'KeyboardInterrupt'
|
1849
|
-
else:
|
1850
|
-
with rich_utils.safe_status(
|
1851
|
-
ux_utils.spinner_message('Checking services')):
|
1852
|
-
interrupted, result = _try_get_future_result(
|
1853
|
-
services_future)
|
1854
|
-
if interrupted:
|
1855
|
-
num_services = -1
|
1856
|
-
msg = 'KeyboardInterrupt'
|
1857
|
-
else:
|
1858
|
-
num_services, msg = result
|
1859
|
-
click.echo(msg)
|
1860
|
-
if num_services is not None:
|
1861
|
-
hints.append(controller_utils.Controllers.SKY_SERVE_CONTROLLER.
|
1862
|
-
value.in_progress_hint)
|
1863
1870
|
|
1864
|
-
|
1865
|
-
|
1866
|
-
|
1867
|
-
|
1868
|
-
|
1869
|
-
|
1870
|
-
|
1871
|
-
|
1872
|
-
|
1873
|
-
|
1874
|
-
|
1875
|
-
|
1876
|
-
|
1877
|
-
|
1878
|
-
|
1879
|
-
|
1880
|
-
|
1881
|
-
|
1882
|
-
|
1883
|
-
|
1884
|
-
|
1885
|
-
|
1886
|
-
|
1887
|
-
|
1871
|
+
click.echo(msg)
|
1872
|
+
if num_in_progress_jobs is not None:
|
1873
|
+
# jobs controller is UP.
|
1874
|
+
job_info = ''
|
1875
|
+
if num_in_progress_jobs > 0:
|
1876
|
+
plural_and_verb = ' is'
|
1877
|
+
if num_in_progress_jobs > 1:
|
1878
|
+
plural_and_verb = 's are'
|
1879
|
+
job_info = (
|
1880
|
+
f'{num_in_progress_jobs} managed job{plural_and_verb} '
|
1881
|
+
'in progress')
|
1882
|
+
if num_in_progress_jobs > _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS:
|
1883
|
+
job_info += (
|
1884
|
+
f' ({_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS} latest '
|
1885
|
+
'ones shown)')
|
1886
|
+
job_info += '. '
|
1887
|
+
hints.append(
|
1888
|
+
controller_utils.Controllers.JOBS_CONTROLLER.value.
|
1889
|
+
in_progress_hint.format(job_info=job_info))
|
1890
|
+
|
1891
|
+
if show_services:
|
1892
|
+
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
1893
|
+
f'Services{colorama.Style.RESET_ALL}')
|
1894
|
+
num_services = None
|
1895
|
+
if managed_jobs_query_interrupted:
|
1896
|
+
msg = 'KeyboardInterrupt'
|
1897
|
+
else:
|
1898
|
+
with rich_utils.client_status('[cyan]Checking services[/]'):
|
1899
|
+
try:
|
1900
|
+
num_services, msg = _handle_services_request(
|
1901
|
+
service_status_request_id,
|
1902
|
+
service_names=None,
|
1903
|
+
show_all=False,
|
1904
|
+
show_endpoint=False,
|
1905
|
+
is_called_by_user=False)
|
1906
|
+
except KeyboardInterrupt:
|
1907
|
+
sdk.api_cancel(service_status_request_id, silent=True)
|
1908
|
+
num_services = -1
|
1909
|
+
msg = 'KeyboardInterrupt'
|
1910
|
+
click.echo(msg)
|
1911
|
+
if num_services is not None:
|
1912
|
+
hints.append(controller_utils.Controllers.SKY_SERVE_CONTROLLER.
|
1913
|
+
value.in_progress_hint)
|
1914
|
+
|
1915
|
+
if num_pending_autostop > 0 and not refresh:
|
1916
|
+
# Don't print this hint if there's no pending autostop or user has
|
1917
|
+
# already passed --refresh.
|
1918
|
+
plural_and_verb = ' has'
|
1919
|
+
if num_pending_autostop > 1:
|
1920
|
+
plural_and_verb = 's have'
|
1921
|
+
hints.append(f'* {num_pending_autostop} cluster{plural_and_verb} '
|
1922
|
+
'auto{stop,down} scheduled. Refresh statuses with: '
|
1923
|
+
f'{colorama.Style.BRIGHT}sky status --refresh'
|
1924
|
+
f'{colorama.Style.RESET_ALL}')
|
1925
|
+
if hints:
|
1926
|
+
click.echo('\n' + '\n'.join(hints))
|
1888
1927
|
|
1889
1928
|
|
1890
1929
|
@cli.command()
|
@@ -1893,7 +1932,7 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1893
1932
|
default=False,
|
1894
1933
|
is_flag=True,
|
1895
1934
|
required=False,
|
1896
|
-
help='Show all information
|
1935
|
+
help='Show all cluster information.')
|
1897
1936
|
@usage_lib.entrypoint
|
1898
1937
|
def cost_report(all: bool): # pylint: disable=redefined-builtin
|
1899
1938
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
@@ -1914,7 +1953,7 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin
|
|
1914
1953
|
|
1915
1954
|
- Clusters that were terminated/stopped on the cloud console.
|
1916
1955
|
"""
|
1917
|
-
cluster_records =
|
1956
|
+
cluster_records = sdk.get(sdk.cost_report())
|
1918
1957
|
|
1919
1958
|
normal_cluster_records = []
|
1920
1959
|
controllers = dict()
|
@@ -1959,7 +1998,7 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin
|
|
1959
1998
|
|
1960
1999
|
@cli.command()
|
1961
2000
|
@click.option('--all-users',
|
1962
|
-
'-
|
2001
|
+
'-u',
|
1963
2002
|
default=False,
|
1964
2003
|
is_flag=True,
|
1965
2004
|
required=False,
|
@@ -1980,17 +2019,20 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
|
|
1980
2019
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
1981
2020
|
"""Show the job queue for cluster(s)."""
|
1982
2021
|
click.secho('Fetching and parsing job queue...', fg='cyan')
|
1983
|
-
if clusters:
|
1984
|
-
|
1985
|
-
|
1986
|
-
|
1987
|
-
clusters = [c['name'] for c in cluster_infos]
|
2022
|
+
if not clusters:
|
2023
|
+
cluster_records = _get_cluster_records_and_set_ssh_config(
|
2024
|
+
None, all_users=all_users)
|
2025
|
+
clusters = [cluster['name'] for cluster in cluster_records]
|
1988
2026
|
|
1989
2027
|
unsupported_clusters = []
|
1990
|
-
|
2028
|
+
logger.info(f'Fetching job queue for {clusters}')
|
2029
|
+
job_tables = {}
|
2030
|
+
|
2031
|
+
def _get_job_queue(cluster):
|
1991
2032
|
try:
|
1992
|
-
job_table =
|
1993
|
-
|
2033
|
+
job_table = sdk.stream_and_get(
|
2034
|
+
sdk.queue(cluster, skip_finished, all_users))
|
2035
|
+
except (RuntimeError, exceptions.CommandError, ValueError,
|
1994
2036
|
exceptions.NotSupportedError, exceptions.ClusterNotUpError,
|
1995
2037
|
exceptions.CloudUserIdentityError,
|
1996
2038
|
exceptions.ClusterOwnerIdentityMismatchError) as e:
|
@@ -1999,9 +2041,14 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
|
|
1999
2041
|
click.echo(f'{colorama.Fore.YELLOW}Failed to get the job queue for '
|
2000
2042
|
f'cluster {cluster!r}.{colorama.Style.RESET_ALL}\n'
|
2001
2043
|
f' {common_utils.format_exception(e)}')
|
2002
|
-
|
2003
|
-
|
2004
|
-
|
2044
|
+
return
|
2045
|
+
job_tables[cluster] = job_lib.format_job_queue(job_table)
|
2046
|
+
|
2047
|
+
subprocess_utils.run_in_parallel(_get_job_queue, clusters)
|
2048
|
+
user_str = 'all users' if all_users else 'current user'
|
2049
|
+
for cluster, job_table in job_tables.items():
|
2050
|
+
click.echo(f'\nJob queue of {user_str} on cluster {cluster}\n'
|
2051
|
+
f'{job_table}')
|
2005
2052
|
|
2006
2053
|
if unsupported_clusters:
|
2007
2054
|
click.secho(
|
@@ -2081,25 +2128,34 @@ def logs(
|
|
2081
2128
|
job_ids = None if not job_ids else job_ids
|
2082
2129
|
|
2083
2130
|
if sync_down:
|
2084
|
-
|
2131
|
+
with rich_utils.client_status(
|
2132
|
+
ux_utils.spinner_message('Downloading logs')):
|
2133
|
+
log_local_path_dict = sdk.download_logs(cluster, job_ids)
|
2134
|
+
style = colorama.Style
|
2135
|
+
fore = colorama.Fore
|
2136
|
+
for job, log_local_path in log_local_path_dict.items():
|
2137
|
+
logger.info(f'{fore.CYAN}Job {job} logs: {log_local_path}'
|
2138
|
+
f'{style.RESET_ALL}')
|
2085
2139
|
return
|
2086
2140
|
|
2087
2141
|
assert job_ids is None or len(job_ids) <= 1, job_ids
|
2088
|
-
job_id = None
|
2142
|
+
job_id: Optional[int] = None
|
2089
2143
|
job_ids_to_query: Optional[List[int]] = None
|
2090
2144
|
if job_ids:
|
2091
2145
|
# Already check that len(job_ids) <= 1. This variable is used later
|
2092
|
-
# in
|
2093
|
-
|
2094
|
-
if not
|
2095
|
-
raise click.UsageError(f'Invalid job ID {
|
2146
|
+
# in sdk.tail_logs.
|
2147
|
+
cur_job_id = job_ids[0]
|
2148
|
+
if not cur_job_id.isdigit():
|
2149
|
+
raise click.UsageError(f'Invalid job ID {cur_job_id}. '
|
2096
2150
|
'Job ID must be integers.')
|
2097
|
-
|
2151
|
+
job_id = int(cur_job_id)
|
2152
|
+
job_ids_to_query = [int(job_ids[0])]
|
2098
2153
|
else:
|
2099
2154
|
# job_ids is either None or empty list, so it is safe to cast it here.
|
2100
2155
|
job_ids_to_query = typing.cast(Optional[List[int]], job_ids)
|
2101
2156
|
if status:
|
2102
|
-
job_statuses =
|
2157
|
+
job_statuses = sdk.stream_and_get(
|
2158
|
+
sdk.job_status(cluster, job_ids_to_query))
|
2103
2159
|
job_id = list(job_statuses.keys())[0]
|
2104
2160
|
# If job_ids is None and no job has been submitted to the cluster,
|
2105
2161
|
# it will return {None: None}.
|
@@ -2117,7 +2173,15 @@ def logs(
|
|
2117
2173
|
click.secho(f'Job {id_str}not found', fg='red')
|
2118
2174
|
sys.exit(1)
|
2119
2175
|
|
2120
|
-
|
2176
|
+
job_str = f'job {job_id}'
|
2177
|
+
if job_id is None:
|
2178
|
+
job_str = 'the last job'
|
2179
|
+
logger.info(f'{colorama.Fore.YELLOW}'
|
2180
|
+
f'Tailing logs of {job_str} on cluster {cluster!r}...'
|
2181
|
+
f'{colorama.Style.RESET_ALL}')
|
2182
|
+
|
2183
|
+
# Stream logs from the server.
|
2184
|
+
sdk.tail_logs(cluster, job_id, follow, tail=tail)
|
2121
2185
|
|
2122
2186
|
|
2123
2187
|
@cli.command()
|
@@ -2130,16 +2194,31 @@ def logs(
|
|
2130
2194
|
default=False,
|
2131
2195
|
is_flag=True,
|
2132
2196
|
required=False,
|
2133
|
-
help='Cancel all jobs on the specified cluster.'
|
2197
|
+
help='Cancel all jobs from current user on the specified cluster.'
|
2198
|
+
)
|
2199
|
+
@click.option('--all-users',
|
2200
|
+
'-u',
|
2201
|
+
default=False,
|
2202
|
+
is_flag=True,
|
2203
|
+
required=False,
|
2204
|
+
help='Cancel all jobs on the specified cluster for all users.')
|
2134
2205
|
@click.option('--yes',
|
2135
2206
|
'-y',
|
2136
2207
|
is_flag=True,
|
2137
2208
|
default=False,
|
2138
2209
|
required=False,
|
2139
2210
|
help='Skip confirmation prompt.')
|
2211
|
+
@_add_click_options(_COMMON_OPTIONS)
|
2140
2212
|
@click.argument('jobs', required=False, type=int, nargs=-1)
|
2141
2213
|
@usage_lib.entrypoint
|
2142
|
-
def cancel(
|
2214
|
+
def cancel(
|
2215
|
+
cluster: str,
|
2216
|
+
all: bool, # pylint: disable=redefined-builtin
|
2217
|
+
all_users: bool,
|
2218
|
+
jobs: List[int], # pylint: disable=redefined-outer-name
|
2219
|
+
yes: bool,
|
2220
|
+
async_call: bool,
|
2221
|
+
): # pylint: disable=redefined-builtin
|
2143
2222
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
2144
2223
|
"""Cancel job(s).
|
2145
2224
|
|
@@ -2152,30 +2231,36 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa
|
|
2152
2231
|
sky cancel cluster_name 1
|
2153
2232
|
sky cancel cluster_name 1 2 3
|
2154
2233
|
\b
|
2155
|
-
# Cancel all jobs on a cluster.
|
2234
|
+
# Cancel all your jobs on a cluster.
|
2156
2235
|
sky cancel cluster_name -a
|
2157
2236
|
\b
|
2237
|
+
# Cancel all users' jobs on a cluster.
|
2238
|
+
sky cancel cluster_name -u
|
2239
|
+
\b
|
2158
2240
|
# Cancel the latest running job on a cluster.
|
2159
2241
|
sky cancel cluster_name
|
2160
2242
|
|
2161
2243
|
Job IDs can be looked up by ``sky queue cluster_name``.
|
2162
2244
|
"""
|
2163
|
-
job_identity_str =
|
2245
|
+
job_identity_str = ''
|
2164
2246
|
job_ids_to_cancel = None
|
2165
|
-
if not jobs and not all:
|
2166
|
-
click.echo(
|
2167
|
-
|
2168
|
-
|
2247
|
+
if not jobs and not all and not all_users:
|
2248
|
+
click.echo(
|
2249
|
+
f'{colorama.Fore.YELLOW}No job IDs or --all/--all-users provided; '
|
2250
|
+
'cancelling the latest running job.'
|
2251
|
+
f'{colorama.Style.RESET_ALL}')
|
2169
2252
|
job_identity_str = 'the latest running job'
|
2253
|
+
elif all_users:
|
2254
|
+
job_identity_str = 'all users\' jobs'
|
2170
2255
|
else:
|
2171
|
-
# Cancelling specific jobs or --all.
|
2172
|
-
job_ids = ' '.join(map(str, jobs))
|
2173
|
-
plural = 's' if len(job_ids) > 1 else ''
|
2174
|
-
job_identity_str = f'job{plural} {job_ids}'
|
2175
|
-
job_ids_to_cancel = jobs
|
2176
2256
|
if all:
|
2177
|
-
job_identity_str = 'all jobs'
|
2178
|
-
|
2257
|
+
job_identity_str = 'all your jobs'
|
2258
|
+
if jobs:
|
2259
|
+
jobs_str = ' '.join(map(str, jobs))
|
2260
|
+
plural = 's' if len(jobs) > 1 else ''
|
2261
|
+
connector = ' and ' if job_identity_str else ''
|
2262
|
+
job_identity_str += f'{connector}job{plural} {jobs_str}'
|
2263
|
+
job_ids_to_cancel = jobs
|
2179
2264
|
job_identity_str += f' on cluster {cluster!r}'
|
2180
2265
|
|
2181
2266
|
if not yes:
|
@@ -2185,7 +2270,11 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa
|
|
2185
2270
|
show_default=True)
|
2186
2271
|
|
2187
2272
|
try:
|
2188
|
-
|
2273
|
+
request_id = sdk.cancel(cluster,
|
2274
|
+
all=all,
|
2275
|
+
all_users=all_users,
|
2276
|
+
job_ids=job_ids_to_cancel)
|
2277
|
+
_async_call_or_wait(request_id, async_call, 'sky.cancel')
|
2189
2278
|
except exceptions.NotSupportedError as e:
|
2190
2279
|
controller = controller_utils.Controllers.from_name(cluster)
|
2191
2280
|
assert controller is not None, cluster
|
@@ -2205,20 +2294,28 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa
|
|
2205
2294
|
**_get_shell_complete_args(_complete_cluster_name))
|
2206
2295
|
@click.option('--all',
|
2207
2296
|
'-a',
|
2208
|
-
default=
|
2297
|
+
default=False,
|
2209
2298
|
is_flag=True,
|
2210
2299
|
help='Stop all existing clusters.')
|
2300
|
+
@click.option('--all-users',
|
2301
|
+
'-u',
|
2302
|
+
default=False,
|
2303
|
+
is_flag=True,
|
2304
|
+
help='Stop all existing clusters for all users.')
|
2211
2305
|
@click.option('--yes',
|
2212
2306
|
'-y',
|
2213
2307
|
is_flag=True,
|
2214
2308
|
default=False,
|
2215
2309
|
required=False,
|
2216
2310
|
help='Skip confirmation prompt.')
|
2311
|
+
@_add_click_options(_COMMON_OPTIONS)
|
2217
2312
|
@usage_lib.entrypoint
|
2218
2313
|
def stop(
|
2219
2314
|
clusters: List[str],
|
2220
|
-
all:
|
2315
|
+
all: bool, # pylint: disable=redefined-builtin
|
2316
|
+
all_users: bool,
|
2221
2317
|
yes: bool,
|
2318
|
+
async_call: bool,
|
2222
2319
|
):
|
2223
2320
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
2224
2321
|
"""Stop cluster(s).
|
@@ -2251,8 +2348,10 @@ def stop(
|
|
2251
2348
|
"""
|
2252
2349
|
_down_or_stop_clusters(clusters,
|
2253
2350
|
apply_to_all=all,
|
2351
|
+
all_users=all_users,
|
2254
2352
|
down=False,
|
2255
|
-
no_confirm=yes
|
2353
|
+
no_confirm=yes,
|
2354
|
+
async_call=async_call)
|
2256
2355
|
|
2257
2356
|
|
2258
2357
|
@cli.command(cls=_DocumentedCodeCommand)
|
@@ -2262,9 +2361,14 @@ def stop(
|
|
2262
2361
|
**_get_shell_complete_args(_complete_cluster_name))
|
2263
2362
|
@click.option('--all',
|
2264
2363
|
'-a',
|
2265
|
-
default=
|
2364
|
+
default=False,
|
2266
2365
|
is_flag=True,
|
2267
|
-
help='
|
2366
|
+
help='Autostop all existing clusters.')
|
2367
|
+
@click.option('--all-users',
|
2368
|
+
'-u',
|
2369
|
+
default=False,
|
2370
|
+
is_flag=True,
|
2371
|
+
help='Autostop all existing clusters for all users.')
|
2268
2372
|
@click.option('--idle-minutes',
|
2269
2373
|
'-i',
|
2270
2374
|
type=int,
|
@@ -2292,14 +2396,17 @@ def stop(
|
|
2292
2396
|
default=False,
|
2293
2397
|
required=False,
|
2294
2398
|
help='Skip confirmation prompt.')
|
2399
|
+
@_add_click_options(_COMMON_OPTIONS)
|
2295
2400
|
@usage_lib.entrypoint
|
2296
2401
|
def autostop(
|
2297
2402
|
clusters: List[str],
|
2298
|
-
all:
|
2403
|
+
all: bool, # pylint: disable=redefined-builtin
|
2404
|
+
all_users: bool,
|
2299
2405
|
idle_minutes: Optional[int],
|
2300
2406
|
cancel: bool, # pylint: disable=redefined-outer-name
|
2301
2407
|
down: bool, # pylint: disable=redefined-outer-name
|
2302
2408
|
yes: bool,
|
2409
|
+
async_call: bool,
|
2303
2410
|
):
|
2304
2411
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
2305
2412
|
"""Schedule an autostop or autodown for cluster(s).
|
@@ -2352,9 +2459,11 @@ def autostop(
|
|
2352
2459
|
idle_minutes = 5
|
2353
2460
|
_down_or_stop_clusters(clusters,
|
2354
2461
|
apply_to_all=all,
|
2462
|
+
all_users=all_users,
|
2355
2463
|
down=down,
|
2356
2464
|
no_confirm=yes,
|
2357
|
-
idle_minutes_to_autostop=idle_minutes
|
2465
|
+
idle_minutes_to_autostop=idle_minutes,
|
2466
|
+
async_call=async_call)
|
2358
2467
|
|
2359
2468
|
|
2360
2469
|
@cli.command(cls=_DocumentedCodeCommand)
|
@@ -2417,16 +2526,19 @@ def autostop(
|
|
2417
2526
|
required=False,
|
2418
2527
|
help=('Force start the cluster even if it is already UP. Useful for '
|
2419
2528
|
'upgrading the SkyPilot runtime on the cluster.'))
|
2529
|
+
@_add_click_options(_COMMON_OPTIONS)
|
2420
2530
|
@usage_lib.entrypoint
|
2421
2531
|
# pylint: disable=redefined-builtin
|
2422
2532
|
def start(
|
2423
|
-
|
2424
|
-
|
2425
|
-
|
2426
|
-
|
2427
|
-
|
2428
|
-
|
2429
|
-
|
2533
|
+
clusters: List[str],
|
2534
|
+
all: bool,
|
2535
|
+
yes: bool,
|
2536
|
+
idle_minutes_to_autostop: Optional[int],
|
2537
|
+
down: bool, # pylint: disable=redefined-outer-name
|
2538
|
+
retry_until_up: bool,
|
2539
|
+
force: bool,
|
2540
|
+
async_call: bool,
|
2541
|
+
):
|
2430
2542
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
2431
2543
|
"""Restart cluster(s).
|
2432
2544
|
|
@@ -2460,12 +2572,14 @@ def start(
|
|
2460
2572
|
'--idle-minutes-to-autostop must be set if --down is set.')
|
2461
2573
|
to_start = []
|
2462
2574
|
|
2575
|
+
cluster_records = None
|
2463
2576
|
if not clusters and not all:
|
2464
2577
|
# UX: frequently users may have only 1 cluster. In this case, be smart
|
2465
2578
|
# and default to that unique choice.
|
2466
|
-
|
2467
|
-
|
2468
|
-
|
2579
|
+
all_clusters = _get_cluster_records_and_set_ssh_config(
|
2580
|
+
clusters=None, refresh=common.StatusRefreshMode.AUTO)
|
2581
|
+
if len(all_clusters) <= 1:
|
2582
|
+
cluster_records = all_clusters
|
2469
2583
|
else:
|
2470
2584
|
raise click.UsageError(
|
2471
2585
|
'`sky start` requires either a cluster name or glob '
|
@@ -2476,24 +2590,27 @@ def start(
|
|
2476
2590
|
click.echo('Both --all and cluster(s) specified for sky start. '
|
2477
2591
|
'Letting --all take effect.')
|
2478
2592
|
|
2593
|
+
all_clusters = _get_cluster_records_and_set_ssh_config(
|
2594
|
+
clusters=None, refresh=common.StatusRefreshMode.AUTO)
|
2595
|
+
|
2479
2596
|
# Get all clusters that are not controllers.
|
2480
|
-
|
2481
|
-
cluster
|
2482
|
-
for cluster in global_user_state.get_clusters()
|
2597
|
+
cluster_records = [
|
2598
|
+
cluster for cluster in all_clusters
|
2483
2599
|
if controller_utils.Controllers.from_name(cluster['name']) is None
|
2484
2600
|
]
|
2601
|
+
if cluster_records is None:
|
2602
|
+
# Get GLOB cluster names
|
2603
|
+
cluster_records = _get_cluster_records_and_set_ssh_config(
|
2604
|
+
clusters, refresh=common.StatusRefreshMode.AUTO)
|
2485
2605
|
|
2486
|
-
if not
|
2606
|
+
if not cluster_records:
|
2487
2607
|
click.echo('Cluster(s) not found (tip: see `sky status`). Do you '
|
2488
2608
|
'mean to use `sky launch` to provision a new cluster?')
|
2489
2609
|
return
|
2490
2610
|
else:
|
2491
|
-
|
2492
|
-
|
2493
|
-
|
2494
|
-
for name in clusters:
|
2495
|
-
cluster_status, _ = backend_utils.refresh_cluster_status_handle(
|
2496
|
-
name)
|
2611
|
+
for cluster in cluster_records:
|
2612
|
+
name = cluster['name']
|
2613
|
+
cluster_status = cluster['status']
|
2497
2614
|
# A cluster may have one of the following states:
|
2498
2615
|
#
|
2499
2616
|
# STOPPED - ok to restart
|
@@ -2573,18 +2690,25 @@ def start(
|
|
2573
2690
|
abort=True,
|
2574
2691
|
show_default=True)
|
2575
2692
|
|
2576
|
-
|
2693
|
+
request_ids = subprocess_utils.run_in_parallel(
|
2694
|
+
lambda name: sdk.start(name,
|
2695
|
+
idle_minutes_to_autostop,
|
2696
|
+
retry_until_up,
|
2697
|
+
down=down,
|
2698
|
+
force=force), to_start)
|
2699
|
+
|
2700
|
+
for name, request_id in zip(to_start, request_ids):
|
2577
2701
|
try:
|
2578
|
-
|
2579
|
-
|
2580
|
-
|
2581
|
-
|
2582
|
-
force=force)
|
2702
|
+
_async_call_or_wait(request_id, async_call, 'sky.start')
|
2703
|
+
if not async_call:
|
2704
|
+
# Add ssh config for the cluster
|
2705
|
+
_get_cluster_records_and_set_ssh_config(clusters=[name])
|
2583
2706
|
except (exceptions.NotSupportedError,
|
2584
2707
|
exceptions.ClusterOwnerIdentityMismatchError) as e:
|
2585
2708
|
click.echo(str(e))
|
2586
2709
|
else:
|
2587
|
-
|
2710
|
+
if not async_call:
|
2711
|
+
click.secho(f'Cluster {name} started.', fg='green')
|
2588
2712
|
|
2589
2713
|
|
2590
2714
|
@cli.command(cls=_DocumentedCodeCommand)
|
@@ -2594,9 +2718,14 @@ def start(
|
|
2594
2718
|
**_get_shell_complete_args(_complete_cluster_name))
|
2595
2719
|
@click.option('--all',
|
2596
2720
|
'-a',
|
2597
|
-
default=
|
2721
|
+
default=False,
|
2598
2722
|
is_flag=True,
|
2599
2723
|
help='Tear down all existing clusters.')
|
2724
|
+
@click.option('--all-users',
|
2725
|
+
'-u',
|
2726
|
+
default=False,
|
2727
|
+
is_flag=True,
|
2728
|
+
help='Tear down all existing clusters for all users.')
|
2600
2729
|
@click.option('--yes',
|
2601
2730
|
'-y',
|
2602
2731
|
is_flag=True,
|
@@ -2615,12 +2744,15 @@ def start(
|
|
2615
2744
|
' in certain manual troubleshooting scenarios; with it set, it is the'
|
2616
2745
|
' user\'s responsibility to ensure there are no leaked instances and '
|
2617
2746
|
'related resources.'))
|
2747
|
+
@_add_click_options(_COMMON_OPTIONS)
|
2618
2748
|
@usage_lib.entrypoint
|
2619
2749
|
def down(
|
2620
2750
|
clusters: List[str],
|
2621
|
-
all:
|
2751
|
+
all: bool, # pylint: disable=redefined-builtin
|
2752
|
+
all_users: bool, # pylint: disable=redefined-builtin
|
2622
2753
|
yes: bool,
|
2623
2754
|
purge: bool,
|
2755
|
+
async_call: bool,
|
2624
2756
|
):
|
2625
2757
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
2626
2758
|
"""Tear down cluster(s).
|
@@ -2652,12 +2784,15 @@ def down(
|
|
2652
2784
|
"""
|
2653
2785
|
_down_or_stop_clusters(clusters,
|
2654
2786
|
apply_to_all=all,
|
2787
|
+
all_users=all_users,
|
2655
2788
|
down=True,
|
2656
2789
|
no_confirm=yes,
|
2657
|
-
purge=purge
|
2790
|
+
purge=purge,
|
2791
|
+
async_call=async_call)
|
2658
2792
|
|
2659
2793
|
|
2660
|
-
def _hint_or_raise_for_down_jobs_controller(controller_name: str
|
2794
|
+
def _hint_or_raise_for_down_jobs_controller(controller_name: str,
|
2795
|
+
purge: bool) -> None:
|
2661
2796
|
"""Helper function to check job controller status before tearing it down.
|
2662
2797
|
|
2663
2798
|
Raises helpful exceptions and errors if the controller is not in a safe
|
@@ -2669,14 +2804,19 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str):
|
|
2669
2804
|
to be torn down (e.g., because it has jobs running or
|
2670
2805
|
it is in init state)
|
2671
2806
|
"""
|
2807
|
+
if not common.is_current_user_controller(controller_name):
|
2808
|
+
with ux_utils.print_exception_no_traceback():
|
2809
|
+
raise exceptions.NotSupportedError(
|
2810
|
+
f'Tearing down other user\'s managed job controller '
|
2811
|
+
f'{controller_name!r} is not allowed.')
|
2672
2812
|
controller = controller_utils.Controllers.from_name(controller_name)
|
2673
2813
|
assert controller is not None, controller_name
|
2674
2814
|
|
2675
|
-
with rich_utils.
|
2676
|
-
|
2815
|
+
with rich_utils.client_status(
|
2816
|
+
'[bold cyan]Checking for in-progress managed jobs[/]'):
|
2677
2817
|
try:
|
2678
|
-
|
2679
|
-
|
2818
|
+
request_id = managed_jobs.queue(refresh=False, skip_finished=True)
|
2819
|
+
managed_jobs_ = sdk.stream_and_get(request_id)
|
2680
2820
|
except exceptions.ClusterNotUpError as e:
|
2681
2821
|
if controller.value.connection_error_hint in str(e):
|
2682
2822
|
with ux_utils.print_exception_no_traceback():
|
@@ -2704,14 +2844,19 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str):
|
|
2704
2844
|
# Add prefix to each line to align with the bullet point.
|
2705
2845
|
msg += '\n'.join(
|
2706
2846
|
[' ' + line for line in job_table.split('\n') if line != ''])
|
2707
|
-
|
2708
|
-
|
2847
|
+
if purge:
|
2848
|
+
logger.warning('--purge is set, ignoring the in-progress managed '
|
2849
|
+
'jobs. This could cause leaked clusters!')
|
2850
|
+
else:
|
2851
|
+
with ux_utils.print_exception_no_traceback():
|
2852
|
+
raise exceptions.NotSupportedError(msg)
|
2709
2853
|
else:
|
2710
2854
|
click.echo(' * No in-progress managed jobs found. It should be safe to '
|
2711
2855
|
'terminate (see caveats above).')
|
2712
2856
|
|
2713
2857
|
|
2714
|
-
def _hint_or_raise_for_down_sky_serve_controller(controller_name: str
|
2858
|
+
def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
|
2859
|
+
purge: bool) -> None:
|
2715
2860
|
"""Helper function to check serve controller status before tearing it down.
|
2716
2861
|
|
2717
2862
|
Raises helpful exceptions and errors if the controller is not in a safe
|
@@ -2723,12 +2868,18 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str):
|
|
2723
2868
|
to be torn down (e.g., because it has services running or
|
2724
2869
|
it is in init state)
|
2725
2870
|
"""
|
2871
|
+
# TODO(zhwu): Move this check to the sdk or even API server side.
|
2872
|
+
if not common.is_current_user_controller(controller_name):
|
2873
|
+
with ux_utils.print_exception_no_traceback():
|
2874
|
+
raise exceptions.NotSupportedError(
|
2875
|
+
f'Tearing down other user\'s sky serve controller '
|
2876
|
+
f'{controller_name!r} is not allowed.')
|
2726
2877
|
controller = controller_utils.Controllers.from_name(controller_name)
|
2727
2878
|
assert controller is not None, controller_name
|
2728
|
-
with rich_utils.
|
2729
|
-
ux_utils.spinner_message('Checking for live services')):
|
2879
|
+
with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
|
2730
2880
|
try:
|
2731
|
-
|
2881
|
+
request_id = serve_lib.status(service_names=None)
|
2882
|
+
services = sdk.stream_and_get(request_id)
|
2732
2883
|
except exceptions.ClusterNotUpError as e:
|
2733
2884
|
if controller.value.connection_error_hint in str(e):
|
2734
2885
|
with ux_utils.print_exception_no_traceback():
|
@@ -2745,35 +2896,52 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str):
|
|
2745
2896
|
|
2746
2897
|
if services:
|
2747
2898
|
service_names = [service['name'] for service in services]
|
2748
|
-
|
2749
|
-
|
2750
|
-
|
2751
|
-
|
2752
|
-
|
2899
|
+
if purge:
|
2900
|
+
logger.warning('--purge is set, ignoring the in-progress services. '
|
2901
|
+
'This could cause leaked clusters!')
|
2902
|
+
else:
|
2903
|
+
with ux_utils.print_exception_no_traceback():
|
2904
|
+
msg = (controller.value.decline_down_for_dirty_controller_hint.
|
2905
|
+
format(service_names=', '.join(service_names)))
|
2906
|
+
raise exceptions.NotSupportedError(msg)
|
2753
2907
|
# Do nothing for STOPPED state, as it is safe to terminate the cluster.
|
2754
2908
|
click.echo(f'Terminate sky serve controller: {controller_name}.')
|
2755
2909
|
|
2756
2910
|
|
2757
|
-
|
2758
|
-
|
2759
|
-
|
2760
|
-
controller_utils.Controllers.
|
2761
|
-
|
2762
|
-
|
2911
|
+
def _controller_to_hint_or_raise(
|
2912
|
+
controller: controller_utils.Controllers
|
2913
|
+
) -> Callable[[str, bool], None]:
|
2914
|
+
if controller == controller_utils.Controllers.JOBS_CONTROLLER:
|
2915
|
+
return _hint_or_raise_for_down_jobs_controller
|
2916
|
+
return _hint_or_raise_for_down_sky_serve_controller
|
2763
2917
|
|
2764
2918
|
|
2765
2919
|
def _down_or_stop_clusters(
|
2766
2920
|
names: List[str],
|
2767
|
-
apply_to_all:
|
2768
|
-
|
2769
|
-
|
2921
|
+
apply_to_all: bool = False,
|
2922
|
+
all_users: bool = False,
|
2923
|
+
down: bool = False, # pylint: disable=redefined-outer-name
|
2924
|
+
no_confirm: bool = True,
|
2770
2925
|
purge: bool = False,
|
2771
|
-
idle_minutes_to_autostop: Optional[int] = None
|
2926
|
+
idle_minutes_to_autostop: Optional[int] = None,
|
2927
|
+
async_call: bool = False) -> None:
|
2772
2928
|
"""Tears down or (auto-)stops a cluster (or all clusters).
|
2773
2929
|
|
2774
2930
|
Controllers (jobs controller and sky serve controller) can only be
|
2775
2931
|
terminated if the cluster name is explicitly and uniquely specified (not
|
2776
2932
|
via glob).
|
2933
|
+
|
2934
|
+
Args:
|
2935
|
+
names: The names of the clusters to tear down or stop. If empty,
|
2936
|
+
apply_to_all or all_users must be set.
|
2937
|
+
apply_to_all: If True, apply the operation to all clusters.
|
2938
|
+
all_users: If True, apply the operation to all clusters for all users.
|
2939
|
+
down: If True, tear down the clusters.
|
2940
|
+
no_confirm: If True, skip the confirmation prompt.
|
2941
|
+
purge: If True, forcefully remove the clusters from the cluster table.
|
2942
|
+
idle_minutes_to_autostop: The number of minutes to wait before
|
2943
|
+
automatically stopping the cluster.
|
2944
|
+
async_call: If True, send the request asynchronously.
|
2777
2945
|
"""
|
2778
2946
|
if down:
|
2779
2947
|
command = 'down'
|
@@ -2781,17 +2949,12 @@ def _down_or_stop_clusters(
|
|
2781
2949
|
command = 'autostop'
|
2782
2950
|
else:
|
2783
2951
|
command = 'stop'
|
2784
|
-
if not names and apply_to_all
|
2785
|
-
|
2786
|
-
|
2787
|
-
|
2788
|
-
|
2789
|
-
|
2790
|
-
names = all_cluster_names
|
2791
|
-
else:
|
2792
|
-
raise click.UsageError(
|
2793
|
-
f'`sky {command}` requires either a cluster name or glob '
|
2794
|
-
'(see `sky status`), or the -a/--all flag.')
|
2952
|
+
if not names and not apply_to_all and not all_users:
|
2953
|
+
raise click.UsageError(
|
2954
|
+
f'`sky {command}` requires either a cluster name or glob '
|
2955
|
+
'(see `sky status`), or the -a/--all flag for all your '
|
2956
|
+
'clusters, or the -u/--all-users flag for all clusters in '
|
2957
|
+
'your team.')
|
2795
2958
|
|
2796
2959
|
operation = 'Terminating' if down else 'Stopping'
|
2797
2960
|
if idle_minutes_to_autostop is not None:
|
@@ -2802,6 +2965,7 @@ def _down_or_stop_clusters(
|
|
2802
2965
|
option_str = '{stop,down}'
|
2803
2966
|
operation = f'{verb} auto{option_str} on'
|
2804
2967
|
|
2968
|
+
names = list(names)
|
2805
2969
|
if names:
|
2806
2970
|
controllers = [
|
2807
2971
|
name for name in names
|
@@ -2809,8 +2973,9 @@ def _down_or_stop_clusters(
|
|
2809
2973
|
]
|
2810
2974
|
controllers_str = ', '.join(map(repr, controllers))
|
2811
2975
|
names = [
|
2812
|
-
name
|
2813
|
-
|
2976
|
+
cluster['name']
|
2977
|
+
for cluster in _get_cluster_records_and_set_ssh_config(names)
|
2978
|
+
if controller_utils.Controllers.from_name(cluster['name']) is None
|
2814
2979
|
]
|
2815
2980
|
|
2816
2981
|
# Make sure the controllers are explicitly specified without other
|
@@ -2837,7 +3002,7 @@ def _down_or_stop_clusters(
|
|
2837
3002
|
controller = controller_utils.Controllers.from_name(
|
2838
3003
|
controller_name)
|
2839
3004
|
assert controller is not None
|
2840
|
-
hint_or_raise =
|
3005
|
+
hint_or_raise = _controller_to_hint_or_raise(controller)
|
2841
3006
|
try:
|
2842
3007
|
# TODO(zhwu): This hint or raise is not transactional, which
|
2843
3008
|
# means even if it passed the check with no in-progress spot
|
@@ -2846,7 +3011,7 @@ def _down_or_stop_clusters(
|
|
2846
3011
|
# `sky serve up` before typing the delete, causing a leaked
|
2847
3012
|
# managed job or service. We should make this check atomic
|
2848
3013
|
# with the termination.
|
2849
|
-
hint_or_raise(controller_name)
|
3014
|
+
hint_or_raise(controller_name, purge)
|
2850
3015
|
except (exceptions.ClusterOwnerIdentityMismatchError,
|
2851
3016
|
RuntimeError) as e:
|
2852
3017
|
if purge:
|
@@ -2867,8 +3032,9 @@ def _down_or_stop_clusters(
|
|
2867
3032
|
no_confirm = True
|
2868
3033
|
names += controllers
|
2869
3034
|
|
2870
|
-
if apply_to_all:
|
2871
|
-
all_clusters =
|
3035
|
+
if apply_to_all or all_users:
|
3036
|
+
all_clusters = _get_cluster_records_and_set_ssh_config(
|
3037
|
+
clusters=None, all_users=all_users)
|
2872
3038
|
if names:
|
2873
3039
|
click.echo(
|
2874
3040
|
f'Both --all and cluster(s) specified for `sky {command}`. '
|
@@ -2881,15 +3047,7 @@ def _down_or_stop_clusters(
|
|
2881
3047
|
if controller_utils.Controllers.from_name(record['name']) is None
|
2882
3048
|
]
|
2883
3049
|
|
2884
|
-
clusters =
|
2885
|
-
for name in names:
|
2886
|
-
handle = global_user_state.get_handle_from_cluster_name(name)
|
2887
|
-
if handle is None:
|
2888
|
-
# This codepath is used for 'sky down -p <controller>' when the
|
2889
|
-
# controller is not in 'sky status'. Cluster-not-found message
|
2890
|
-
# should've been printed by _get_glob_clusters() above.
|
2891
|
-
continue
|
2892
|
-
clusters.append(name)
|
3050
|
+
clusters = names
|
2893
3051
|
usage_lib.record_cluster_name_for_current_operation(clusters)
|
2894
3052
|
|
2895
3053
|
if not clusters:
|
@@ -2910,15 +3068,21 @@ def _down_or_stop_clusters(
|
|
2910
3068
|
progress = rich_progress.Progress(transient=True,
|
2911
3069
|
redirect_stdout=False,
|
2912
3070
|
redirect_stderr=False)
|
2913
|
-
task = progress.add_task(
|
2914
|
-
f'{operation} {len(clusters)} cluster{plural}'
|
2915
|
-
|
3071
|
+
task = progress.add_task(
|
3072
|
+
f'[bold cyan]{operation} {len(clusters)} cluster{plural}[/]',
|
3073
|
+
total=len(clusters))
|
3074
|
+
|
3075
|
+
request_ids = []
|
2916
3076
|
|
2917
3077
|
def _down_or_stop(name: str):
|
2918
3078
|
success_progress = False
|
2919
3079
|
if idle_minutes_to_autostop is not None:
|
2920
3080
|
try:
|
2921
|
-
|
3081
|
+
request_id = sdk.autostop(name, idle_minutes_to_autostop, down)
|
3082
|
+
request_ids.append(request_id)
|
3083
|
+
_async_call_or_wait(
|
3084
|
+
request_id, async_call,
|
3085
|
+
server_constants.REQUEST_NAME_PREFIX + operation)
|
2922
3086
|
except (exceptions.NotSupportedError,
|
2923
3087
|
exceptions.ClusterNotUpError) as e:
|
2924
3088
|
message = str(e)
|
@@ -2941,9 +3105,17 @@ def _down_or_stop_clusters(
|
|
2941
3105
|
else:
|
2942
3106
|
try:
|
2943
3107
|
if down:
|
2944
|
-
|
3108
|
+
request_id = sdk.down(name, purge=purge)
|
2945
3109
|
else:
|
2946
|
-
|
3110
|
+
request_id = sdk.stop(name, purge=purge)
|
3111
|
+
request_ids.append(request_id)
|
3112
|
+
_async_call_or_wait(
|
3113
|
+
request_id, async_call,
|
3114
|
+
server_constants.REQUEST_NAME_PREFIX + operation)
|
3115
|
+
if not async_call:
|
3116
|
+
# Remove the cluster from the SSH config file as soon as it
|
3117
|
+
# is stopped or downed.
|
3118
|
+
cluster_utils.SSHConfigHelper.remove_cluster(name)
|
2947
3119
|
except RuntimeError as e:
|
2948
3120
|
message = (
|
2949
3121
|
f'{colorama.Fore.RED}{operation} cluster {name}...failed. '
|
@@ -2974,6 +3146,10 @@ def _down_or_stop_clusters(
|
|
2974
3146
|
# Make sure the progress bar not mess up the terminal.
|
2975
3147
|
progress.refresh()
|
2976
3148
|
|
3149
|
+
if async_call:
|
3150
|
+
click.secho(f'{operation} requests are sent. Check the requests\' '
|
3151
|
+
'status with `sky request get <request_id>`.')
|
3152
|
+
|
2977
3153
|
|
2978
3154
|
@cli.command(cls=_DocumentedCodeCommand)
|
2979
3155
|
@click.argument('clouds', required=False, type=str, nargs=-1)
|
@@ -2983,6 +3159,7 @@ def _down_or_stop_clusters(
|
|
2983
3159
|
default=False,
|
2984
3160
|
help='Show the activated account for each cloud.')
|
2985
3161
|
@usage_lib.entrypoint
|
3162
|
+
# pylint: disable=redefined-outer-name
|
2986
3163
|
def check(clouds: Tuple[str], verbose: bool):
|
2987
3164
|
"""Check which clouds are available to use.
|
2988
3165
|
|
@@ -3005,8 +3182,13 @@ def check(clouds: Tuple[str], verbose: bool):
|
|
3005
3182
|
# Check only specific clouds - AWS and GCP.
|
3006
3183
|
sky check aws gcp
|
3007
3184
|
"""
|
3008
|
-
clouds_arg = clouds if clouds else None
|
3009
|
-
|
3185
|
+
clouds_arg = clouds if len(clouds) > 0 else None
|
3186
|
+
request_id = sdk.check(clouds=clouds_arg, verbose=verbose)
|
3187
|
+
sdk.stream_and_get(request_id)
|
3188
|
+
api_server_url = server_common.get_server_url()
|
3189
|
+
click.echo()
|
3190
|
+
click.echo(
|
3191
|
+
click.style(f'Using SkyPilot API server: {api_server_url}', fg='green'))
|
3010
3192
|
|
3011
3193
|
|
3012
3194
|
@cli.command()
|
@@ -3099,23 +3281,27 @@ def show_gpus(
|
|
3099
3281
|
'--all-regions and --region flags cannot be used simultaneously.')
|
3100
3282
|
|
3101
3283
|
# This will validate 'cloud' and raise if not found.
|
3102
|
-
cloud_obj =
|
3103
|
-
cloud_name = cloud_obj.
|
3104
|
-
service_catalog.validate_region_zone(region, None, clouds=cloud_name)
|
3284
|
+
cloud_obj = registry.CLOUD_REGISTRY.from_str(cloud)
|
3285
|
+
cloud_name = str(cloud_obj).lower() if cloud is not None else None
|
3105
3286
|
show_all = all
|
3106
3287
|
if show_all and accelerator_str is not None:
|
3107
3288
|
raise click.UsageError('--all is only allowed without a GPU name.')
|
3108
3289
|
|
3109
3290
|
# Kubernetes specific bools
|
3110
|
-
|
3291
|
+
enabled_clouds = sdk.get(sdk.enabled_clouds())
|
3292
|
+
cloud_is_kubernetes = isinstance(cloud_obj, clouds.Kubernetes)
|
3293
|
+
# TODO(romilb): We should move this to the backend.
|
3111
3294
|
kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None
|
3112
|
-
kubernetes_is_enabled =
|
3113
|
-
|
3114
|
-
|
3295
|
+
kubernetes_is_enabled = clouds.cloud_in_iterable(
|
3296
|
+
clouds.Kubernetes(),
|
3297
|
+
enabled_clouds,
|
3298
|
+
)
|
3115
3299
|
|
3116
3300
|
def _list_to_str(lst):
|
3117
3301
|
return ', '.join([str(e) for e in lst])
|
3118
3302
|
|
3303
|
+
# TODO(zhwu,romilb): We should move most of these kubernetes related
|
3304
|
+
# queries into the backend, especially behind the server.
|
3119
3305
|
def _get_kubernetes_realtime_gpu_table(
|
3120
3306
|
context: Optional[str] = None,
|
3121
3307
|
name_filter: Optional[str] = None,
|
@@ -3128,19 +3314,12 @@ def show_gpus(
|
|
3128
3314
|
free_header = 'TOTAL_FREE_GPUS'
|
3129
3315
|
realtime_gpu_table = log_utils.create_table(
|
3130
3316
|
['GPU', qty_header, 'TOTAL_GPUS', free_header])
|
3131
|
-
|
3132
|
-
|
3133
|
-
|
3134
|
-
|
3135
|
-
|
3136
|
-
|
3137
|
-
case_sensitive=False)
|
3138
|
-
assert (set(counts.keys()) == set(capacity.keys()) == set(
|
3139
|
-
available.keys())), (f'Keys of counts ({list(counts.keys())}), '
|
3140
|
-
f'capacity ({list(capacity.keys())}), '
|
3141
|
-
f'and available ({list(available.keys())}) '
|
3142
|
-
'must be same.')
|
3143
|
-
if not counts:
|
3317
|
+
realtime_gpu_availability_list = sdk.stream_and_get(
|
3318
|
+
sdk.realtime_kubernetes_gpu_availability(
|
3319
|
+
context=context,
|
3320
|
+
name_filter=name_filter,
|
3321
|
+
quantity_filter=quantity_filter))
|
3322
|
+
if not realtime_gpu_availability_list:
|
3144
3323
|
err_msg = 'No GPUs found in Kubernetes cluster. '
|
3145
3324
|
debug_msg = 'To further debug, run: sky check '
|
3146
3325
|
if name_filter is not None:
|
@@ -3152,24 +3331,32 @@ def show_gpus(
|
|
3152
3331
|
'in Kubernetes cluster. ')
|
3153
3332
|
debug_msg = ('To show available accelerators on kubernetes,'
|
3154
3333
|
' run: sky show-gpus --cloud kubernetes ')
|
3155
|
-
full_err_msg = (err_msg +
|
3156
|
-
kubernetes_utils.NO_ACCELERATOR_HELP_MESSAGE +
|
3334
|
+
full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
|
3157
3335
|
debug_msg)
|
3158
3336
|
raise ValueError(full_err_msg)
|
3159
|
-
|
3160
|
-
|
3161
|
-
|
3337
|
+
no_permissions_str = '<no permissions>'
|
3338
|
+
for realtime_gpu_availability in sorted(realtime_gpu_availability_list):
|
3339
|
+
gpu_availability = models.RealtimeGpuAvailability(
|
3340
|
+
*realtime_gpu_availability)
|
3341
|
+
available_qty = (gpu_availability.available
|
3342
|
+
if gpu_availability.available != -1 else
|
3343
|
+
no_permissions_str)
|
3162
3344
|
realtime_gpu_table.add_row([
|
3163
|
-
gpu,
|
3164
|
-
_list_to_str(counts
|
3345
|
+
gpu_availability.gpu,
|
3346
|
+
_list_to_str(gpu_availability.counts),
|
3347
|
+
gpu_availability.capacity,
|
3348
|
+
available_qty,
|
3165
3349
|
])
|
3166
3350
|
return realtime_gpu_table
|
3167
3351
|
|
3352
|
+
# TODO(zhwu): this needs to run on remote server.
|
3168
3353
|
def _get_kubernetes_node_info_table(context: Optional[str]):
|
3169
3354
|
node_table = log_utils.create_table(
|
3170
3355
|
['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS'])
|
3171
3356
|
|
3172
|
-
|
3357
|
+
no_permissions_str = '<no permissions>'
|
3358
|
+
node_info_dict = sdk.stream_and_get(
|
3359
|
+
sdk.kubernetes_node_info(context=context))
|
3173
3360
|
for node_name, node_info in node_info_dict.items():
|
3174
3361
|
available = node_info.free[
|
3175
3362
|
'accelerators_available'] if node_info.free[
|
@@ -3180,7 +3367,7 @@ def show_gpus(
|
|
3180
3367
|
])
|
3181
3368
|
return node_table
|
3182
3369
|
|
3183
|
-
def _output():
|
3370
|
+
def _output() -> Generator[str, None, None]:
|
3184
3371
|
gpu_table = log_utils.create_table(
|
3185
3372
|
['COMMON_GPU', 'AVAILABLE_QUANTITIES'])
|
3186
3373
|
tpu_table = log_utils.create_table(
|
@@ -3193,7 +3380,7 @@ def show_gpus(
|
|
3193
3380
|
# Optimization - do not poll for Kubernetes API for fetching
|
3194
3381
|
# common GPUs because that will be fetched later for the table after
|
3195
3382
|
# common GPUs.
|
3196
|
-
clouds_to_list = cloud_name
|
3383
|
+
clouds_to_list: Union[Optional[str], List[str]] = cloud_name
|
3197
3384
|
if cloud_name is None:
|
3198
3385
|
clouds_to_list = [
|
3199
3386
|
c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes'
|
@@ -3206,12 +3393,8 @@ def show_gpus(
|
|
3206
3393
|
# If cloud is kubernetes, we want to show real-time capacity
|
3207
3394
|
if kubernetes_is_enabled and (cloud_name is None or
|
3208
3395
|
cloud_is_kubernetes):
|
3209
|
-
|
3210
|
-
|
3211
|
-
else:
|
3212
|
-
# If region is not specified, we use the current context
|
3213
|
-
context = (
|
3214
|
-
kubernetes_utils.get_current_kube_config_context_name())
|
3396
|
+
context = region
|
3397
|
+
|
3215
3398
|
try:
|
3216
3399
|
# If --cloud kubernetes is not specified, we want to catch
|
3217
3400
|
# the case where no GPUs are available on the cluster and
|
@@ -3225,8 +3408,9 @@ def show_gpus(
|
|
3225
3408
|
k8s_messages += str(e)
|
3226
3409
|
else:
|
3227
3410
|
print_section_titles = True
|
3411
|
+
context_str = f'(Context: {context})' if context else ''
|
3228
3412
|
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
3229
|
-
f'Kubernetes GPUs
|
3413
|
+
f'Kubernetes GPUs {context_str}'
|
3230
3414
|
f'{colorama.Style.RESET_ALL}\n')
|
3231
3415
|
yield from k8s_realtime_table.get_string()
|
3232
3416
|
k8s_node_table = _get_kubernetes_node_info_table(context)
|
@@ -3262,11 +3446,14 @@ def show_gpus(
|
|
3262
3446
|
yield k8s_messages
|
3263
3447
|
yield '\n\n'
|
3264
3448
|
|
3265
|
-
result =
|
3266
|
-
|
3267
|
-
|
3268
|
-
|
3269
|
-
|
3449
|
+
result = sdk.stream_and_get(
|
3450
|
+
sdk.list_accelerator_counts(
|
3451
|
+
gpus_only=True,
|
3452
|
+
clouds=clouds_to_list,
|
3453
|
+
region_filter=region,
|
3454
|
+
))
|
3455
|
+
# TODO(zhwu): handle the case where no accelerators are found,
|
3456
|
+
# especially when --region specified a non-existent region.
|
3270
3457
|
|
3271
3458
|
if print_section_titles:
|
3272
3459
|
# If section titles were printed above, print again here
|
@@ -3354,16 +3541,17 @@ def show_gpus(
|
|
3354
3541
|
|
3355
3542
|
# For clouds other than Kubernetes, get the accelerator details
|
3356
3543
|
# Case-sensitive
|
3357
|
-
result =
|
3358
|
-
|
3359
|
-
|
3360
|
-
|
3361
|
-
|
3362
|
-
|
3363
|
-
|
3544
|
+
result = sdk.stream_and_get(
|
3545
|
+
sdk.list_accelerators(gpus_only=True,
|
3546
|
+
name_filter=name,
|
3547
|
+
quantity_filter=quantity,
|
3548
|
+
region_filter=region,
|
3549
|
+
clouds=clouds_to_list,
|
3550
|
+
case_sensitive=False,
|
3551
|
+
all_regions=all_regions))
|
3364
3552
|
# Import here to save module load speed.
|
3365
3553
|
# pylint: disable=import-outside-toplevel,line-too-long
|
3366
|
-
from sky.clouds.service_catalog import common
|
3554
|
+
from sky.clouds.service_catalog import common as catalog_common
|
3367
3555
|
|
3368
3556
|
# For each gpu name (count not included):
|
3369
3557
|
# - Group by cloud
|
@@ -3384,7 +3572,7 @@ def show_gpus(
|
|
3384
3572
|
df = df.sort_values(by=['min_price', 'min_spot_price'])
|
3385
3573
|
df = df.drop(columns=['min_price', 'min_spot_price'])
|
3386
3574
|
sorted_dataclasses = [
|
3387
|
-
|
3575
|
+
catalog_common.InstanceTypeInfo(*row)
|
3388
3576
|
for row in df.to_records(index=False)
|
3389
3577
|
]
|
3390
3578
|
new_result[gpu] = sorted_dataclasses
|
@@ -3459,10 +3647,11 @@ def show_gpus(
|
|
3459
3647
|
yield '\n\n'
|
3460
3648
|
yield from accelerator_table.get_string()
|
3461
3649
|
|
3650
|
+
outputs = _output()
|
3462
3651
|
if show_all:
|
3463
|
-
click.echo_via_pager(
|
3652
|
+
click.echo_via_pager(outputs)
|
3464
3653
|
else:
|
3465
|
-
for out in
|
3654
|
+
for out in outputs:
|
3466
3655
|
click.echo(out, nl=False)
|
3467
3656
|
click.echo()
|
3468
3657
|
|
@@ -3474,18 +3663,20 @@ def storage():
|
|
3474
3663
|
|
3475
3664
|
|
3476
3665
|
@storage.command('ls', cls=_DocumentedCodeCommand)
|
3477
|
-
@click.option('--
|
3478
|
-
'-
|
3666
|
+
@click.option('--verbose',
|
3667
|
+
'-v',
|
3479
3668
|
default=False,
|
3480
3669
|
is_flag=True,
|
3481
3670
|
required=False,
|
3482
3671
|
help='Show all information in full.')
|
3483
3672
|
@usage_lib.entrypoint
|
3484
3673
|
# pylint: disable=redefined-builtin
|
3485
|
-
def storage_ls(
|
3674
|
+
def storage_ls(verbose: bool):
|
3486
3675
|
"""List storage objects managed by SkyPilot."""
|
3487
|
-
|
3488
|
-
|
3676
|
+
request_id = sdk.storage_ls()
|
3677
|
+
storages = sdk.stream_and_get(request_id)
|
3678
|
+
storage_table = storage_utils.format_storage_table(storages,
|
3679
|
+
show_all=verbose)
|
3489
3680
|
click.echo(storage_table)
|
3490
3681
|
|
3491
3682
|
|
@@ -3507,8 +3698,9 @@ def storage_ls(all: bool):
|
|
3507
3698
|
is_flag=True,
|
3508
3699
|
required=False,
|
3509
3700
|
help='Skip confirmation prompt.')
|
3701
|
+
@_add_click_options(_COMMON_OPTIONS)
|
3510
3702
|
@usage_lib.entrypoint
|
3511
|
-
def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=redefined-builtin
|
3703
|
+
def storage_delete(names: List[str], all: bool, yes: bool, async_call: bool): # pylint: disable=redefined-builtin
|
3512
3704
|
"""Delete storage objects.
|
3513
3705
|
|
3514
3706
|
Examples:
|
@@ -3527,9 +3719,8 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
|
|
3527
3719
|
if sum([bool(names), all]) != 1:
|
3528
3720
|
raise click.UsageError('Either --all or a name must be specified.')
|
3529
3721
|
if all:
|
3530
|
-
|
3531
|
-
|
3532
|
-
if not names:
|
3722
|
+
storages = sdk.get(sdk.storage_ls())
|
3723
|
+
if not storages:
|
3533
3724
|
click.echo('No storage(s) to delete.')
|
3534
3725
|
return
|
3535
3726
|
else:
|
@@ -3545,19 +3736,25 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
|
|
3545
3736
|
abort=True,
|
3546
3737
|
show_default=True)
|
3547
3738
|
|
3548
|
-
|
3739
|
+
request_ids = {}
|
3740
|
+
# TODO(zhwu): Support all flag for the underlying SDK and API server to
|
3741
|
+
# avoid multiple requests.
|
3742
|
+
for name in names:
|
3743
|
+
request_ids[name] = sdk.storage_delete(name)
|
3744
|
+
|
3745
|
+
for name, request_id in request_ids.items():
|
3549
3746
|
try:
|
3550
|
-
sky.
|
3747
|
+
_async_call_or_wait(request_id, async_call, 'sky.storage')
|
3551
3748
|
except Exception as e: # pylint: disable=broad-except
|
3552
|
-
|
3553
|
-
|
3554
|
-
|
3749
|
+
logger.error(f'{colorama.Fore.RED}Error deleting storage {name}: '
|
3750
|
+
f'{common_utils.format_exception(e, use_bracket=True)}'
|
3751
|
+
f'{colorama.Style.RESET_ALL}')
|
3555
3752
|
|
3556
3753
|
|
3557
|
-
@cli.group(cls=_NaturalOrderGroup)
|
3754
|
+
@cli.group(cls=_NaturalOrderGroup, hidden=True)
|
3558
3755
|
def bench():
|
3559
3756
|
"""SkyPilot Benchmark CLI."""
|
3560
|
-
|
3757
|
+
raise click.UsageError('The benchmark CLI is currently disabled.')
|
3561
3758
|
|
3562
3759
|
|
3563
3760
|
@cli.group(cls=_NaturalOrderGroup)
|
@@ -3573,7 +3770,8 @@ def jobs():
|
|
3573
3770
|
nargs=-1,
|
3574
3771
|
**_get_shell_complete_args(_complete_file_name))
|
3575
3772
|
# TODO(zhwu): Add --dryrun option to test the launch command.
|
3576
|
-
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS
|
3773
|
+
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS +
|
3774
|
+
_COMMON_OPTIONS)
|
3577
3775
|
@click.option('--cluster',
|
3578
3776
|
'-c',
|
3579
3777
|
default=None,
|
@@ -3622,6 +3820,7 @@ def jobs_launch(
|
|
3622
3820
|
ports: Tuple[str],
|
3623
3821
|
detach_run: bool,
|
3624
3822
|
yes: bool,
|
3823
|
+
async_call: bool,
|
3625
3824
|
):
|
3626
3825
|
"""Launch a managed job from a YAML or a command.
|
3627
3826
|
|
@@ -3678,36 +3877,25 @@ def jobs_launch(
|
|
3678
3877
|
dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
|
3679
3878
|
dag_utils.fill_default_config_in_dag_for_job_launch(dag)
|
3680
3879
|
|
3681
|
-
dag, _ = admin_policy_utils.apply(
|
3682
|
-
dag, use_mutated_config_in_current_request=False)
|
3683
|
-
|
3684
|
-
if yes:
|
3685
|
-
# Skip resource preview if -y is set, since we are probably running in
|
3686
|
-
# a script and the user won't have a chance to review it anyway.
|
3687
|
-
# This can save a couple of seconds.
|
3688
|
-
click.secho(
|
3689
|
-
f'Resources for managed job {dag.name!r} will be computed on the '
|
3690
|
-
'managed jobs controller, since --yes is set.',
|
3691
|
-
fg='cyan')
|
3692
|
-
|
3693
|
-
else:
|
3694
|
-
click.secho(
|
3695
|
-
f'Managed job {dag.name!r} will be launched on (estimated):',
|
3696
|
-
fg='cyan')
|
3697
|
-
dag = sky.optimize(dag)
|
3698
|
-
|
3699
|
-
prompt = f'Launching a managed job {dag.name!r}. Proceed?'
|
3700
|
-
if prompt is not None:
|
3701
|
-
click.confirm(prompt, default=True, abort=True, show_default=True)
|
3702
|
-
|
3703
3880
|
common_utils.check_cluster_name_is_valid(name)
|
3704
3881
|
|
3705
|
-
|
3882
|
+
click.secho(f'Managed job {dag.name!r} will be launched on (estimated):',
|
3883
|
+
fg='yellow')
|
3884
|
+
|
3885
|
+
request_id = managed_jobs.launch(dag, name, _need_confirmation=not yes)
|
3886
|
+
job_id_handle = _async_call_or_wait(request_id, async_call,
|
3887
|
+
'sky.jobs.launch')
|
3888
|
+
if not async_call and not detach_run:
|
3889
|
+
job_id = job_id_handle[0]
|
3890
|
+
managed_jobs.tail_logs(name=None,
|
3891
|
+
job_id=job_id,
|
3892
|
+
follow=True,
|
3893
|
+
controller=False)
|
3706
3894
|
|
3707
3895
|
|
3708
3896
|
@jobs.command('queue', cls=_DocumentedCodeCommand)
|
3709
|
-
@click.option('--
|
3710
|
-
'-
|
3897
|
+
@click.option('--verbose',
|
3898
|
+
'-v',
|
3711
3899
|
default=False,
|
3712
3900
|
is_flag=True,
|
3713
3901
|
required=False,
|
@@ -3728,7 +3916,7 @@ def jobs_launch(
|
|
3728
3916
|
help='Show only pending/running jobs\' information.')
|
3729
3917
|
@usage_lib.entrypoint
|
3730
3918
|
# pylint: disable=redefined-builtin
|
3731
|
-
def jobs_queue(
|
3919
|
+
def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool):
|
3732
3920
|
"""Show statuses of managed jobs.
|
3733
3921
|
|
3734
3922
|
Each managed jobs can have one of the following statuses:
|
@@ -3782,13 +3970,13 @@ def jobs_queue(all: bool, refresh: bool, skip_finished: bool):
|
|
3782
3970
|
watch -n60 sky jobs queue
|
3783
3971
|
|
3784
3972
|
"""
|
3785
|
-
click.secho('Fetching managed
|
3786
|
-
with rich_utils.
|
3787
|
-
|
3788
|
-
|
3789
|
-
|
3790
|
-
|
3791
|
-
|
3973
|
+
click.secho('Fetching managed job statuses...', fg='cyan')
|
3974
|
+
with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
|
3975
|
+
managed_jobs_request_id = managed_jobs.queue(
|
3976
|
+
refresh=refresh, skip_finished=skip_finished)
|
3977
|
+
_, msg = _handle_jobs_queue_request(managed_jobs_request_id,
|
3978
|
+
show_all=verbose,
|
3979
|
+
is_called_by_user=True)
|
3792
3980
|
if not skip_finished:
|
3793
3981
|
in_progress_only_hint = ''
|
3794
3982
|
else:
|
@@ -3835,13 +4023,6 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
|
|
3835
4023
|
# Cancel managed jobs with IDs 1, 2, 3
|
3836
4024
|
$ sky jobs cancel 1 2 3
|
3837
4025
|
"""
|
3838
|
-
with rich_utils.safe_status(
|
3839
|
-
ux_utils.spinner_message('Checking managed jobs')):
|
3840
|
-
backend_utils.is_controller_accessible(
|
3841
|
-
controller=controller_utils.Controllers.JOBS_CONTROLLER,
|
3842
|
-
stopped_message='All managed jobs should have finished.',
|
3843
|
-
exit_if_not_accessible=True)
|
3844
|
-
|
3845
4026
|
job_id_str = ','.join(map(str, job_ids))
|
3846
4027
|
if sum([bool(job_ids), name is not None, all]) != 1:
|
3847
4028
|
argument_str = f'--job-ids {job_id_str}' if job_ids else ''
|
@@ -3861,7 +4042,7 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
|
|
3861
4042
|
abort=True,
|
3862
4043
|
show_default=True)
|
3863
4044
|
|
3864
|
-
managed_jobs.cancel(job_ids=job_ids, name=name, all=all)
|
4045
|
+
sdk.stream_and_get(managed_jobs.cancel(job_ids=job_ids, name=name, all=all))
|
3865
4046
|
|
3866
4047
|
|
3867
4048
|
@jobs.command('logs', cls=_DocumentedCodeCommand)
|
@@ -3903,10 +4084,19 @@ def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
|
|
3903
4084
|
"""Tail or sync down the log of a managed job."""
|
3904
4085
|
try:
|
3905
4086
|
if sync_down:
|
3906
|
-
|
3907
|
-
|
3908
|
-
|
3909
|
-
|
4087
|
+
with rich_utils.client_status(
|
4088
|
+
ux_utils.spinner_message('Downloading jobs logs')):
|
4089
|
+
log_local_path_dict = managed_jobs.download_logs(
|
4090
|
+
name=name,
|
4091
|
+
job_id=job_id,
|
4092
|
+
controller=controller,
|
4093
|
+
refresh=refresh)
|
4094
|
+
style = colorama.Style
|
4095
|
+
fore = colorama.Fore
|
4096
|
+
controller_str = ' (controller)' if controller else ''
|
4097
|
+
for job, log_local_path in log_local_path_dict.items():
|
4098
|
+
logger.info(f'{fore.CYAN}Job {job} logs{controller_str}: '
|
4099
|
+
f'{log_local_path}{style.RESET_ALL}')
|
3910
4100
|
else:
|
3911
4101
|
managed_jobs.tail_logs(name=name,
|
3912
4102
|
job_id=job_id,
|
@@ -3919,62 +4109,10 @@ def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
|
|
3919
4109
|
|
3920
4110
|
|
3921
4111
|
@jobs.command('dashboard', cls=_DocumentedCodeCommand)
|
3922
|
-
@click.option(
|
3923
|
-
'--port',
|
3924
|
-
'-p',
|
3925
|
-
default=None,
|
3926
|
-
type=int,
|
3927
|
-
required=False,
|
3928
|
-
help=('Local port to use for the dashboard. If None, a free port is '
|
3929
|
-
'automatically chosen.'))
|
3930
4112
|
@usage_lib.entrypoint
|
3931
|
-
def jobs_dashboard(
|
3932
|
-
"""Opens a dashboard for managed jobs
|
3933
|
-
|
3934
|
-
# API perhaps via REST. Then here we would (1) not have to use SSH to try to
|
3935
|
-
# see if the controller is UP first, which is slow; (2) not have to run SSH
|
3936
|
-
# port forwarding first (we'd just launch a local dashboard which would make
|
3937
|
-
# REST API calls to the controller dashboard server).
|
3938
|
-
click.secho('Checking if jobs controller is up...', fg='cyan')
|
3939
|
-
hint = ('Dashboard is not available if jobs controller is not up. Run a '
|
3940
|
-
'managed job first.')
|
3941
|
-
backend_utils.is_controller_accessible(
|
3942
|
-
controller=controller_utils.Controllers.JOBS_CONTROLLER,
|
3943
|
-
stopped_message=hint,
|
3944
|
-
non_existent_message=hint,
|
3945
|
-
exit_if_not_accessible=True)
|
3946
|
-
|
3947
|
-
# SSH forward a free local port to remote's dashboard port.
|
3948
|
-
remote_port = constants.SPOT_DASHBOARD_REMOTE_PORT
|
3949
|
-
if port is None:
|
3950
|
-
free_port = common_utils.find_free_port(remote_port)
|
3951
|
-
else:
|
3952
|
-
free_port = port
|
3953
|
-
ssh_command = (
|
3954
|
-
f'ssh -qNL {free_port}:localhost:{remote_port} '
|
3955
|
-
f'{controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name}')
|
3956
|
-
click.echo('Forwarding port: ', nl=False)
|
3957
|
-
click.secho(f'{ssh_command}', dim=True)
|
3958
|
-
|
3959
|
-
with subprocess.Popen(ssh_command, shell=True,
|
3960
|
-
start_new_session=True) as ssh_process:
|
3961
|
-
time.sleep(3) # Added delay for ssh_command to initialize.
|
3962
|
-
webbrowser.open(f'http://localhost:{free_port}')
|
3963
|
-
click.secho(
|
3964
|
-
f'Dashboard is now available at: http://127.0.0.1:{free_port}',
|
3965
|
-
fg='green')
|
3966
|
-
try:
|
3967
|
-
ssh_process.wait()
|
3968
|
-
except KeyboardInterrupt:
|
3969
|
-
# When user presses Ctrl-C in terminal, exits the previous ssh
|
3970
|
-
# command so that <free local port> is freed up.
|
3971
|
-
try:
|
3972
|
-
os.killpg(os.getpgid(ssh_process.pid), signal.SIGTERM)
|
3973
|
-
except ProcessLookupError:
|
3974
|
-
# This happens if jobs controller is auto-stopped.
|
3975
|
-
pass
|
3976
|
-
finally:
|
3977
|
-
click.echo('Exiting.')
|
4113
|
+
def jobs_dashboard():
|
4114
|
+
"""Opens a dashboard for managed jobs."""
|
4115
|
+
managed_jobs.dashboard()
|
3978
4116
|
|
3979
4117
|
|
3980
4118
|
@cli.group(cls=_NaturalOrderGroup)
|
@@ -4111,7 +4249,7 @@ def _generate_task_with_service(
|
|
4111
4249
|
type=str,
|
4112
4250
|
help='A service name. Unique for each service. If not provided, '
|
4113
4251
|
'a unique name is autogenerated.')
|
4114
|
-
@_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS)
|
4252
|
+
@_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS + _COMMON_OPTIONS)
|
4115
4253
|
@click.option('--yes',
|
4116
4254
|
'-y',
|
4117
4255
|
is_flag=True,
|
@@ -4140,6 +4278,7 @@ def serve_up(
|
|
4140
4278
|
disk_size: Optional[int],
|
4141
4279
|
disk_tier: Optional[str],
|
4142
4280
|
yes: bool,
|
4281
|
+
async_call: bool,
|
4143
4282
|
):
|
4144
4283
|
"""Launch a SkyServe service.
|
4145
4284
|
|
@@ -4200,16 +4339,9 @@ def serve_up(
|
|
4200
4339
|
fg='cyan')
|
4201
4340
|
with sky.Dag() as dag:
|
4202
4341
|
dag.add(task)
|
4203
|
-
dag, _ = admin_policy_utils.apply(
|
4204
|
-
dag, use_mutated_config_in_current_request=False)
|
4205
|
-
sky.optimize(dag)
|
4206
|
-
|
4207
|
-
if not yes:
|
4208
|
-
prompt = f'Launching a new service {service_name!r}. Proceed?'
|
4209
|
-
if prompt is not None:
|
4210
|
-
click.confirm(prompt, default=True, abort=True, show_default=True)
|
4211
4342
|
|
4212
|
-
serve_lib.up(task, service_name)
|
4343
|
+
request_id = serve_lib.up(task, service_name, _need_confirmation=not yes)
|
4344
|
+
_async_call_or_wait(request_id, async_call, 'sky.serve.up')
|
4213
4345
|
|
4214
4346
|
|
4215
4347
|
# TODO(MaoZiming): Update Doc.
|
@@ -4222,7 +4354,7 @@ def serve_up(
|
|
4222
4354
|
type=str,
|
4223
4355
|
nargs=-1,
|
4224
4356
|
**_get_shell_complete_args(_complete_file_name))
|
4225
|
-
@_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS)
|
4357
|
+
@_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS + _COMMON_OPTIONS)
|
4226
4358
|
@click.option('--mode',
|
4227
4359
|
default=serve_lib.DEFAULT_UPDATE_MODE.value,
|
4228
4360
|
type=click.Choice([m.value for m in serve_lib.UpdateMode],
|
@@ -4239,28 +4371,16 @@ def serve_up(
|
|
4239
4371
|
help='Skip confirmation prompt.')
|
4240
4372
|
@timeline.event
|
4241
4373
|
@usage_lib.entrypoint
|
4242
|
-
def serve_update(
|
4243
|
-
|
4244
|
-
|
4245
|
-
|
4246
|
-
|
4247
|
-
|
4248
|
-
|
4249
|
-
|
4250
|
-
|
4251
|
-
|
4252
|
-
env_file: Optional[Dict[str, str]],
|
4253
|
-
env: List[Tuple[str, str]],
|
4254
|
-
gpus: Optional[str],
|
4255
|
-
instance_type: Optional[str],
|
4256
|
-
ports: Tuple[str],
|
4257
|
-
cpus: Optional[str],
|
4258
|
-
memory: Optional[str],
|
4259
|
-
disk_size: Optional[int],
|
4260
|
-
disk_tier: Optional[str],
|
4261
|
-
mode: str,
|
4262
|
-
yes: bool,
|
4263
|
-
):
|
4374
|
+
def serve_update(service_name: str, service_yaml: Tuple[str, ...],
|
4375
|
+
workdir: Optional[str], cloud: Optional[str],
|
4376
|
+
region: Optional[str], zone: Optional[str],
|
4377
|
+
num_nodes: Optional[int], use_spot: Optional[bool],
|
4378
|
+
image_id: Optional[str], env_file: Optional[Dict[str, str]],
|
4379
|
+
env: List[Tuple[str, str]], gpus: Optional[str],
|
4380
|
+
instance_type: Optional[str], ports: Tuple[str],
|
4381
|
+
cpus: Optional[str], memory: Optional[str],
|
4382
|
+
disk_size: Optional[int], disk_tier: Optional[str], mode: str,
|
4383
|
+
yes: bool, async_call: bool):
|
4264
4384
|
"""Update a SkyServe service.
|
4265
4385
|
|
4266
4386
|
service_yaml must point to a valid YAML file.
|
@@ -4318,22 +4438,17 @@ def serve_update(
|
|
4318
4438
|
fg='cyan')
|
4319
4439
|
with sky.Dag() as dag:
|
4320
4440
|
dag.add(task)
|
4321
|
-
dag, _ = admin_policy_utils.apply(
|
4322
|
-
dag, use_mutated_config_in_current_request=False)
|
4323
|
-
sky.optimize(dag)
|
4324
4441
|
|
4325
|
-
|
4326
|
-
|
4327
|
-
|
4328
|
-
|
4329
|
-
|
4330
|
-
|
4331
|
-
serve_lib.update(task, service_name, mode=serve_lib.UpdateMode(mode))
|
4442
|
+
request_id = serve_lib.update(task,
|
4443
|
+
service_name,
|
4444
|
+
mode=serve_lib.UpdateMode(mode),
|
4445
|
+
_need_confirmation=not yes)
|
4446
|
+
_async_call_or_wait(request_id, async_call, 'sky.serve.update')
|
4332
4447
|
|
4333
4448
|
|
4334
4449
|
@serve.command('status', cls=_DocumentedCodeCommand)
|
4335
|
-
@click.option('--
|
4336
|
-
'-
|
4450
|
+
@click.option('--verbose',
|
4451
|
+
'-v',
|
4337
4452
|
default=False,
|
4338
4453
|
is_flag=True,
|
4339
4454
|
required=False,
|
@@ -4346,7 +4461,7 @@ def serve_update(
|
|
4346
4461
|
@click.argument('service_names', required=False, type=str, nargs=-1)
|
4347
4462
|
@usage_lib.entrypoint
|
4348
4463
|
# pylint: disable=redefined-builtin
|
4349
|
-
def serve_status(
|
4464
|
+
def serve_status(verbose: bool, endpoint: bool, service_names: List[str]):
|
4350
4465
|
"""Show statuses of SkyServe services.
|
4351
4466
|
|
4352
4467
|
Show detailed statuses of one or more services. If SERVICE_NAME is not
|
@@ -4433,17 +4548,22 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]):
|
|
4433
4548
|
sky serve status
|
4434
4549
|
\b
|
4435
4550
|
# Show detailed status for all services
|
4436
|
-
sky serve status -
|
4551
|
+
sky serve status -v
|
4437
4552
|
\b
|
4438
4553
|
# Only show status of my-service
|
4439
4554
|
sky serve status my-service
|
4440
4555
|
"""
|
4556
|
+
service_names_to_query: Optional[List[str]] = service_names
|
4557
|
+
if not service_names:
|
4558
|
+
service_names_to_query = None
|
4441
4559
|
# This won't pollute the output of --endpoint.
|
4442
|
-
with rich_utils.
|
4443
|
-
|
4444
|
-
|
4445
|
-
|
4446
|
-
|
4560
|
+
with rich_utils.client_status('[cyan]Checking services[/]'):
|
4561
|
+
service_status_request_id = serve_lib.status(service_names_to_query)
|
4562
|
+
_, msg = _handle_services_request(service_status_request_id,
|
4563
|
+
service_names=service_names_to_query,
|
4564
|
+
show_all=verbose,
|
4565
|
+
show_endpoint=endpoint,
|
4566
|
+
is_called_by_user=True)
|
4447
4567
|
|
4448
4568
|
if not endpoint:
|
4449
4569
|
click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
@@ -4473,10 +4593,17 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]):
|
|
4473
4593
|
default=None,
|
4474
4594
|
type=int,
|
4475
4595
|
help='Tear down a given replica')
|
4596
|
+
@_add_click_options(_COMMON_OPTIONS)
|
4476
4597
|
# pylint: disable=redefined-builtin
|
4477
|
-
def serve_down(
|
4478
|
-
|
4479
|
-
|
4598
|
+
def serve_down(
|
4599
|
+
service_names: List[str],
|
4600
|
+
all: bool,
|
4601
|
+
purge: bool,
|
4602
|
+
yes: bool,
|
4603
|
+
replica_id: Optional[int],
|
4604
|
+
async_call: bool,
|
4605
|
+
) -> None:
|
4606
|
+
"""Teardown service(s).
|
4480
4607
|
|
4481
4608
|
SERVICE_NAMES is the name of the service (or glob pattern) to tear down. If
|
4482
4609
|
both SERVICE_NAMES and ``--all`` are supplied, the latter takes precedence.
|
@@ -4527,12 +4654,6 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool,
|
|
4527
4654
|
if all:
|
4528
4655
|
raise click.UsageError('The --replica-id option cannot be used '
|
4529
4656
|
'with the --all option.')
|
4530
|
-
|
4531
|
-
backend_utils.is_controller_accessible(
|
4532
|
-
controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
|
4533
|
-
stopped_message='All services should have been terminated.',
|
4534
|
-
exit_if_not_accessible=True)
|
4535
|
-
|
4536
4657
|
if not yes:
|
4537
4658
|
if replica_id_is_defined:
|
4538
4659
|
click.confirm(
|
@@ -4543,8 +4664,8 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool,
|
|
4543
4664
|
show_default=True)
|
4544
4665
|
else:
|
4545
4666
|
quoted_service_names = [f'{name!r}' for name in service_names]
|
4546
|
-
|
4547
|
-
|
4667
|
+
list_service_str = ', '.join(quoted_service_names)
|
4668
|
+
service_identity_str = f'service(s) {list_service_str}'
|
4548
4669
|
if all:
|
4549
4670
|
service_identity_str = 'all services'
|
4550
4671
|
click.confirm(f'Terminating {service_identity_str}. Proceed?',
|
@@ -4553,9 +4674,13 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool,
|
|
4553
4674
|
show_default=True)
|
4554
4675
|
|
4555
4676
|
if replica_id_is_defined:
|
4556
|
-
serve_lib.terminate_replica(service_names[0], replica_id,
|
4677
|
+
request_id = serve_lib.terminate_replica(service_names[0], replica_id,
|
4678
|
+
purge)
|
4557
4679
|
else:
|
4558
|
-
serve_lib.down(service_names=service_names,
|
4680
|
+
request_id = serve_lib.down(service_names=service_names,
|
4681
|
+
all=all,
|
4682
|
+
purge=purge)
|
4683
|
+
_async_call_or_wait(request_id, async_call, 'sky.serve.down')
|
4559
4684
|
|
4560
4685
|
|
4561
4686
|
@serve.command('logs', cls=_DocumentedCodeCommand)
|
@@ -4682,7 +4807,7 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]:
|
|
4682
4807
|
required=True,
|
4683
4808
|
type=str,
|
4684
4809
|
help='Benchmark name.')
|
4685
|
-
@_add_click_options(_TASK_OPTIONS_WITH_NAME)
|
4810
|
+
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _COMMON_OPTIONS)
|
4686
4811
|
@click.option('--gpus',
|
4687
4812
|
required=False,
|
4688
4813
|
type=str,
|
@@ -4717,26 +4842,27 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]:
|
|
4717
4842
|
help='Skip confirmation prompt.')
|
4718
4843
|
@usage_lib.entrypoint
|
4719
4844
|
def benchmark_launch(
|
4720
|
-
|
4721
|
-
|
4722
|
-
|
4723
|
-
|
4724
|
-
|
4725
|
-
|
4726
|
-
|
4727
|
-
|
4728
|
-
|
4729
|
-
|
4730
|
-
|
4731
|
-
|
4732
|
-
|
4733
|
-
|
4734
|
-
|
4735
|
-
|
4736
|
-
|
4737
|
-
|
4738
|
-
|
4739
|
-
|
4845
|
+
entrypoint: str,
|
4846
|
+
benchmark: str,
|
4847
|
+
name: Optional[str],
|
4848
|
+
workdir: Optional[str],
|
4849
|
+
cloud: Optional[str],
|
4850
|
+
region: Optional[str],
|
4851
|
+
zone: Optional[str],
|
4852
|
+
gpus: Optional[str],
|
4853
|
+
num_nodes: Optional[int],
|
4854
|
+
use_spot: Optional[bool],
|
4855
|
+
image_id: Optional[str],
|
4856
|
+
env_file: Optional[Dict[str, str]],
|
4857
|
+
env: List[Tuple[str, str]],
|
4858
|
+
cpus: Optional[str],
|
4859
|
+
memory: Optional[str],
|
4860
|
+
disk_size: Optional[int],
|
4861
|
+
disk_tier: Optional[str],
|
4862
|
+
ports: Tuple[str],
|
4863
|
+
idle_minutes_to_autostop: Optional[int],
|
4864
|
+
yes: bool,
|
4865
|
+
async_call: bool, # pylint: disable=unused-argument
|
4740
4866
|
) -> None:
|
4741
4867
|
"""Benchmark a task on different resources.
|
4742
4868
|
|
@@ -4745,6 +4871,7 @@ def benchmark_launch(
|
|
4745
4871
|
Alternatively, specify the benchmarking resources in your YAML (see doc),
|
4746
4872
|
which allows benchmarking on many more resource fields.
|
4747
4873
|
"""
|
4874
|
+
# TODO(zhwu): move benchmark to SkyPilot API server
|
4748
4875
|
env = _merge_env_vars(env_file, env)
|
4749
4876
|
record = benchmark_state.get_benchmark_from_name(benchmark)
|
4750
4877
|
if record is not None:
|
@@ -5135,10 +5262,7 @@ def benchmark_down(
|
|
5135
5262
|
continue
|
5136
5263
|
to_stop.append(cluster)
|
5137
5264
|
|
5138
|
-
_down_or_stop_clusters(to_stop,
|
5139
|
-
apply_to_all=False,
|
5140
|
-
down=True,
|
5141
|
-
no_confirm=yes)
|
5265
|
+
_down_or_stop_clusters(to_stop, down=True, no_confirm=yes)
|
5142
5266
|
|
5143
5267
|
|
5144
5268
|
@bench.command('delete', cls=_DocumentedCodeCommand)
|
@@ -5192,9 +5316,9 @@ def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
|
|
5192
5316
|
progress = rich_progress.Progress(transient=True,
|
5193
5317
|
redirect_stdout=False,
|
5194
5318
|
redirect_stderr=False)
|
5195
|
-
task = progress.add_task(
|
5196
|
-
f'Deleting {len(to_delete)} benchmark{plural}'
|
5197
|
-
|
5319
|
+
task = progress.add_task(
|
5320
|
+
f'[bold cyan]Deleting {len(to_delete)} benchmark{plural}: ',
|
5321
|
+
total=len(to_delete))
|
5198
5322
|
|
5199
5323
|
def _delete_benchmark(benchmark: str) -> None:
|
5200
5324
|
clusters = benchmark_state.get_benchmark_clusters(benchmark)
|
@@ -5244,196 +5368,6 @@ def local():
|
|
5244
5368
|
pass
|
5245
5369
|
|
5246
5370
|
|
5247
|
-
def _deploy_local_cluster(gpus: bool):
|
5248
|
-
cluster_created = False
|
5249
|
-
|
5250
|
-
# Check if GPUs are available on the host
|
5251
|
-
local_gpus_available = backend_utils.check_local_gpus()
|
5252
|
-
gpus = gpus and local_gpus_available
|
5253
|
-
|
5254
|
-
# Check if ~/.kube/config exists:
|
5255
|
-
if os.path.exists(os.path.expanduser('~/.kube/config')):
|
5256
|
-
curr_context = kubernetes_utils.get_current_kube_config_context_name()
|
5257
|
-
skypilot_context = 'kind-skypilot'
|
5258
|
-
if curr_context is not None and curr_context != skypilot_context:
|
5259
|
-
click.echo(
|
5260
|
-
f'Current context in kube config: {curr_context}'
|
5261
|
-
'\nWill automatically switch to kind-skypilot after the local '
|
5262
|
-
'cluster is created.')
|
5263
|
-
message_str = 'Creating local cluster{}...'
|
5264
|
-
message_str = message_str.format((' with GPU support (this may take up '
|
5265
|
-
'to 15 minutes)') if gpus else '')
|
5266
|
-
path_to_package = os.path.dirname(os.path.dirname(__file__))
|
5267
|
-
up_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes',
|
5268
|
-
'create_cluster.sh')
|
5269
|
-
|
5270
|
-
# Get directory of script and run it from there
|
5271
|
-
cwd = os.path.dirname(os.path.abspath(up_script_path))
|
5272
|
-
run_command = up_script_path + f' {common_utils.get_user_hash()}'
|
5273
|
-
run_command = run_command + ' --gpus' if gpus else run_command
|
5274
|
-
run_command = shlex.split(run_command)
|
5275
|
-
|
5276
|
-
# Setup logging paths
|
5277
|
-
run_timestamp = sky_logging.get_run_timestamp()
|
5278
|
-
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
5279
|
-
'local_up.log')
|
5280
|
-
tail_cmd = 'tail -n100 -f ' + log_path
|
5281
|
-
|
5282
|
-
click.echo(message_str)
|
5283
|
-
style = colorama.Style
|
5284
|
-
click.echo('To view detailed progress: '
|
5285
|
-
f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
|
5286
|
-
|
5287
|
-
returncode, _, stderr = log_lib.run_with_log(
|
5288
|
-
cmd=run_command,
|
5289
|
-
log_path=log_path,
|
5290
|
-
require_outputs=True,
|
5291
|
-
stream_logs=False,
|
5292
|
-
line_processor=log_utils.SkyLocalUpLineProcessor(),
|
5293
|
-
cwd=cwd)
|
5294
|
-
|
5295
|
-
# Kind always writes to stderr even if it succeeds.
|
5296
|
-
# If the failure happens after the cluster is created, we need
|
5297
|
-
# to strip all stderr of "No kind clusters found.", which is
|
5298
|
-
# printed when querying with kind get clusters.
|
5299
|
-
stderr = stderr.replace('No kind clusters found.\n', '')
|
5300
|
-
|
5301
|
-
if returncode == 0:
|
5302
|
-
cluster_created = True
|
5303
|
-
elif returncode == 100:
|
5304
|
-
click.echo(f'{colorama.Fore.GREEN}Local cluster already '
|
5305
|
-
f'exists.{style.RESET_ALL}\n'
|
5306
|
-
'If you want to delete it instead, run: sky local down')
|
5307
|
-
else:
|
5308
|
-
with ux_utils.print_exception_no_traceback():
|
5309
|
-
raise RuntimeError(
|
5310
|
-
'Failed to create local cluster. '
|
5311
|
-
f'Full log: {log_path}'
|
5312
|
-
f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
|
5313
|
-
# Run sky check
|
5314
|
-
with rich_utils.safe_status(ux_utils.spinner_message('Running sky check')):
|
5315
|
-
sky_check.check(clouds=['kubernetes'], quiet=True)
|
5316
|
-
if cluster_created:
|
5317
|
-
# Prepare completion message which shows CPU and GPU count
|
5318
|
-
# Get number of CPUs
|
5319
|
-
p = subprocess_utils.run(
|
5320
|
-
'kubectl get nodes -o jsonpath=\'{.items[0].status.capacity.cpu}\'',
|
5321
|
-
capture_output=True)
|
5322
|
-
num_cpus = int(p.stdout.decode('utf-8'))
|
5323
|
-
|
5324
|
-
# GPU count/type parsing
|
5325
|
-
gpu_message = ''
|
5326
|
-
gpu_hint = ''
|
5327
|
-
if gpus:
|
5328
|
-
# Get GPU model by querying the node labels
|
5329
|
-
label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
|
5330
|
-
gpu_type_cmd = f'kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
|
5331
|
-
try:
|
5332
|
-
# Run the command and capture the output
|
5333
|
-
gpu_count_output = subprocess.check_output(gpu_type_cmd,
|
5334
|
-
shell=True,
|
5335
|
-
text=True)
|
5336
|
-
gpu_type_str = gpu_count_output.strip() + ' '
|
5337
|
-
except subprocess.CalledProcessError as e:
|
5338
|
-
output = str(e.output.decode('utf-8'))
|
5339
|
-
logger.warning(f'Failed to get GPU type: {output}')
|
5340
|
-
gpu_type_str = ''
|
5341
|
-
|
5342
|
-
# Get number of GPUs (sum of nvidia.com/gpu resources)
|
5343
|
-
gpu_count_command = 'kubectl get nodes -o=jsonpath=\'{range .items[*]}{.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}\' | awk \'{sum += $1} END {print sum}\'' # pylint: disable=line-too-long
|
5344
|
-
try:
|
5345
|
-
# Run the command and capture the output
|
5346
|
-
gpu_count_output = subprocess.check_output(gpu_count_command,
|
5347
|
-
shell=True,
|
5348
|
-
text=True)
|
5349
|
-
gpu_count = gpu_count_output.strip(
|
5350
|
-
) # Remove any extra whitespace
|
5351
|
-
gpu_message = f' and {gpu_count} {gpu_type_str}GPUs'
|
5352
|
-
except subprocess.CalledProcessError as e:
|
5353
|
-
output = str(e.output.decode('utf-8'))
|
5354
|
-
logger.warning(f'Failed to get GPU count: {output}')
|
5355
|
-
gpu_message = f' with {gpu_type_str}GPU support'
|
5356
|
-
|
5357
|
-
gpu_hint = (
|
5358
|
-
'\nHint: To see the list of GPUs in the cluster, '
|
5359
|
-
'run \'sky show-gpus --cloud kubernetes\'') if gpus else ''
|
5360
|
-
|
5361
|
-
if num_cpus < 2:
|
5362
|
-
click.echo('Warning: Local cluster has less than 2 CPUs. '
|
5363
|
-
'This may cause issues with running tasks.')
|
5364
|
-
click.echo(
|
5365
|
-
f'\n{colorama.Fore.GREEN}Local Kubernetes cluster created '
|
5366
|
-
'successfully with '
|
5367
|
-
f'{num_cpus} CPUs{gpu_message}.{style.RESET_ALL}\n`sky launch` can '
|
5368
|
-
'now run tasks locally.'
|
5369
|
-
'\nHint: To change the number of CPUs, change your docker '
|
5370
|
-
'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.' # pylint: disable=line-too-long
|
5371
|
-
f'{gpu_hint}')
|
5372
|
-
|
5373
|
-
|
5374
|
-
def _deploy_remote_cluster(ip_file: str, ssh_user: str, ssh_key_path: str,
|
5375
|
-
cleanup: bool):
|
5376
|
-
success = False
|
5377
|
-
path_to_package = os.path.dirname(os.path.dirname(__file__))
|
5378
|
-
up_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes',
|
5379
|
-
'deploy_remote_cluster.sh')
|
5380
|
-
# Get directory of script and run it from there
|
5381
|
-
cwd = os.path.dirname(os.path.abspath(up_script_path))
|
5382
|
-
|
5383
|
-
deploy_command = f'{up_script_path} {ip_file} {ssh_user} {ssh_key_path}'
|
5384
|
-
if cleanup:
|
5385
|
-
deploy_command += ' --cleanup'
|
5386
|
-
|
5387
|
-
# Convert the command to a format suitable for subprocess
|
5388
|
-
deploy_command = shlex.split(deploy_command)
|
5389
|
-
|
5390
|
-
# Setup logging paths
|
5391
|
-
run_timestamp = sky_logging.get_run_timestamp()
|
5392
|
-
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
5393
|
-
'local_up.log')
|
5394
|
-
tail_cmd = 'tail -n100 -f ' + log_path
|
5395
|
-
|
5396
|
-
# Check if ~/.kube/config exists:
|
5397
|
-
if os.path.exists(os.path.expanduser('~/.kube/config')):
|
5398
|
-
click.echo('Found existing kube config. '
|
5399
|
-
'It will be backed up to ~/.kube/config.bak.')
|
5400
|
-
style = colorama.Style
|
5401
|
-
click.echo('To view detailed progress: '
|
5402
|
-
f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
|
5403
|
-
if cleanup:
|
5404
|
-
msg_str = 'Cleaning up remote cluster...'
|
5405
|
-
else:
|
5406
|
-
msg_str = 'Deploying remote cluster...'
|
5407
|
-
with rich_utils.safe_status(f'[bold cyan]{msg_str}'):
|
5408
|
-
returncode, _, stderr = log_lib.run_with_log(
|
5409
|
-
cmd=deploy_command,
|
5410
|
-
log_path=log_path,
|
5411
|
-
require_outputs=True,
|
5412
|
-
stream_logs=False,
|
5413
|
-
line_processor=log_utils.SkyRemoteUpLineProcessor(),
|
5414
|
-
cwd=cwd)
|
5415
|
-
if returncode == 0:
|
5416
|
-
success = True
|
5417
|
-
else:
|
5418
|
-
with ux_utils.print_exception_no_traceback():
|
5419
|
-
raise RuntimeError(
|
5420
|
-
'Failed to deploy remote cluster. '
|
5421
|
-
f'Full log: {log_path}'
|
5422
|
-
f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
|
5423
|
-
|
5424
|
-
if success:
|
5425
|
-
if cleanup:
|
5426
|
-
click.echo(f'{colorama.Fore.GREEN}'
|
5427
|
-
'🎉 Remote cluster cleaned up successfully.'
|
5428
|
-
f'{style.RESET_ALL}')
|
5429
|
-
else:
|
5430
|
-
click.echo('Cluster deployment done. You can now run tasks on '
|
5431
|
-
'this cluster.\nE.g., run a task with: '
|
5432
|
-
'sky launch --cloud kubernetes -- echo hello world.'
|
5433
|
-
f'\n{colorama.Fore.GREEN}🎉 Remote cluster deployed '
|
5434
|
-
f'successfully. {style.RESET_ALL}')
|
5435
|
-
|
5436
|
-
|
5437
5371
|
@click.option('--gpus/--no-gpus',
|
5438
5372
|
default=True,
|
5439
5373
|
is_flag=True,
|
@@ -5456,9 +5390,10 @@ def _deploy_remote_cluster(ip_file: str, ssh_user: str, ssh_key_path: str,
|
|
5456
5390
|
is_flag=True,
|
5457
5391
|
help='Clean up the remote cluster instead of deploying it.')
|
5458
5392
|
@local.command('up', cls=_DocumentedCodeCommand)
|
5393
|
+
@_add_click_options(_COMMON_OPTIONS)
|
5459
5394
|
@usage_lib.entrypoint
|
5460
5395
|
def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
|
5461
|
-
cleanup: bool):
|
5396
|
+
cleanup: bool, async_call: bool):
|
5462
5397
|
"""Creates a local or remote cluster."""
|
5463
5398
|
|
5464
5399
|
def _validate_args(ips, ssh_user, ssh_key_path, cleanup):
|
@@ -5479,64 +5414,226 @@ def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
|
|
5479
5414
|
_validate_args(ips, ssh_user, ssh_key_path, cleanup)
|
5480
5415
|
|
5481
5416
|
# If remote deployment arguments are specified, run remote up script
|
5417
|
+
ip_list = None
|
5418
|
+
ssh_key = None
|
5482
5419
|
if ips and ssh_user and ssh_key_path:
|
5483
|
-
#
|
5484
|
-
|
5485
|
-
|
5486
|
-
|
5487
|
-
|
5488
|
-
|
5489
|
-
|
5420
|
+
# Read and validate IP file
|
5421
|
+
try:
|
5422
|
+
with open(os.path.expanduser(ips), 'r', encoding='utf-8') as f:
|
5423
|
+
ip_list = f.read().strip().splitlines()
|
5424
|
+
if not ip_list:
|
5425
|
+
raise click.BadParameter(f'IP file is empty: {ips}')
|
5426
|
+
except (IOError, OSError) as e:
|
5427
|
+
raise click.BadParameter(f'Failed to read IP file {ips}: {str(e)}')
|
5428
|
+
|
5429
|
+
# Read and validate SSH key file
|
5430
|
+
try:
|
5431
|
+
with open(os.path.expanduser(ssh_key_path), 'r',
|
5432
|
+
encoding='utf-8') as f:
|
5433
|
+
ssh_key = f.read()
|
5434
|
+
if not ssh_key:
|
5435
|
+
raise click.BadParameter(
|
5436
|
+
f'SSH key file is empty: {ssh_key_path}')
|
5437
|
+
except (IOError, OSError) as e:
|
5438
|
+
raise click.BadParameter(
|
5439
|
+
f'Failed to read SSH key file {ssh_key_path}: {str(e)}')
|
5440
|
+
|
5441
|
+
request_id = sdk.local_up(gpus, ip_list, ssh_user, ssh_key, cleanup)
|
5442
|
+
_async_call_or_wait(request_id, async_call, request_name='local up')
|
5490
5443
|
|
5491
5444
|
|
5492
5445
|
@local.command('down', cls=_DocumentedCodeCommand)
|
5446
|
+
@_add_click_options(_COMMON_OPTIONS)
|
5493
5447
|
@usage_lib.entrypoint
|
5494
|
-
def local_down():
|
5448
|
+
def local_down(async_call: bool):
|
5495
5449
|
"""Deletes a local cluster."""
|
5496
|
-
|
5450
|
+
request_id = sdk.local_down()
|
5451
|
+
_async_call_or_wait(request_id, async_call, request_name='sky.local.down')
|
5497
5452
|
|
5498
|
-
path_to_package = os.path.dirname(os.path.dirname(__file__))
|
5499
|
-
down_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes',
|
5500
|
-
'delete_cluster.sh')
|
5501
5453
|
|
5502
|
-
|
5503
|
-
|
5454
|
+
@cli.group(cls=_NaturalOrderGroup)
|
5455
|
+
def api():
|
5456
|
+
"""SkyPilot API server commands."""
|
5457
|
+
pass
|
5504
5458
|
|
5505
|
-
# Setup logging paths
|
5506
|
-
run_timestamp = sky_logging.get_run_timestamp()
|
5507
|
-
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
5508
|
-
'local_down.log')
|
5509
|
-
tail_cmd = 'tail -n100 -f ' + log_path
|
5510
5459
|
|
5511
|
-
|
5512
|
-
|
5513
|
-
|
5514
|
-
|
5515
|
-
|
5516
|
-
|
5517
|
-
|
5518
|
-
|
5519
|
-
|
5520
|
-
|
5521
|
-
|
5522
|
-
|
5523
|
-
|
5524
|
-
|
5525
|
-
|
5526
|
-
|
5527
|
-
|
5528
|
-
|
5529
|
-
|
5530
|
-
|
5531
|
-
|
5532
|
-
|
5533
|
-
|
5534
|
-
|
5535
|
-
|
5536
|
-
|
5537
|
-
|
5538
|
-
|
5539
|
-
|
5460
|
+
@api.command('start', cls=_DocumentedCodeCommand)
|
5461
|
+
@click.option('--deploy',
|
5462
|
+
type=bool,
|
5463
|
+
is_flag=True,
|
5464
|
+
default=False,
|
5465
|
+
required=False,
|
5466
|
+
help=('Deploy the SkyPilot API server. When set to True, '
|
5467
|
+
'SkyPilot API server will use all resources on the host '
|
5468
|
+
'machine assuming the machine is dedicated to SkyPilot API '
|
5469
|
+
'server; host will also be set to 0.0.0.0 to allow remote '
|
5470
|
+
'access.'))
|
5471
|
+
@click.option('--host',
|
5472
|
+
default='127.0.0.1',
|
5473
|
+
type=click.Choice(server_common.AVAILBLE_LOCAL_API_SERVER_HOSTS),
|
5474
|
+
required=False,
|
5475
|
+
help=('The host to deploy the SkyPilot API server. To allow '
|
5476
|
+
'remote access, set this to 0.0.0.0'))
|
5477
|
+
@usage_lib.entrypoint
|
5478
|
+
def api_start(deploy: bool, host: Optional[str]):
|
5479
|
+
"""Starts the SkyPilot API server locally."""
|
5480
|
+
sdk.api_start(deploy=deploy, host=host)
|
5481
|
+
|
5482
|
+
|
5483
|
+
@api.command('stop', cls=_DocumentedCodeCommand)
|
5484
|
+
@usage_lib.entrypoint
|
5485
|
+
def api_stop():
|
5486
|
+
"""Stops the SkyPilot API server locally."""
|
5487
|
+
sdk.api_stop()
|
5488
|
+
|
5489
|
+
|
5490
|
+
@api.command('logs', cls=_DocumentedCodeCommand)
|
5491
|
+
@click.argument('request_id', required=False, type=str)
|
5492
|
+
@click.option('--server-logs',
|
5493
|
+
is_flag=True,
|
5494
|
+
default=False,
|
5495
|
+
required=False,
|
5496
|
+
help='Stream the server logs.')
|
5497
|
+
@click.option('--log-path',
|
5498
|
+
'-l',
|
5499
|
+
required=False,
|
5500
|
+
type=str,
|
5501
|
+
help='The path to the log file to stream.')
|
5502
|
+
@click.option('--tail',
|
5503
|
+
required=False,
|
5504
|
+
type=int,
|
5505
|
+
help=('Number of lines to show from the end of the logs. '
|
5506
|
+
'(default: None)'))
|
5507
|
+
@click.option('--follow/--no-follow',
|
5508
|
+
is_flag=True,
|
5509
|
+
default=True,
|
5510
|
+
required=False,
|
5511
|
+
help='Follow the logs.')
|
5512
|
+
@usage_lib.entrypoint
|
5513
|
+
def api_logs(request_id: Optional[str], server_logs: bool,
|
5514
|
+
log_path: Optional[str], tail: Optional[int], follow: bool):
|
5515
|
+
"""Stream the logs of a request running on SkyPilot API server."""
|
5516
|
+
if not server_logs and request_id is None and log_path is None:
|
5517
|
+
# TODO(zhwu): get the latest request ID.
|
5518
|
+
raise click.BadParameter('Please provide the request ID or log path.')
|
5519
|
+
if server_logs:
|
5520
|
+
sdk.api_server_logs(follow=follow, tail=tail)
|
5521
|
+
return
|
5522
|
+
|
5523
|
+
if request_id is not None and log_path is not None:
|
5524
|
+
raise click.BadParameter(
|
5525
|
+
'Only one of request ID and log path can be provided.')
|
5526
|
+
sdk.stream_and_get(request_id, log_path, tail)
|
5527
|
+
|
5528
|
+
|
5529
|
+
@api.command('cancel', cls=_DocumentedCodeCommand)
|
5530
|
+
@click.argument('request_ids', required=False, type=str, nargs=-1)
|
5531
|
+
@click.option('--all',
|
5532
|
+
'-a',
|
5533
|
+
is_flag=True,
|
5534
|
+
default=False,
|
5535
|
+
required=False,
|
5536
|
+
help='Cancel all your requests.')
|
5537
|
+
@click.option('--all-users',
|
5538
|
+
'-u',
|
5539
|
+
is_flag=True,
|
5540
|
+
default=False,
|
5541
|
+
required=False,
|
5542
|
+
help='Cancel all requests from all users.')
|
5543
|
+
@usage_lib.entrypoint
|
5544
|
+
# pylint: disable=redefined-builtin
|
5545
|
+
def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
|
5546
|
+
"""Cancel a request running on SkyPilot API server."""
|
5547
|
+
if all or all_users:
|
5548
|
+
keyword = 'ALL USERS\'' if all_users else 'YOUR'
|
5549
|
+
user_input = click.prompt(
|
5550
|
+
f'This will cancel all {keyword} requests.\n'
|
5551
|
+
f'To proceed, please type {colorama.Style.BRIGHT}'
|
5552
|
+
f'\'cancel all requests\'{colorama.Style.RESET_ALL}',
|
5553
|
+
type=str)
|
5554
|
+
if user_input != 'cancel all requests':
|
5555
|
+
raise click.Abort()
|
5556
|
+
if all:
|
5557
|
+
request_ids = None
|
5558
|
+
cancelled_request_ids = sdk.get(
|
5559
|
+
sdk.api_cancel(request_ids=request_ids, all_users=all_users))
|
5560
|
+
if not cancelled_request_ids:
|
5561
|
+
click.secho('No requests need to be cancelled.', fg='green')
|
5562
|
+
elif len(cancelled_request_ids) == 1:
|
5563
|
+
click.secho(f'Cancelled 1 request: {cancelled_request_ids[0]}',
|
5564
|
+
fg='green')
|
5565
|
+
else:
|
5566
|
+
click.secho(f'Cancelled {len(cancelled_request_ids)} requests.',
|
5567
|
+
fg='green')
|
5568
|
+
|
5569
|
+
|
5570
|
+
@api.command('status', cls=_DocumentedCodeCommand)
|
5571
|
+
@click.argument('request_ids', required=False, type=str, nargs=-1)
|
5572
|
+
@click.option('--all-status',
|
5573
|
+
'-a',
|
5574
|
+
is_flag=True,
|
5575
|
+
default=False,
|
5576
|
+
required=False,
|
5577
|
+
help='Show requests of all statuses.')
|
5578
|
+
@click.option('--verbose',
|
5579
|
+
'-v',
|
5580
|
+
is_flag=True,
|
5581
|
+
default=False,
|
5582
|
+
required=False,
|
5583
|
+
help='Show more details.')
|
5584
|
+
@usage_lib.entrypoint
|
5585
|
+
# pylint: disable=redefined-builtin
|
5586
|
+
def api_status(request_ids: Optional[List[str]], all_status: bool,
|
5587
|
+
verbose: bool):
|
5588
|
+
"""List requests on SkyPilot API server."""
|
5589
|
+
if not request_ids:
|
5590
|
+
request_ids = None
|
5591
|
+
request_list = sdk.api_status(request_ids, all_status)
|
5592
|
+
columns = ['ID', 'User', 'Name']
|
5593
|
+
if verbose:
|
5594
|
+
columns.append('Cluster')
|
5595
|
+
columns.extend(['Created', 'Status'])
|
5596
|
+
table = log_utils.create_table(columns)
|
5597
|
+
for request in request_list:
|
5598
|
+
r_id = request.request_id
|
5599
|
+
if not verbose:
|
5600
|
+
r_id = common_utils.truncate_long_string(r_id, 36)
|
5601
|
+
req_status = requests.RequestStatus(request.status)
|
5602
|
+
row = [r_id, request.user_name, request.name]
|
5603
|
+
if verbose:
|
5604
|
+
row.append(request.cluster_name)
|
5605
|
+
row.extend([
|
5606
|
+
log_utils.readable_time_duration(request.created_at),
|
5607
|
+
req_status.colored_str()
|
5608
|
+
])
|
5609
|
+
table.add_row(row)
|
5610
|
+
click.echo(table)
|
5611
|
+
|
5612
|
+
|
5613
|
+
@api.command('login', cls=_DocumentedCodeCommand)
|
5614
|
+
@click.option('--endpoint',
|
5615
|
+
'-e',
|
5616
|
+
required=False,
|
5617
|
+
help='The SkyPilot API server endpoint.')
|
5618
|
+
@usage_lib.entrypoint
|
5619
|
+
def api_login(endpoint: Optional[str]):
|
5620
|
+
"""Logs into a SkyPilot API server."""
|
5621
|
+
sdk.api_login(endpoint)
|
5622
|
+
|
5623
|
+
|
5624
|
+
@api.command('info', cls=_DocumentedCodeCommand)
|
5625
|
+
@usage_lib.entrypoint
|
5626
|
+
def api_info():
|
5627
|
+
"""Shows the SkyPilot API server URL."""
|
5628
|
+
url = server_common.get_server_url()
|
5629
|
+
api_server_info = sdk.api_info()
|
5630
|
+
user_name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
|
5631
|
+
user_hash = common_utils.get_user_hash()
|
5632
|
+
click.echo(f'Using SkyPilot API server: {url}\n'
|
5633
|
+
f'{ux_utils.INDENT_SYMBOL}Status: {api_server_info["status"]}, '
|
5634
|
+
f'commit: {api_server_info["commit"]}, '
|
5635
|
+
f'version: {api_server_info["version"]}\n'
|
5636
|
+
f'{ux_utils.INDENT_LAST_SYMBOL}User: {user_name} ({user_hash})')
|
5540
5637
|
|
5541
5638
|
|
5542
5639
|
def main():
|