skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/cli.py
CHANGED
@@ -26,59 +26,66 @@ each other.
|
|
26
26
|
import copy
|
27
27
|
import datetime
|
28
28
|
import functools
|
29
|
-
import
|
29
|
+
import getpass
|
30
30
|
import os
|
31
31
|
import shlex
|
32
|
-
import
|
32
|
+
import shutil
|
33
33
|
import subprocess
|
34
34
|
import sys
|
35
35
|
import textwrap
|
36
|
-
import
|
36
|
+
import traceback
|
37
37
|
import typing
|
38
|
-
from typing import Any, Dict, List, Optional, Tuple, Union
|
39
|
-
import webbrowser
|
38
|
+
from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
|
40
39
|
|
41
40
|
import click
|
42
41
|
import colorama
|
43
42
|
import dotenv
|
43
|
+
import requests as requests_lib
|
44
44
|
from rich import progress as rich_progress
|
45
45
|
import yaml
|
46
46
|
|
47
47
|
import sky
|
48
48
|
from sky import backends
|
49
|
-
from sky import
|
50
|
-
from sky import clouds as sky_clouds
|
51
|
-
from sky import core
|
49
|
+
from sky import clouds
|
52
50
|
from sky import exceptions
|
53
51
|
from sky import global_user_state
|
54
52
|
from sky import jobs as managed_jobs
|
53
|
+
from sky import models
|
55
54
|
from sky import serve as serve_lib
|
56
55
|
from sky import sky_logging
|
57
|
-
from sky import status_lib
|
58
56
|
from sky.adaptors import common as adaptors_common
|
59
|
-
from sky.backends import backend_utils
|
60
57
|
from sky.benchmark import benchmark_state
|
61
58
|
from sky.benchmark import benchmark_utils
|
59
|
+
from sky.client import sdk
|
62
60
|
from sky.clouds import service_catalog
|
63
61
|
from sky.data import storage_utils
|
62
|
+
from sky.provision.kubernetes import constants as kubernetes_constants
|
64
63
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
64
|
+
from sky.server import common as server_common
|
65
|
+
from sky.server import constants as server_constants
|
66
|
+
from sky.server.requests import requests
|
65
67
|
from sky.skylet import constants
|
66
68
|
from sky.skylet import job_lib
|
67
|
-
from sky.skylet import log_lib
|
68
69
|
from sky.usage import usage_lib
|
70
|
+
from sky.utils import annotations
|
71
|
+
from sky.utils import cluster_utils
|
72
|
+
from sky.utils import common
|
69
73
|
from sky.utils import common_utils
|
70
74
|
from sky.utils import controller_utils
|
71
75
|
from sky.utils import dag_utils
|
76
|
+
from sky.utils import env_options
|
72
77
|
from sky.utils import log_utils
|
78
|
+
from sky.utils import registry
|
73
79
|
from sky.utils import resources_utils
|
74
80
|
from sky.utils import rich_utils
|
81
|
+
from sky.utils import status_lib
|
75
82
|
from sky.utils import subprocess_utils
|
76
83
|
from sky.utils import timeline
|
77
84
|
from sky.utils import ux_utils
|
78
85
|
from sky.utils.cli_utils import status_utils
|
79
86
|
|
80
87
|
if typing.TYPE_CHECKING:
|
81
|
-
|
88
|
+
import types
|
82
89
|
|
83
90
|
pd = adaptors_common.LazyImport('pandas')
|
84
91
|
logger = sky_logging.init_logger(__name__)
|
@@ -98,23 +105,96 @@ _STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE = (
|
|
98
105
|
'{cluster_num} cluster{plural} {verb}. Please specify {cause} '
|
99
106
|
'cluster to show its {property}.\nUsage: `sky status --{flag} <cluster>`')
|
100
107
|
|
101
|
-
_ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, '
|
102
|
-
'please retry after a while.')
|
103
|
-
|
104
108
|
_DAG_NOT_SUPPORTED_MESSAGE = ('YAML specifies a DAG which is only supported by '
|
105
109
|
'`sky jobs launch`. `{command}` supports a '
|
106
110
|
'single task only.')
|
107
111
|
|
108
112
|
|
109
|
-
def
|
110
|
-
|
111
|
-
|
112
|
-
|
113
|
-
|
114
|
-
|
115
|
-
|
116
|
-
|
117
|
-
|
113
|
+
def _get_cluster_records_and_set_ssh_config(
|
114
|
+
clusters: Optional[List[str]],
|
115
|
+
refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
|
116
|
+
all_users: bool = False,
|
117
|
+
) -> List[dict]:
|
118
|
+
"""Returns a list of clusters that match the glob pattern.
|
119
|
+
|
120
|
+
Args:
|
121
|
+
clusters: A list of cluster names to query. If None, query all clusters.
|
122
|
+
refresh: The refresh mode for the status command.
|
123
|
+
all_users: Whether to query clusters from all users.
|
124
|
+
If clusters is not None, this field is ignored because cluster list
|
125
|
+
can include other users' clusters.
|
126
|
+
"""
|
127
|
+
# TODO(zhwu): we should move this function into SDK.
|
128
|
+
# TODO(zhwu): this additional RTT makes CLIs slow. We should optimize this.
|
129
|
+
if clusters is not None:
|
130
|
+
all_users = True
|
131
|
+
request_id = sdk.status(clusters, refresh=refresh, all_users=all_users)
|
132
|
+
cluster_records = sdk.stream_and_get(request_id)
|
133
|
+
# Update the SSH config for all clusters
|
134
|
+
for record in cluster_records:
|
135
|
+
handle = record['handle']
|
136
|
+
# During the failover, even though a cluster does not exist, the handle
|
137
|
+
# can still exist in the record, and we check for credentials to avoid
|
138
|
+
# updating the SSH config for non-existent clusters.
|
139
|
+
if (handle is not None and handle.cached_external_ips is not None and
|
140
|
+
'credentials' in record):
|
141
|
+
credentials = record['credentials']
|
142
|
+
if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
|
143
|
+
# Replace the proxy command to proxy through the SkyPilot API
|
144
|
+
# server with websocket.
|
145
|
+
key_path = (
|
146
|
+
cluster_utils.SSHConfigHelper.generate_local_key_file(
|
147
|
+
handle.cluster_name, credentials))
|
148
|
+
# Instead of directly use websocket_proxy.py, we add an
|
149
|
+
# additional proxy, so that ssh can use the head pod in the
|
150
|
+
# cluster to jump to worker pods.
|
151
|
+
proxy_command = (
|
152
|
+
f'ssh -tt -i {key_path} '
|
153
|
+
'-o StrictHostKeyChecking=no '
|
154
|
+
'-o UserKnownHostsFile=/dev/null '
|
155
|
+
'-o IdentitiesOnly=yes '
|
156
|
+
'-W %h:%p '
|
157
|
+
f'{handle.ssh_user}@127.0.0.1 '
|
158
|
+
'-o ProxyCommand='
|
159
|
+
# TODO(zhwu): write the template to a temp file, don't use
|
160
|
+
# the one in skypilot repo, to avoid changing the file when
|
161
|
+
# updating skypilot.
|
162
|
+
f'\'{sys.executable} {sky.__root_dir__}/templates/'
|
163
|
+
f'websocket_proxy.py '
|
164
|
+
f'{server_common.get_server_url().split("://")[1]} '
|
165
|
+
f'{handle.cluster_name}\'')
|
166
|
+
credentials['ssh_proxy_command'] = proxy_command
|
167
|
+
cluster_utils.SSHConfigHelper.add_cluster(
|
168
|
+
handle.cluster_name,
|
169
|
+
handle.cached_external_ips,
|
170
|
+
credentials,
|
171
|
+
handle.cached_external_ssh_ports,
|
172
|
+
handle.docker_user,
|
173
|
+
handle.ssh_user,
|
174
|
+
)
|
175
|
+
else:
|
176
|
+
# If the cluster is not UP or does not have credentials available,
|
177
|
+
# we need to remove the cluster from the SSH config.
|
178
|
+
cluster_utils.SSHConfigHelper.remove_cluster(record['name'])
|
179
|
+
|
180
|
+
# Clean up SSH configs for clusters that do not exist.
|
181
|
+
#
|
182
|
+
# We do this in a conservative way: only when a query is made for all users
|
183
|
+
# or specific clusters. Without those, the table returned only contains the
|
184
|
+
# current user's clusters, and the information is not enough for
|
185
|
+
# removing clusters, because SkyPilot has no idea whether to remove
|
186
|
+
# ssh config of a cluster from another user.
|
187
|
+
clusters_exists = set(record['name'] for record in cluster_records)
|
188
|
+
if clusters is not None:
|
189
|
+
for cluster in clusters:
|
190
|
+
if cluster not in clusters_exists:
|
191
|
+
cluster_utils.SSHConfigHelper.remove_cluster(cluster)
|
192
|
+
elif all_users:
|
193
|
+
for cluster_name in cluster_utils.SSHConfigHelper.list_cluster_names():
|
194
|
+
if cluster_name not in clusters_exists:
|
195
|
+
cluster_utils.SSHConfigHelper.remove_cluster(cluster_name)
|
196
|
+
|
197
|
+
return cluster_records
|
118
198
|
|
119
199
|
|
120
200
|
def _get_glob_storages(storages: List[str]) -> List[str]:
|
@@ -122,7 +202,7 @@ def _get_glob_storages(storages: List[str]) -> List[str]:
|
|
122
202
|
glob_storages = []
|
123
203
|
for storage_object in storages:
|
124
204
|
glob_storage = global_user_state.get_glob_storage_name(storage_object)
|
125
|
-
if
|
205
|
+
if not glob_storage:
|
126
206
|
click.echo(f'Storage {storage_object} not found.')
|
127
207
|
glob_storages.extend(glob_storage)
|
128
208
|
return list(set(glob_storages))
|
@@ -144,6 +224,44 @@ def _parse_env_var(env_var: str) -> Tuple[str, str]:
|
|
144
224
|
return ret[0], ret[1]
|
145
225
|
|
146
226
|
|
227
|
+
def _async_call_or_wait(request_id: str, async_call: bool,
|
228
|
+
request_name: str) -> Any:
|
229
|
+
short_request_id = request_id[:8]
|
230
|
+
if not async_call:
|
231
|
+
try:
|
232
|
+
return sdk.stream_and_get(request_id)
|
233
|
+
except KeyboardInterrupt:
|
234
|
+
logger.info(
|
235
|
+
ux_utils.starting_message('Request will continue running '
|
236
|
+
'asynchronously.') +
|
237
|
+
f'\n{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}View logs: '
|
238
|
+
f'{ux_utils.BOLD}sky api logs {short_request_id}'
|
239
|
+
f'{colorama.Style.RESET_ALL}'
|
240
|
+
f'\n{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}Or, '
|
241
|
+
'visit: '
|
242
|
+
f'{server_common.get_server_url()}/api/stream?'
|
243
|
+
f'request_id={short_request_id}'
|
244
|
+
f'\n{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}To cancel '
|
245
|
+
'the request, run: '
|
246
|
+
f'{ux_utils.BOLD}sky api cancel {short_request_id}'
|
247
|
+
f'{colorama.Style.RESET_ALL}'
|
248
|
+
f'\n{colorama.Style.RESET_ALL}')
|
249
|
+
raise
|
250
|
+
else:
|
251
|
+
click.secho(f'Submitted {request_name} request: {request_id}',
|
252
|
+
fg='green')
|
253
|
+
click.echo(
|
254
|
+
f'{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}Check logs with: '
|
255
|
+
f'sky api logs {short_request_id}{colorama.Style.RESET_ALL}\n'
|
256
|
+
f'{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}Or, visit: '
|
257
|
+
f'{server_common.get_server_url()}/api/stream?'
|
258
|
+
f'request_id={short_request_id}'
|
259
|
+
f'\n{ux_utils.INDENT_LAST_SYMBOL}{colorama.Style.DIM}To cancel '
|
260
|
+
'the request, run: '
|
261
|
+
f'{ux_utils.BOLD}sky api cancel {short_request_id}'
|
262
|
+
f'{colorama.Style.RESET_ALL}\n')
|
263
|
+
|
264
|
+
|
147
265
|
def _merge_env_vars(env_dict: Optional[Dict[str, str]],
|
148
266
|
env_list: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
|
149
267
|
"""Merges all values from env_list into env_dict."""
|
@@ -154,6 +272,15 @@ def _merge_env_vars(env_dict: Optional[Dict[str, str]],
|
|
154
272
|
return list(env_dict.items())
|
155
273
|
|
156
274
|
|
275
|
+
_COMMON_OPTIONS = [
|
276
|
+
click.option('--async/--no-async',
|
277
|
+
'async_call',
|
278
|
+
required=False,
|
279
|
+
is_flag=True,
|
280
|
+
default=False,
|
281
|
+
help=('Run the command asynchronously.'))
|
282
|
+
]
|
283
|
+
|
157
284
|
_TASK_OPTIONS = [
|
158
285
|
click.option(
|
159
286
|
'--workdir',
|
@@ -305,14 +432,28 @@ def _complete_cluster_name(ctx: click.Context, param: click.Parameter,
|
|
305
432
|
incomplete: str) -> List[str]:
|
306
433
|
"""Handle shell completion for cluster names."""
|
307
434
|
del ctx, param # Unused.
|
308
|
-
|
435
|
+
# TODO(zhwu): we send requests to API server for completion, which can cause
|
436
|
+
# large latency. We should investigate caching mechanism if needed.
|
437
|
+
response = requests_lib.get(
|
438
|
+
f'{server_common.get_server_url()}'
|
439
|
+
f'/api/completion/cluster_name?incomplete={incomplete}',
|
440
|
+
timeout=2.0,
|
441
|
+
)
|
442
|
+
response.raise_for_status()
|
443
|
+
return response.json()
|
309
444
|
|
310
445
|
|
311
446
|
def _complete_storage_name(ctx: click.Context, param: click.Parameter,
|
312
447
|
incomplete: str) -> List[str]:
|
313
448
|
"""Handle shell completion for storage names."""
|
314
449
|
del ctx, param # Unused.
|
315
|
-
|
450
|
+
response = requests_lib.get(
|
451
|
+
f'{server_common.get_server_url()}'
|
452
|
+
f'/api/completion/storage_name?incomplete={incomplete}',
|
453
|
+
timeout=2.0,
|
454
|
+
)
|
455
|
+
response.raise_for_status()
|
456
|
+
return response.json()
|
316
457
|
|
317
458
|
|
318
459
|
def _complete_file_name(ctx: click.Context, param: click.Parameter,
|
@@ -338,7 +479,6 @@ def _get_shell_complete_args(complete_fn):
|
|
338
479
|
|
339
480
|
|
340
481
|
_RELOAD_ZSH_CMD = 'source ~/.zshrc'
|
341
|
-
_RELOAD_FISH_CMD = 'source ~/.config/fish/config.fish'
|
342
482
|
_RELOAD_BASH_CMD = 'source ~/.bashrc'
|
343
483
|
|
344
484
|
|
@@ -368,14 +508,18 @@ def _install_shell_completion(ctx: click.Context, param: click.Parameter,
|
|
368
508
|
echo "{bashrc_diff}" >> ~/.bashrc'
|
369
509
|
|
370
510
|
cmd = (f'(grep -q "SkyPilot" ~/.bashrc) || '
|
371
|
-
f'[[ ${{BASH_VERSINFO[0]}} -ge 4 ]] && ({install_cmd})'
|
511
|
+
f'([[ ${{BASH_VERSINFO[0]}} -ge 4 ]] && ({install_cmd}) || '
|
512
|
+
f'(echo "Bash must be version 4 or above." && exit 1))')
|
513
|
+
|
372
514
|
reload_cmd = _RELOAD_BASH_CMD
|
373
515
|
|
374
516
|
elif value == 'fish':
|
375
517
|
cmd = '_SKY_COMPLETE=fish_source sky > \
|
376
518
|
~/.config/fish/completions/sky.fish'
|
377
519
|
|
378
|
-
|
520
|
+
# Fish does not need to be reloaded and will automatically pick up
|
521
|
+
# completions.
|
522
|
+
reload_cmd = None
|
379
523
|
|
380
524
|
elif value == 'zsh':
|
381
525
|
install_cmd = f'_SKY_COMPLETE=zsh_source sky > \
|
@@ -390,11 +534,15 @@ def _install_shell_completion(ctx: click.Context, param: click.Parameter,
|
|
390
534
|
ctx.exit()
|
391
535
|
|
392
536
|
try:
|
393
|
-
subprocess.run(cmd,
|
537
|
+
subprocess.run(cmd,
|
538
|
+
shell=True,
|
539
|
+
check=True,
|
540
|
+
executable=shutil.which('bash'))
|
394
541
|
click.secho(f'Shell completion installed for {value}', fg='green')
|
395
|
-
|
396
|
-
|
397
|
-
|
542
|
+
if reload_cmd is not None:
|
543
|
+
click.echo(
|
544
|
+
'Completion will take effect once you restart the terminal: ' +
|
545
|
+
click.style(f'{reload_cmd}', bold=True))
|
398
546
|
except subprocess.CalledProcessError as e:
|
399
547
|
click.secho(f'> Installation failed with code {e.returncode}', fg='red')
|
400
548
|
ctx.exit()
|
@@ -425,7 +573,9 @@ def _uninstall_shell_completion(ctx: click.Context, param: click.Parameter,
|
|
425
573
|
|
426
574
|
elif value == 'fish':
|
427
575
|
cmd = 'rm -f ~/.config/fish/completions/sky.fish'
|
428
|
-
|
576
|
+
# Fish does not need to be reloaded and will automatically pick up
|
577
|
+
# completions.
|
578
|
+
reload_cmd = None
|
429
579
|
|
430
580
|
elif value == 'zsh':
|
431
581
|
cmd = 'sed -i"" -e "/# For SkyPilot shell completion/d" ~/.zshrc && \
|
@@ -441,8 +591,10 @@ def _uninstall_shell_completion(ctx: click.Context, param: click.Parameter,
|
|
441
591
|
try:
|
442
592
|
subprocess.run(cmd, shell=True, check=True)
|
443
593
|
click.secho(f'Shell completion uninstalled for {value}', fg='green')
|
444
|
-
|
445
|
-
|
594
|
+
if reload_cmd is not None:
|
595
|
+
click.echo(
|
596
|
+
'Changes will take effect once you restart the terminal: ' +
|
597
|
+
click.style(f'{reload_cmd}', bold=True))
|
446
598
|
except subprocess.CalledProcessError as e:
|
447
599
|
click.secho(f'> Uninstallation failed with code {e.returncode}',
|
448
600
|
fg='red')
|
@@ -472,14 +624,14 @@ def _parse_override_params(
|
|
472
624
|
image_id: Optional[str] = None,
|
473
625
|
disk_size: Optional[int] = None,
|
474
626
|
disk_tier: Optional[str] = None,
|
475
|
-
ports: Optional[Tuple[str]] = None) -> Dict[str, Any]:
|
627
|
+
ports: Optional[Tuple[str, ...]] = None) -> Dict[str, Any]:
|
476
628
|
"""Parses the override parameters into a dictionary."""
|
477
629
|
override_params: Dict[str, Any] = {}
|
478
630
|
if cloud is not None:
|
479
631
|
if cloud.lower() == 'none':
|
480
632
|
override_params['cloud'] = None
|
481
633
|
else:
|
482
|
-
override_params['cloud'] =
|
634
|
+
override_params['cloud'] = registry.CLOUD_REGISTRY.from_str(cloud)
|
483
635
|
if region is not None:
|
484
636
|
if region.lower() == 'none':
|
485
637
|
override_params['region'] = None
|
@@ -525,91 +677,17 @@ def _parse_override_params(
|
|
525
677
|
else:
|
526
678
|
override_params['disk_tier'] = disk_tier
|
527
679
|
if ports:
|
528
|
-
|
680
|
+
if any(p.lower() == 'none' for p in ports):
|
681
|
+
if len(ports) > 1:
|
682
|
+
with ux_utils.print_exception_no_traceback():
|
683
|
+
raise ValueError('Cannot specify both "none" and other '
|
684
|
+
'ports.')
|
685
|
+
override_params['ports'] = None
|
686
|
+
else:
|
687
|
+
override_params['ports'] = ports
|
529
688
|
return override_params
|
530
689
|
|
531
690
|
|
532
|
-
def _launch_with_confirm(
|
533
|
-
task: sky.Task,
|
534
|
-
backend: backends.Backend,
|
535
|
-
cluster: Optional[str],
|
536
|
-
*,
|
537
|
-
dryrun: bool,
|
538
|
-
detach_run: bool,
|
539
|
-
detach_setup: bool = False,
|
540
|
-
no_confirm: bool = False,
|
541
|
-
idle_minutes_to_autostop: Optional[int] = None,
|
542
|
-
down: bool = False, # pylint: disable=redefined-outer-name
|
543
|
-
retry_until_up: bool = False,
|
544
|
-
no_setup: bool = False,
|
545
|
-
clone_disk_from: Optional[str] = None,
|
546
|
-
):
|
547
|
-
"""Launch a cluster with a Task."""
|
548
|
-
if cluster is None:
|
549
|
-
cluster = backend_utils.generate_cluster_name()
|
550
|
-
|
551
|
-
clone_source_str = ''
|
552
|
-
if clone_disk_from is not None:
|
553
|
-
clone_source_str = f' from the disk of {clone_disk_from!r}'
|
554
|
-
task, _ = backend_utils.check_can_clone_disk_and_override_task(
|
555
|
-
clone_disk_from, cluster, task)
|
556
|
-
|
557
|
-
with sky.Dag() as dag:
|
558
|
-
dag.add(task)
|
559
|
-
|
560
|
-
maybe_status, handle = backend_utils.refresh_cluster_status_handle(cluster)
|
561
|
-
if maybe_status is None:
|
562
|
-
# Show the optimize log before the prompt if the cluster does not exist.
|
563
|
-
try:
|
564
|
-
sky_check.get_cached_enabled_clouds_or_refresh(
|
565
|
-
raise_if_no_cloud_access=True)
|
566
|
-
except exceptions.NoCloudAccessError as e:
|
567
|
-
# Catch the exception where the public cloud is not enabled, and
|
568
|
-
# make it yellow for better visibility.
|
569
|
-
with ux_utils.print_exception_no_traceback():
|
570
|
-
raise RuntimeError(f'{colorama.Fore.YELLOW}{e}'
|
571
|
-
f'{colorama.Style.RESET_ALL}') from e
|
572
|
-
dag = sky.optimize(dag)
|
573
|
-
task = dag.tasks[0]
|
574
|
-
|
575
|
-
if handle is not None:
|
576
|
-
backend.check_resources_fit_cluster(handle, task)
|
577
|
-
|
578
|
-
confirm_shown = False
|
579
|
-
if not no_confirm:
|
580
|
-
# Prompt if (1) --cluster is None, or (2) cluster doesn't exist, or (3)
|
581
|
-
# it exists but is STOPPED.
|
582
|
-
prompt = None
|
583
|
-
if maybe_status is None:
|
584
|
-
cluster_str = '' if cluster is None else f' {cluster!r}'
|
585
|
-
prompt = (
|
586
|
-
f'Launching a new cluster{cluster_str}{clone_source_str}. '
|
587
|
-
'Proceed?')
|
588
|
-
elif maybe_status == status_lib.ClusterStatus.STOPPED:
|
589
|
-
prompt = f'Restarting the stopped cluster {cluster!r}. Proceed?'
|
590
|
-
if prompt is not None:
|
591
|
-
confirm_shown = True
|
592
|
-
click.confirm(prompt, default=True, abort=True, show_default=True)
|
593
|
-
|
594
|
-
if not confirm_shown:
|
595
|
-
click.secho(f'Running task on cluster {cluster}...', fg='yellow')
|
596
|
-
|
597
|
-
sky.launch(
|
598
|
-
dag,
|
599
|
-
dryrun=dryrun,
|
600
|
-
stream_logs=True,
|
601
|
-
cluster_name=cluster,
|
602
|
-
detach_setup=detach_setup,
|
603
|
-
detach_run=detach_run,
|
604
|
-
backend=backend,
|
605
|
-
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
606
|
-
down=down,
|
607
|
-
retry_until_up=retry_until_up,
|
608
|
-
no_setup=no_setup,
|
609
|
-
clone_disk_from=clone_disk_from,
|
610
|
-
)
|
611
|
-
|
612
|
-
|
613
691
|
def _check_yaml(entrypoint: str) -> Tuple[bool, Optional[Dict[str, Any]]]:
|
614
692
|
"""Checks if entrypoint is a readable YAML file.
|
615
693
|
|
@@ -690,7 +768,6 @@ def _pop_and_ignore_fields_in_override_params(
|
|
690
768
|
def _make_task_or_dag_from_entrypoint_with_overrides(
|
691
769
|
entrypoint: Tuple[str, ...],
|
692
770
|
*,
|
693
|
-
entrypoint_name: str = 'Task',
|
694
771
|
name: Optional[str] = None,
|
695
772
|
workdir: Optional[str] = None,
|
696
773
|
cloud: Optional[str] = None,
|
@@ -705,7 +782,7 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
705
782
|
image_id: Optional[str] = None,
|
706
783
|
disk_size: Optional[int] = None,
|
707
784
|
disk_tier: Optional[str] = None,
|
708
|
-
ports: Optional[Tuple[str]] = None,
|
785
|
+
ports: Optional[Tuple[str, ...]] = None,
|
709
786
|
env: Optional[List[Tuple[str, str]]] = None,
|
710
787
|
field_to_ignore: Optional[List[str]] = None,
|
711
788
|
# job launch specific
|
@@ -722,19 +799,15 @@ def _make_task_or_dag_from_entrypoint_with_overrides(
|
|
722
799
|
entrypoint: Optional[str]
|
723
800
|
if is_yaml:
|
724
801
|
# Treat entrypoint as a yaml.
|
725
|
-
click.secho(
|
726
|
-
|
727
|
-
nl=False)
|
728
|
-
click.secho(entrypoint, bold=True)
|
802
|
+
click.secho('YAML to run: ', fg='cyan', nl=False)
|
803
|
+
click.secho(entrypoint)
|
729
804
|
else:
|
730
805
|
if not entrypoint:
|
731
806
|
entrypoint = None
|
732
807
|
else:
|
733
808
|
# Treat entrypoint as a bash command.
|
734
|
-
click.secho(
|
735
|
-
|
736
|
-
nl=False)
|
737
|
-
click.secho(entrypoint, bold=True)
|
809
|
+
click.secho('Command to run: ', fg='cyan', nl=False)
|
810
|
+
click.secho(entrypoint)
|
738
811
|
|
739
812
|
override_params = _parse_override_params(cloud=cloud,
|
740
813
|
region=region,
|
@@ -798,7 +871,7 @@ class _NaturalOrderGroup(click.Group):
|
|
798
871
|
Reference: https://github.com/pallets/click/issues/513
|
799
872
|
"""
|
800
873
|
|
801
|
-
def list_commands(self, ctx):
|
874
|
+
def list_commands(self, ctx): # pylint: disable=unused-argument
|
802
875
|
return self.commands.keys()
|
803
876
|
|
804
877
|
@usage_lib.entrypoint('sky.cli', fallback=True)
|
@@ -925,6 +998,7 @@ def _deprecate_and_hide_command(group, command_to_deprecate,
|
|
925
998
|
prog_name='skypilot',
|
926
999
|
message='%(prog)s, commit %(version)s',
|
927
1000
|
help='Show the commit hash and exit')
|
1001
|
+
@annotations.client_api
|
928
1002
|
def cli():
|
929
1003
|
pass
|
930
1004
|
|
@@ -945,20 +1019,9 @@ def cli():
|
|
945
1019
|
default=False,
|
946
1020
|
is_flag=True,
|
947
1021
|
help='If True, do not actually run the job.')
|
948
|
-
@click.option(
|
949
|
-
'--detach-setup',
|
950
|
-
'-s',
|
951
|
-
default=False,
|
952
|
-
is_flag=True,
|
953
|
-
help=
|
954
|
-
('If True, run setup in non-interactive mode as part of the job itself. '
|
955
|
-
'You can safely ctrl-c to detach from logging, and it will not interrupt '
|
956
|
-
'the setup process. To see the logs again after detaching, use `sky logs`.'
|
957
|
-
' To cancel setup, cancel the job via `sky cancel`. Useful for long-'
|
958
|
-
'running setup commands.'))
|
959
1022
|
@click.option(
|
960
1023
|
'--detach-run',
|
961
|
-
'-d',
|
1024
|
+
'-d/-no-d',
|
962
1025
|
default=False,
|
963
1026
|
is_flag=True,
|
964
1027
|
help=('If True, as soon as a job is submitted, return from this call '
|
@@ -967,8 +1030,12 @@ def cli():
|
|
967
1030
|
'backend_name',
|
968
1031
|
flag_value=backends.LocalDockerBackend.NAME,
|
969
1032
|
default=False,
|
970
|
-
|
971
|
-
|
1033
|
+
hidden=True,
|
1034
|
+
help=('(Deprecated) Local docker support is deprecated. '
|
1035
|
+
'To run locally, create a local Kubernetes cluster with '
|
1036
|
+
'``sky local up``.'))
|
1037
|
+
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS +
|
1038
|
+
_COMMON_OPTIONS)
|
972
1039
|
@click.option(
|
973
1040
|
'--idle-minutes-to-autostop',
|
974
1041
|
'-i',
|
@@ -1028,38 +1095,45 @@ def cli():
|
|
1028
1095
|
help=('[Experimental] Clone disk from an existing cluster to launch '
|
1029
1096
|
'a new one. This is useful when the new cluster needs to have '
|
1030
1097
|
'the same data on the boot disk as an existing cluster.'))
|
1098
|
+
@click.option(
|
1099
|
+
'--fast',
|
1100
|
+
is_flag=True,
|
1101
|
+
default=False,
|
1102
|
+
required=False,
|
1103
|
+
help=('[Experimental] If the cluster is already up and available, skip '
|
1104
|
+
'provisioning and setup steps.'))
|
1031
1105
|
@usage_lib.entrypoint
|
1032
1106
|
def launch(
|
1033
|
-
|
1034
|
-
|
1035
|
-
|
1036
|
-
|
1037
|
-
|
1038
|
-
|
1039
|
-
|
1040
|
-
|
1041
|
-
|
1042
|
-
|
1043
|
-
|
1044
|
-
|
1045
|
-
|
1046
|
-
|
1047
|
-
|
1048
|
-
|
1049
|
-
|
1050
|
-
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
1054
|
-
|
1055
|
-
|
1056
|
-
|
1057
|
-
|
1058
|
-
|
1059
|
-
|
1060
|
-
|
1061
|
-
|
1062
|
-
):
|
1107
|
+
entrypoint: Tuple[str, ...],
|
1108
|
+
cluster: Optional[str],
|
1109
|
+
dryrun: bool,
|
1110
|
+
detach_run: bool,
|
1111
|
+
backend_name: Optional[str],
|
1112
|
+
name: Optional[str],
|
1113
|
+
workdir: Optional[str],
|
1114
|
+
cloud: Optional[str],
|
1115
|
+
region: Optional[str],
|
1116
|
+
zone: Optional[str],
|
1117
|
+
gpus: Optional[str],
|
1118
|
+
cpus: Optional[str],
|
1119
|
+
memory: Optional[str],
|
1120
|
+
instance_type: Optional[str],
|
1121
|
+
num_nodes: Optional[int],
|
1122
|
+
use_spot: Optional[bool],
|
1123
|
+
image_id: Optional[str],
|
1124
|
+
env_file: Optional[Dict[str, str]],
|
1125
|
+
env: List[Tuple[str, str]],
|
1126
|
+
disk_size: Optional[int],
|
1127
|
+
disk_tier: Optional[str],
|
1128
|
+
ports: Tuple[str, ...],
|
1129
|
+
idle_minutes_to_autostop: Optional[int],
|
1130
|
+
down: bool, # pylint: disable=redefined-outer-name
|
1131
|
+
retry_until_up: bool,
|
1132
|
+
yes: bool,
|
1133
|
+
no_setup: bool,
|
1134
|
+
clone_disk_from: Optional[str],
|
1135
|
+
fast: bool,
|
1136
|
+
async_call: bool):
|
1063
1137
|
"""Launch a cluster or task.
|
1064
1138
|
|
1065
1139
|
If ENTRYPOINT points to a valid YAML file, it is read in as the task
|
@@ -1069,6 +1143,14 @@ def launch(
|
|
1069
1143
|
and they undergo job queue scheduling.
|
1070
1144
|
"""
|
1071
1145
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
1146
|
+
# TODO(zhwu): the current --async is a bit inconsistent with the direct
|
1147
|
+
# sky launch, as `sky api logs` does not contain the logs for the actual job
|
1148
|
+
# submitted, while the synchronous way of `sky launch` does. We should
|
1149
|
+
# consider having the job logs available in `sky api logs` as well.
|
1150
|
+
# Reason for not doing it right now: immediately tailing the logs for the
|
1151
|
+
# job can take up resources on the API server. When there are a lot of
|
1152
|
+
# `launch` submitted asynchronously, the log tailing may overwhelm the API
|
1153
|
+
# server, if the jobs are long running.
|
1072
1154
|
env = _merge_env_vars(env_file, env)
|
1073
1155
|
controller_utils.check_cluster_name_not_controller(
|
1074
1156
|
cluster, operation_str='Launching tasks on it')
|
@@ -1102,6 +1184,11 @@ def launch(
|
|
1102
1184
|
backend: backends.Backend
|
1103
1185
|
if backend_name == backends.LocalDockerBackend.NAME:
|
1104
1186
|
backend = backends.LocalDockerBackend()
|
1187
|
+
click.secho(
|
1188
|
+
'WARNING: LocalDockerBackend is deprecated and will be '
|
1189
|
+
'removed in a future release. To run locally, create a local '
|
1190
|
+
'Kubernetes cluster with `sky local up`.',
|
1191
|
+
fg='yellow')
|
1105
1192
|
elif backend_name == backends.CloudVmRayBackend.NAME:
|
1106
1193
|
backend = backends.CloudVmRayBackend()
|
1107
1194
|
else:
|
@@ -1116,18 +1203,35 @@ def launch(
|
|
1116
1203
|
f'{colorama.Style.RESET_ALL}{colorama.Style.BRIGHT}sky serve up'
|
1117
1204
|
f'{colorama.Style.RESET_ALL}')
|
1118
1205
|
|
1119
|
-
|
1120
|
-
|
1121
|
-
|
1122
|
-
|
1123
|
-
|
1124
|
-
|
1125
|
-
|
1126
|
-
|
1127
|
-
|
1128
|
-
|
1129
|
-
|
1130
|
-
|
1206
|
+
request_id = sdk.launch(
|
1207
|
+
task,
|
1208
|
+
dryrun=dryrun,
|
1209
|
+
cluster_name=cluster,
|
1210
|
+
backend=backend,
|
1211
|
+
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
1212
|
+
down=down,
|
1213
|
+
retry_until_up=retry_until_up,
|
1214
|
+
no_setup=no_setup,
|
1215
|
+
clone_disk_from=clone_disk_from,
|
1216
|
+
fast=fast,
|
1217
|
+
_need_confirmation=not yes,
|
1218
|
+
)
|
1219
|
+
job_id_handle = _async_call_or_wait(request_id, async_call, 'sky.launch')
|
1220
|
+
if not async_call:
|
1221
|
+
job_id, handle = job_id_handle
|
1222
|
+
if not handle:
|
1223
|
+
assert dryrun, 'handle should only be None when dryrun is true'
|
1224
|
+
return
|
1225
|
+
# Add ssh config for the cluster
|
1226
|
+
_get_cluster_records_and_set_ssh_config(
|
1227
|
+
clusters=[handle.get_cluster_name()])
|
1228
|
+
# job_id will be None if no job was submitted (e.g. no entrypoint
|
1229
|
+
# provided)
|
1230
|
+
if not detach_run and job_id is not None:
|
1231
|
+
sdk.tail_logs(handle.get_cluster_name(), job_id, follow=True)
|
1232
|
+
click.secho(
|
1233
|
+
ux_utils.command_hint_messages(ux_utils.CommandHintType.CLUSTER_JOB,
|
1234
|
+
job_id, handle.get_cluster_name()))
|
1131
1235
|
|
1132
1236
|
|
1133
1237
|
@cli.command(cls=_DocumentedCodeCommand)
|
@@ -1155,32 +1259,19 @@ def launch(
|
|
1155
1259
|
is_flag=True,
|
1156
1260
|
help=('If True, as soon as a job is submitted, return from this call '
|
1157
1261
|
'and do not stream execution logs.'))
|
1158
|
-
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS
|
1262
|
+
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS +
|
1263
|
+
_COMMON_OPTIONS)
|
1159
1264
|
@usage_lib.entrypoint
|
1160
1265
|
# pylint: disable=redefined-builtin
|
1161
|
-
def exec(
|
1162
|
-
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1166
|
-
|
1167
|
-
|
1168
|
-
|
1169
|
-
|
1170
|
-
workdir: Optional[str],
|
1171
|
-
gpus: Optional[str],
|
1172
|
-
ports: Tuple[str],
|
1173
|
-
instance_type: Optional[str],
|
1174
|
-
num_nodes: Optional[int],
|
1175
|
-
use_spot: Optional[bool],
|
1176
|
-
image_id: Optional[str],
|
1177
|
-
env_file: Optional[Dict[str, str]],
|
1178
|
-
env: List[Tuple[str, str]],
|
1179
|
-
cpus: Optional[str],
|
1180
|
-
memory: Optional[str],
|
1181
|
-
disk_size: Optional[int],
|
1182
|
-
disk_tier: Optional[str],
|
1183
|
-
):
|
1266
|
+
def exec(cluster: Optional[str], cluster_option: Optional[str],
|
1267
|
+
entrypoint: Tuple[str, ...], detach_run: bool, name: Optional[str],
|
1268
|
+
cloud: Optional[str], region: Optional[str], zone: Optional[str],
|
1269
|
+
workdir: Optional[str], gpus: Optional[str], ports: Tuple[str],
|
1270
|
+
instance_type: Optional[str], num_nodes: Optional[int],
|
1271
|
+
use_spot: Optional[bool], image_id: Optional[str],
|
1272
|
+
env_file: Optional[Dict[str, str]], env: List[Tuple[str, str]],
|
1273
|
+
cpus: Optional[str], memory: Optional[str], disk_size: Optional[int],
|
1274
|
+
disk_tier: Optional[str], async_call: bool):
|
1184
1275
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
1185
1276
|
"""Execute a task or command on an existing cluster.
|
1186
1277
|
|
@@ -1253,11 +1344,6 @@ def exec(
|
|
1253
1344
|
env = _merge_env_vars(env_file, env)
|
1254
1345
|
controller_utils.check_cluster_name_not_controller(
|
1255
1346
|
cluster, operation_str='Executing task on it')
|
1256
|
-
handle = global_user_state.get_handle_from_cluster_name(cluster)
|
1257
|
-
if handle is None:
|
1258
|
-
raise click.BadParameter(f'Cluster {cluster!r} not found. '
|
1259
|
-
'Use `sky launch` to provision first.')
|
1260
|
-
backend = backend_utils.get_backend_from_handle(handle)
|
1261
1347
|
|
1262
1348
|
task_or_dag = _make_task_or_dag_from_entrypoint_with_overrides(
|
1263
1349
|
entrypoint=entrypoint,
|
@@ -1285,23 +1371,26 @@ def exec(
|
|
1285
1371
|
'supports a single task only.')
|
1286
1372
|
task = task_or_dag
|
1287
1373
|
|
1288
|
-
click.secho(
|
1289
|
-
|
1374
|
+
click.secho('Submitting job to cluster: ', fg='cyan', nl=False)
|
1375
|
+
click.secho(cluster)
|
1376
|
+
request_id = sdk.exec(task, cluster_name=cluster)
|
1377
|
+
job_id_handle = _async_call_or_wait(request_id, async_call, 'sky.exec')
|
1378
|
+
if not async_call and not detach_run:
|
1379
|
+
job_id, _ = job_id_handle
|
1380
|
+
sdk.tail_logs(cluster, job_id, follow=True)
|
1290
1381
|
|
1291
1382
|
|
1292
|
-
def
|
1293
|
-
|
1294
|
-
skip_finished: bool,
|
1383
|
+
def _handle_jobs_queue_request(
|
1384
|
+
request_id: str,
|
1295
1385
|
show_all: bool,
|
1386
|
+
show_user: bool,
|
1296
1387
|
limit_num_jobs_to_show: bool = False,
|
1297
1388
|
is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
|
1298
1389
|
"""Get the in-progress managed jobs.
|
1299
1390
|
|
1300
1391
|
Args:
|
1301
|
-
refresh: Query the latest statuses, restarting the jobs controller if
|
1302
|
-
stopped.
|
1303
|
-
skip_finished: Show only in-progress jobs.
|
1304
1392
|
show_all: Show all information of each job (e.g., region, price).
|
1393
|
+
show_user: Show the user who submitted the job.
|
1305
1394
|
limit_num_jobs_to_show: If True, limit the number of jobs to show to
|
1306
1395
|
_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS, which is mainly used by
|
1307
1396
|
`sky status`.
|
@@ -1314,14 +1403,13 @@ def _get_managed_jobs(
|
|
1314
1403
|
msg contains the error message. Otherwise, msg contains the formatted
|
1315
1404
|
managed job table.
|
1316
1405
|
"""
|
1406
|
+
# TODO(SKY-980): remove unnecessary fallbacks on the client side.
|
1317
1407
|
num_in_progress_jobs = None
|
1408
|
+
msg = ''
|
1318
1409
|
try:
|
1319
1410
|
if not is_called_by_user:
|
1320
1411
|
usage_lib.messages.usage.set_internal()
|
1321
|
-
|
1322
|
-
# Make the call silent
|
1323
|
-
managed_jobs_ = managed_jobs.queue(refresh=refresh,
|
1324
|
-
skip_finished=skip_finished)
|
1412
|
+
managed_jobs_ = sdk.get(request_id)
|
1325
1413
|
num_in_progress_jobs = len(set(job['job_id'] for job in managed_jobs_))
|
1326
1414
|
except exceptions.ClusterNotUpError as e:
|
1327
1415
|
controller_status = e.cluster_status
|
@@ -1334,17 +1422,19 @@ def _get_managed_jobs(
|
|
1334
1422
|
msg += (f' (See finished managed jobs: {colorama.Style.BRIGHT}'
|
1335
1423
|
f'sky jobs queue --refresh{colorama.Style.RESET_ALL})')
|
1336
1424
|
except RuntimeError as e:
|
1337
|
-
msg = ''
|
1338
1425
|
try:
|
1339
1426
|
# Check the controller status again, as the RuntimeError is likely
|
1340
1427
|
# due to the controller being autostopped when querying the jobs.
|
1341
|
-
|
1342
|
-
|
1343
|
-
|
1344
|
-
|
1345
|
-
|
1346
|
-
|
1347
|
-
|
1428
|
+
# Since we are client-side, we may not know the exact name of the
|
1429
|
+
# controller, so use the prefix with a wildcard.
|
1430
|
+
# Query status of the controller cluster.
|
1431
|
+
records = sdk.get(
|
1432
|
+
sdk.status(cluster_names=[common.JOB_CONTROLLER_PREFIX + '*'],
|
1433
|
+
all_users=True))
|
1434
|
+
if (not records or
|
1435
|
+
records[0]['status'] == status_lib.ClusterStatus.STOPPED):
|
1436
|
+
controller = controller_utils.Controllers.JOBS_CONTROLLER.value
|
1437
|
+
msg = controller.default_hint_if_non_existent
|
1348
1438
|
except Exception: # pylint: disable=broad-except
|
1349
1439
|
# This is to an best effort to find the latest controller status to
|
1350
1440
|
# print more helpful message, so we can ignore any exception to
|
@@ -1357,21 +1447,28 @@ def _get_managed_jobs(
|
|
1357
1447
|
f'Details: {common_utils.format_exception(e, use_bracket=True)}'
|
1358
1448
|
)
|
1359
1449
|
except Exception as e: # pylint: disable=broad-except
|
1360
|
-
msg =
|
1361
|
-
|
1450
|
+
msg = ''
|
1451
|
+
if env_options.Options.SHOW_DEBUG_INFO.get():
|
1452
|
+
msg += traceback.format_exc()
|
1453
|
+
msg += '\n'
|
1454
|
+
msg += ('Failed to query managed jobs: '
|
1455
|
+
f'{common_utils.format_exception(e, use_bracket=True)}')
|
1362
1456
|
else:
|
1363
1457
|
max_jobs_to_show = (_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS
|
1364
1458
|
if limit_num_jobs_to_show else None)
|
1365
1459
|
msg = managed_jobs.format_job_table(managed_jobs_,
|
1366
1460
|
show_all=show_all,
|
1461
|
+
show_user=show_user,
|
1367
1462
|
max_jobs=max_jobs_to_show)
|
1368
1463
|
return num_in_progress_jobs, msg
|
1369
1464
|
|
1370
1465
|
|
1371
|
-
def
|
1372
|
-
|
1373
|
-
|
1374
|
-
|
1466
|
+
def _handle_services_request(
|
1467
|
+
request_id: str,
|
1468
|
+
service_names: Optional[List[str]],
|
1469
|
+
show_all: bool,
|
1470
|
+
show_endpoint: bool,
|
1471
|
+
is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
|
1375
1472
|
"""Get service statuses.
|
1376
1473
|
|
1377
1474
|
Args:
|
@@ -1390,12 +1487,8 @@ def _get_services(service_names: Optional[List[str]],
|
|
1390
1487
|
try:
|
1391
1488
|
if not is_called_by_user:
|
1392
1489
|
usage_lib.messages.usage.set_internal()
|
1393
|
-
|
1394
|
-
|
1395
|
-
# Change empty list to None
|
1396
|
-
service_names = None
|
1397
|
-
service_records = serve_lib.status(service_names)
|
1398
|
-
num_services = len(service_records)
|
1490
|
+
service_records = sdk.get(request_id)
|
1491
|
+
num_services = len(service_records)
|
1399
1492
|
except exceptions.ClusterNotUpError as e:
|
1400
1493
|
controller_status = e.cluster_status
|
1401
1494
|
msg = str(e)
|
@@ -1408,13 +1501,18 @@ def _get_services(service_names: Optional[List[str]],
|
|
1408
1501
|
# Check the controller status again, as the RuntimeError is likely
|
1409
1502
|
# due to the controller being autostopped when querying the
|
1410
1503
|
# services.
|
1411
|
-
|
1412
|
-
|
1413
|
-
|
1414
|
-
|
1415
|
-
|
1416
|
-
|
1417
|
-
|
1504
|
+
# Since we are client-side, we may not know the exact name of the
|
1505
|
+
# controller, so use the prefix with a wildcard.
|
1506
|
+
# Query status of the controller cluster.
|
1507
|
+
records = sdk.get(
|
1508
|
+
sdk.status(
|
1509
|
+
cluster_names=[common.SKY_SERVE_CONTROLLER_PREFIX + '*'],
|
1510
|
+
all_users=True))
|
1511
|
+
if (not records or
|
1512
|
+
records[0]['status'] == status_lib.ClusterStatus.STOPPED):
|
1513
|
+
controller = (
|
1514
|
+
controller_utils.Controllers.SKY_SERVE_CONTROLLER.value)
|
1515
|
+
msg = controller.default_hint_if_non_existent
|
1418
1516
|
except Exception: # pylint: disable=broad-except
|
1419
1517
|
# This is to an best effort to find the latest controller status to
|
1420
1518
|
# print more helpful message, so we can ignore any exception to
|
@@ -1432,12 +1530,13 @@ def _get_services(service_names: Optional[List[str]],
|
|
1432
1530
|
if len(service_records) != 1:
|
1433
1531
|
plural = 's' if len(service_records) > 1 else ''
|
1434
1532
|
service_num = (str(len(service_records))
|
1435
|
-
if
|
1533
|
+
if service_records else 'No')
|
1436
1534
|
raise click.UsageError(
|
1437
1535
|
f'{service_num} service{plural} found. Please specify '
|
1438
1536
|
'an existing service to show its endpoint. Usage: '
|
1439
1537
|
'sky serve status --endpoint <service-name>')
|
1440
|
-
|
1538
|
+
endpoint = service_records[0]['endpoint']
|
1539
|
+
msg = '-' if endpoint is None else endpoint
|
1441
1540
|
else:
|
1442
1541
|
msg = serve_lib.format_service_table(service_records, show_all)
|
1443
1542
|
service_not_found_msg = ''
|
@@ -1452,9 +1551,105 @@ def _get_services(service_names: Optional[List[str]],
|
|
1452
1551
|
return num_services, msg
|
1453
1552
|
|
1454
1553
|
|
1554
|
+
def _status_kubernetes(show_all: bool):
|
1555
|
+
"""Show all SkyPilot resources in the current Kubernetes context.
|
1556
|
+
|
1557
|
+
Args:
|
1558
|
+
show_all (bool): Show all job information (e.g., start time, failures).
|
1559
|
+
"""
|
1560
|
+
all_clusters, unmanaged_clusters, all_jobs, context = (sdk.stream_and_get(
|
1561
|
+
sdk.status_kubernetes()))
|
1562
|
+
click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
1563
|
+
f'Kubernetes cluster state (context: {context})'
|
1564
|
+
f'{colorama.Style.RESET_ALL}')
|
1565
|
+
status_utils.show_kubernetes_cluster_status_table(unmanaged_clusters,
|
1566
|
+
show_all)
|
1567
|
+
if all_jobs:
|
1568
|
+
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
1569
|
+
f'Managed jobs'
|
1570
|
+
f'{colorama.Style.RESET_ALL}')
|
1571
|
+
msg = managed_jobs.format_job_table(all_jobs,
|
1572
|
+
show_all=show_all,
|
1573
|
+
show_user=False)
|
1574
|
+
click.echo(msg)
|
1575
|
+
if any(['sky-serve-controller' in c.cluster_name for c in all_clusters]):
|
1576
|
+
# TODO: Parse serve controllers and show services separately.
|
1577
|
+
# Currently we show a hint that services are shown as clusters.
|
1578
|
+
click.echo(f'\n{colorama.Style.DIM}Hint: SkyServe replica pods are '
|
1579
|
+
'shown in the "SkyPilot clusters" section.'
|
1580
|
+
f'{colorama.Style.RESET_ALL}')
|
1581
|
+
|
1582
|
+
|
1583
|
+
def _show_endpoint(query_clusters: Optional[List[str]],
|
1584
|
+
cluster_records: List[Dict[str, Any]], ip: bool,
|
1585
|
+
endpoints: bool, endpoint: Optional[int]) -> None:
|
1586
|
+
show_endpoints = endpoints or endpoint is not None
|
1587
|
+
show_single_endpoint = endpoint is not None
|
1588
|
+
if len(cluster_records) != 1:
|
1589
|
+
with ux_utils.print_exception_no_traceback():
|
1590
|
+
plural = 's' if len(cluster_records) > 1 else ''
|
1591
|
+
if cluster_records:
|
1592
|
+
cluster_num = str(len(cluster_records))
|
1593
|
+
else:
|
1594
|
+
cluster_num = (f'{query_clusters[0]!r}'
|
1595
|
+
if query_clusters else 'No')
|
1596
|
+
verb = 'found' if cluster_records else 'not found'
|
1597
|
+
cause = 'a single'
|
1598
|
+
if query_clusters and len(query_clusters) > 1:
|
1599
|
+
cause = 'an existing'
|
1600
|
+
raise ValueError(
|
1601
|
+
_STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format(
|
1602
|
+
cluster_num=cluster_num,
|
1603
|
+
plural=plural,
|
1604
|
+
verb=verb,
|
1605
|
+
cause=cause,
|
1606
|
+
property='IP address' if ip else 'endpoint(s)',
|
1607
|
+
flag='ip' if ip else
|
1608
|
+
('endpoint port' if show_single_endpoint else 'endpoints')))
|
1609
|
+
|
1610
|
+
cluster_record = cluster_records[0]
|
1611
|
+
if cluster_record['status'] != status_lib.ClusterStatus.UP:
|
1612
|
+
with ux_utils.print_exception_no_traceback():
|
1613
|
+
raise RuntimeError(f'Cluster {cluster_record["name"]!r} '
|
1614
|
+
'is not in UP status.')
|
1615
|
+
handle = cluster_record['handle']
|
1616
|
+
if not isinstance(handle, backends.CloudVmRayResourceHandle):
|
1617
|
+
with ux_utils.print_exception_no_traceback():
|
1618
|
+
raise ValueError('Querying IP address is not supported '
|
1619
|
+
'for local clusters.')
|
1620
|
+
|
1621
|
+
head_ip = handle.external_ips()[0]
|
1622
|
+
# The endpoint request is relatively fast, so we don't add special handling
|
1623
|
+
# for keyboard interrupt and abort the request to avoid additional latency.
|
1624
|
+
if show_endpoints:
|
1625
|
+
if endpoint:
|
1626
|
+
request_id = sdk.endpoints(cluster_record['name'], endpoint)
|
1627
|
+
cluster_endpoints = sdk.stream_and_get(request_id)
|
1628
|
+
cluster_endpoint = cluster_endpoints.get(str(endpoint), None)
|
1629
|
+
if not cluster_endpoint:
|
1630
|
+
raise click.Abort(f'Endpoint {endpoint} not found for cluster '
|
1631
|
+
f'{cluster_record["name"]!r}.')
|
1632
|
+
click.echo(cluster_endpoint)
|
1633
|
+
else:
|
1634
|
+
request_id = sdk.endpoints(cluster_record['name'])
|
1635
|
+
cluster_endpoints = sdk.stream_and_get(request_id)
|
1636
|
+
assert isinstance(cluster_endpoints, dict)
|
1637
|
+
if not cluster_endpoints:
|
1638
|
+
raise click.Abort(f'No endpoint found for cluster '
|
1639
|
+
f'{cluster_record["name"]!r}.')
|
1640
|
+
for port, port_endpoint in cluster_endpoints.items():
|
1641
|
+
click.echo(f'{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}'
|
1642
|
+
f'{colorama.Style.RESET_ALL}: '
|
1643
|
+
f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
1644
|
+
f'{port_endpoint}{colorama.Style.RESET_ALL}')
|
1645
|
+
return
|
1646
|
+
click.echo(head_ip)
|
1647
|
+
return
|
1648
|
+
|
1649
|
+
|
1455
1650
|
@cli.command()
|
1456
|
-
@click.option('--
|
1457
|
-
'-
|
1651
|
+
@click.option('--verbose',
|
1652
|
+
'-v',
|
1458
1653
|
default=False,
|
1459
1654
|
is_flag=True,
|
1460
1655
|
required=False,
|
@@ -1497,16 +1692,32 @@ def _get_services(service_names: Optional[List[str]],
|
|
1497
1692
|
is_flag=True,
|
1498
1693
|
required=False,
|
1499
1694
|
help='Also show sky serve services, if any.')
|
1695
|
+
@click.option(
|
1696
|
+
'--kubernetes',
|
1697
|
+
'--k8s',
|
1698
|
+
default=False,
|
1699
|
+
is_flag=True,
|
1700
|
+
required=False,
|
1701
|
+
help='[Experimental] Show all SkyPilot resources (including from other '
|
1702
|
+
'users) in the current Kubernetes context.')
|
1500
1703
|
@click.argument('clusters',
|
1501
1704
|
required=False,
|
1502
1705
|
type=str,
|
1503
1706
|
nargs=-1,
|
1504
1707
|
**_get_shell_complete_args(_complete_cluster_name))
|
1708
|
+
@click.option('--all-users',
|
1709
|
+
'-u',
|
1710
|
+
default=False,
|
1711
|
+
is_flag=True,
|
1712
|
+
required=False,
|
1713
|
+
help='Show all clusters, including those not owned by the '
|
1714
|
+
'current user.')
|
1505
1715
|
@usage_lib.entrypoint
|
1506
1716
|
# pylint: disable=redefined-builtin
|
1507
|
-
def status(
|
1717
|
+
def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
1508
1718
|
endpoint: Optional[int], show_managed_jobs: bool,
|
1509
|
-
show_services: bool, clusters: List[str]
|
1719
|
+
show_services: bool, kubernetes: bool, clusters: List[str],
|
1720
|
+
all_users: bool):
|
1510
1721
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
1511
1722
|
"""Show clusters.
|
1512
1723
|
|
@@ -1521,11 +1732,15 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1521
1732
|
``sky status --endpoints mycluster``. To query a single endpoint, you
|
1522
1733
|
can use ``sky status mycluster --endpoint 8888``.
|
1523
1734
|
|
1735
|
+
Running `sky status` will update the ssh config for the clusters locally, so
|
1736
|
+
that you can directly ssh into the clusters or connect to the clusters with
|
1737
|
+
vscode.
|
1738
|
+
|
1524
1739
|
The following fields for each cluster are recorded: cluster name, time
|
1525
1740
|
since last launch, resources, region, zone, hourly price, status, autostop,
|
1526
1741
|
command.
|
1527
1742
|
|
1528
|
-
Display all fields using ``sky status -
|
1743
|
+
Display all fields using ``sky status -v``.
|
1529
1744
|
|
1530
1745
|
Each cluster can have one of the following statuses:
|
1531
1746
|
|
@@ -1565,243 +1780,163 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1565
1780
|
or for autostop-enabled clusters, use ``--refresh`` to query the latest
|
1566
1781
|
cluster statuses from the cloud providers.
|
1567
1782
|
"""
|
1568
|
-
|
1569
|
-
|
1570
|
-
|
1571
|
-
|
1572
|
-
|
1573
|
-
|
1574
|
-
|
1575
|
-
|
1576
|
-
|
1577
|
-
|
1578
|
-
|
1579
|
-
|
1580
|
-
|
1581
|
-
|
1582
|
-
|
1583
|
-
|
1584
|
-
|
1585
|
-
|
1586
|
-
|
1587
|
-
|
1588
|
-
|
1589
|
-
|
1590
|
-
|
1591
|
-
|
1592
|
-
services_future = pool.apply_async(_get_services,
|
1593
|
-
kwds=dict(
|
1594
|
-
service_names=None,
|
1595
|
-
show_all=False,
|
1596
|
-
show_endpoint=False,
|
1597
|
-
is_called_by_user=False))
|
1598
|
-
if ip or show_endpoints:
|
1599
|
-
if refresh:
|
1600
|
-
raise click.UsageError(
|
1601
|
-
'Using --ip or --endpoint(s) with --refresh is not'
|
1602
|
-
'supported for now. To fix, refresh first, '
|
1603
|
-
'then query the IP or endpoint.')
|
1783
|
+
if kubernetes:
|
1784
|
+
_status_kubernetes(verbose)
|
1785
|
+
return
|
1786
|
+
# Do not show job queue if user specifies clusters, and if user
|
1787
|
+
# specifies --ip or --endpoint(s).
|
1788
|
+
show_managed_jobs = show_managed_jobs and not any([clusters, ip, endpoints])
|
1789
|
+
if show_managed_jobs:
|
1790
|
+
managed_jobs_queue_request_id = managed_jobs.queue(refresh=False,
|
1791
|
+
skip_finished=True,
|
1792
|
+
all_users=all_users)
|
1793
|
+
show_endpoints = endpoints or endpoint is not None
|
1794
|
+
show_single_endpoint = endpoint is not None
|
1795
|
+
show_services = show_services and not any([clusters, ip, endpoints])
|
1796
|
+
if show_services:
|
1797
|
+
# Run the sky serve service query in parallel to speed up the
|
1798
|
+
# status query.
|
1799
|
+
service_status_request_id = serve_lib.status(service_names=None)
|
1800
|
+
|
1801
|
+
if ip or show_endpoints:
|
1802
|
+
if refresh:
|
1803
|
+
raise click.UsageError(
|
1804
|
+
'Using --ip or --endpoint(s) with --refresh is not'
|
1805
|
+
'supported for now. To fix, refresh first, '
|
1806
|
+
'then query the IP or endpoint.')
|
1604
1807
|
|
1605
|
-
|
1606
|
-
|
1607
|
-
|
1608
|
-
|
1609
|
-
'at the same time.')
|
1808
|
+
if ip and show_endpoints:
|
1809
|
+
with ux_utils.print_exception_no_traceback():
|
1810
|
+
raise ValueError('Cannot specify both --ip and --endpoint(s) '
|
1811
|
+
'at the same time.')
|
1610
1812
|
|
1611
|
-
|
1612
|
-
|
1613
|
-
|
1614
|
-
|
1615
|
-
|
1813
|
+
if endpoint is not None and endpoints:
|
1814
|
+
with ux_utils.print_exception_no_traceback():
|
1815
|
+
raise ValueError(
|
1816
|
+
'Cannot specify both --endpoint and --endpoints '
|
1817
|
+
'at the same time.')
|
1616
1818
|
|
1617
|
-
|
1618
|
-
|
1619
|
-
|
1620
|
-
|
1621
|
-
|
1622
|
-
|
1623
|
-
|
1624
|
-
|
1625
|
-
|
1626
|
-
|
1627
|
-
|
1628
|
-
|
1629
|
-
|
1630
|
-
|
1631
|
-
|
1632
|
-
|
1819
|
+
if len(clusters) != 1:
|
1820
|
+
with ux_utils.print_exception_no_traceback():
|
1821
|
+
plural = 's' if len(clusters) > 1 else ''
|
1822
|
+
cluster_num = (str(len(clusters)) if clusters else 'No')
|
1823
|
+
cause = 'a single' if len(clusters) > 1 else 'an existing'
|
1824
|
+
raise ValueError(
|
1825
|
+
_STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format(
|
1826
|
+
cluster_num=cluster_num,
|
1827
|
+
plural=plural,
|
1828
|
+
verb='specified',
|
1829
|
+
cause=cause,
|
1830
|
+
property='IP address' if ip else 'endpoint(s)',
|
1831
|
+
flag='ip' if ip else
|
1832
|
+
('endpoint port'
|
1833
|
+
if show_single_endpoint else 'endpoints')))
|
1834
|
+
else:
|
1835
|
+
click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters'
|
1836
|
+
f'{colorama.Style.RESET_ALL}')
|
1837
|
+
query_clusters: Optional[List[str]] = None if not clusters else clusters
|
1838
|
+
refresh_mode = common.StatusRefreshMode.NONE
|
1839
|
+
if refresh:
|
1840
|
+
refresh_mode = common.StatusRefreshMode.FORCE
|
1841
|
+
cluster_records = _get_cluster_records_and_set_ssh_config(
|
1842
|
+
query_clusters, refresh_mode, all_users)
|
1843
|
+
|
1844
|
+
# TODO(zhwu): set up the ssh config for status
|
1845
|
+
if ip or show_endpoints:
|
1846
|
+
_show_endpoint(query_clusters, cluster_records, ip, endpoints, endpoint)
|
1847
|
+
return
|
1848
|
+
hints = []
|
1849
|
+
normal_clusters = []
|
1850
|
+
controllers = []
|
1851
|
+
for cluster_record in cluster_records:
|
1852
|
+
cluster_name = cluster_record['name']
|
1853
|
+
controller = controller_utils.Controllers.from_name(cluster_name)
|
1854
|
+
if controller is not None:
|
1855
|
+
controllers.append(cluster_record)
|
1633
1856
|
else:
|
1634
|
-
|
1635
|
-
f'{colorama.Style.RESET_ALL}')
|
1636
|
-
query_clusters: Optional[List[str]] = None
|
1637
|
-
if clusters:
|
1638
|
-
query_clusters = _get_glob_clusters(clusters, silent=ip)
|
1639
|
-
cluster_records = core.status(cluster_names=query_clusters,
|
1640
|
-
refresh=refresh)
|
1641
|
-
if ip or show_endpoints:
|
1642
|
-
if len(cluster_records) != 1:
|
1643
|
-
with ux_utils.print_exception_no_traceback():
|
1644
|
-
plural = 's' if len(cluster_records) > 1 else ''
|
1645
|
-
cluster_num = (str(len(cluster_records))
|
1646
|
-
if len(cluster_records) > 0 else
|
1647
|
-
f'{clusters[0]!r}')
|
1648
|
-
verb = 'found' if len(cluster_records) > 0 else 'not found'
|
1649
|
-
cause = 'a single' if len(clusters) > 1 else 'an existing'
|
1650
|
-
raise ValueError(
|
1651
|
-
_STATUS_PROPERTY_CLUSTER_NUM_ERROR_MESSAGE.format(
|
1652
|
-
cluster_num=cluster_num,
|
1653
|
-
plural=plural,
|
1654
|
-
verb=verb,
|
1655
|
-
cause=cause,
|
1656
|
-
property='IP address' if ip else 'endpoint(s)',
|
1657
|
-
flag='ip' if ip else
|
1658
|
-
('endpoint port'
|
1659
|
-
if show_single_endpoint else 'endpoints')))
|
1660
|
-
|
1661
|
-
cluster_record = cluster_records[0]
|
1662
|
-
if cluster_record['status'] != status_lib.ClusterStatus.UP:
|
1663
|
-
with ux_utils.print_exception_no_traceback():
|
1664
|
-
raise RuntimeError(f'Cluster {cluster_record["name"]!r} '
|
1665
|
-
'is not in UP status.')
|
1666
|
-
handle = cluster_record['handle']
|
1667
|
-
if not isinstance(handle, backends.CloudVmRayResourceHandle):
|
1668
|
-
with ux_utils.print_exception_no_traceback():
|
1669
|
-
raise ValueError('Querying IP address is not supported '
|
1670
|
-
'for local clusters.')
|
1671
|
-
|
1672
|
-
head_ip = handle.external_ips()[0]
|
1673
|
-
if show_endpoints:
|
1674
|
-
if endpoint:
|
1675
|
-
cluster_endpoint = core.endpoints(cluster_record['name'],
|
1676
|
-
endpoint).get(
|
1677
|
-
endpoint, None)
|
1678
|
-
if not cluster_endpoint:
|
1679
|
-
raise click.Abort(
|
1680
|
-
f'Endpoint {endpoint} not found for cluster '
|
1681
|
-
f'{cluster_record["name"]!r}.')
|
1682
|
-
click.echo(cluster_endpoint)
|
1683
|
-
else:
|
1684
|
-
cluster_endpoints = core.endpoints(cluster_record['name'])
|
1685
|
-
assert isinstance(cluster_endpoints, dict)
|
1686
|
-
if not cluster_endpoints:
|
1687
|
-
raise click.Abort(f'No endpoint found for cluster '
|
1688
|
-
f'{cluster_record["name"]!r}.')
|
1689
|
-
for port, port_endpoint in cluster_endpoints.items():
|
1690
|
-
click.echo(
|
1691
|
-
f'{colorama.Fore.BLUE}{colorama.Style.BRIGHT}{port}'
|
1692
|
-
f'{colorama.Style.RESET_ALL}: '
|
1693
|
-
f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
1694
|
-
f'{port_endpoint}{colorama.Style.RESET_ALL}')
|
1695
|
-
return
|
1696
|
-
click.echo(head_ip)
|
1697
|
-
return
|
1698
|
-
hints = []
|
1699
|
-
normal_clusters = []
|
1700
|
-
controllers = []
|
1701
|
-
for cluster_record in cluster_records:
|
1702
|
-
cluster_name = cluster_record['name']
|
1703
|
-
controller = controller_utils.Controllers.from_name(cluster_name)
|
1704
|
-
if controller is not None:
|
1705
|
-
controllers.append(cluster_record)
|
1706
|
-
else:
|
1707
|
-
normal_clusters.append(cluster_record)
|
1857
|
+
normal_clusters.append(cluster_record)
|
1708
1858
|
|
1709
|
-
|
1710
|
-
|
1711
|
-
|
1859
|
+
num_pending_autostop = 0
|
1860
|
+
num_pending_autostop += status_utils.show_status_table(
|
1861
|
+
normal_clusters + controllers, verbose, all_users, query_clusters)
|
1712
1862
|
|
1713
|
-
|
1714
|
-
|
1715
|
-
|
1863
|
+
managed_jobs_query_interrupted = False
|
1864
|
+
if show_managed_jobs:
|
1865
|
+
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
1866
|
+
f'Managed jobs{colorama.Style.RESET_ALL}')
|
1867
|
+
with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
|
1716
1868
|
try:
|
1717
|
-
|
1869
|
+
num_in_progress_jobs, msg = _handle_jobs_queue_request(
|
1870
|
+
managed_jobs_queue_request_id,
|
1871
|
+
show_all=False,
|
1872
|
+
show_user=False,
|
1873
|
+
limit_num_jobs_to_show=not all,
|
1874
|
+
is_called_by_user=False)
|
1718
1875
|
except KeyboardInterrupt:
|
1719
|
-
|
1720
|
-
|
1721
|
-
|
1722
|
-
|
1723
|
-
|
1724
|
-
|
1725
|
-
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
1726
|
-
f'Managed jobs{colorama.Style.RESET_ALL}')
|
1727
|
-
with rich_utils.safe_status('[cyan]Checking managed jobs[/]'):
|
1728
|
-
managed_jobs_query_interrupted, result = _try_get_future_result(
|
1729
|
-
managed_jobs_future)
|
1730
|
-
if managed_jobs_query_interrupted:
|
1731
|
-
# Set to -1, so that the controller is not considered
|
1732
|
-
# down, and the hint for showing sky jobs queue
|
1733
|
-
# will still be shown.
|
1734
|
-
num_in_progress_jobs = -1
|
1735
|
-
msg = 'KeyboardInterrupt'
|
1736
|
-
else:
|
1737
|
-
num_in_progress_jobs, msg = result
|
1738
|
-
|
1739
|
-
click.echo(msg)
|
1740
|
-
if num_in_progress_jobs is not None:
|
1741
|
-
# jobs controller is UP.
|
1742
|
-
job_info = ''
|
1743
|
-
if num_in_progress_jobs > 0:
|
1744
|
-
plural_and_verb = ' is'
|
1745
|
-
if num_in_progress_jobs > 1:
|
1746
|
-
plural_and_verb = 's are'
|
1747
|
-
job_info = (
|
1748
|
-
f'{num_in_progress_jobs} managed job{plural_and_verb} '
|
1749
|
-
'in progress')
|
1750
|
-
if (num_in_progress_jobs >
|
1751
|
-
_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS):
|
1752
|
-
job_info += (
|
1753
|
-
f' ({_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS} latest '
|
1754
|
-
'ones shown)')
|
1755
|
-
job_info += '. '
|
1756
|
-
hints.append(
|
1757
|
-
controller_utils.Controllers.JOBS_CONTROLLER.value.
|
1758
|
-
in_progress_hint.format(job_info=job_info))
|
1759
|
-
|
1760
|
-
if show_services:
|
1761
|
-
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
1762
|
-
f'Services{colorama.Style.RESET_ALL}')
|
1763
|
-
num_services = None
|
1764
|
-
if managed_jobs_query_interrupted:
|
1765
|
-
# The pool is terminated, so we cannot run the service query.
|
1876
|
+
sdk.api_cancel(managed_jobs_queue_request_id, silent=True)
|
1877
|
+
managed_jobs_query_interrupted = True
|
1878
|
+
# Set to -1, so that the controller is not considered
|
1879
|
+
# down, and the hint for showing sky jobs queue
|
1880
|
+
# will still be shown.
|
1881
|
+
num_in_progress_jobs = -1
|
1766
1882
|
msg = 'KeyboardInterrupt'
|
1767
|
-
else:
|
1768
|
-
with rich_utils.safe_status('[cyan]Checking services[/]'):
|
1769
|
-
interrupted, result = _try_get_future_result(
|
1770
|
-
services_future)
|
1771
|
-
if interrupted:
|
1772
|
-
num_services = -1
|
1773
|
-
msg = 'KeyboardInterrupt'
|
1774
|
-
else:
|
1775
|
-
num_services, msg = result
|
1776
|
-
click.echo(msg)
|
1777
|
-
if num_services is not None:
|
1778
|
-
hints.append(controller_utils.Controllers.SKY_SERVE_CONTROLLER.
|
1779
|
-
value.in_progress_hint)
|
1780
1883
|
|
1781
|
-
|
1782
|
-
|
1783
|
-
|
1784
|
-
|
1785
|
-
|
1786
|
-
|
1787
|
-
|
1788
|
-
|
1789
|
-
|
1790
|
-
|
1791
|
-
|
1792
|
-
|
1793
|
-
|
1794
|
-
|
1795
|
-
|
1796
|
-
|
1797
|
-
|
1798
|
-
|
1799
|
-
|
1800
|
-
|
1801
|
-
|
1802
|
-
|
1803
|
-
|
1804
|
-
|
1884
|
+
click.echo(msg)
|
1885
|
+
if num_in_progress_jobs is not None:
|
1886
|
+
# jobs controller is UP.
|
1887
|
+
job_info = ''
|
1888
|
+
if num_in_progress_jobs > 0:
|
1889
|
+
plural_and_verb = ' is'
|
1890
|
+
if num_in_progress_jobs > 1:
|
1891
|
+
plural_and_verb = 's are'
|
1892
|
+
job_info = (
|
1893
|
+
f'{num_in_progress_jobs} managed job{plural_and_verb} '
|
1894
|
+
'in progress')
|
1895
|
+
if num_in_progress_jobs > _NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS:
|
1896
|
+
job_info += (
|
1897
|
+
f' ({_NUM_MANAGED_JOBS_TO_SHOW_IN_STATUS} latest '
|
1898
|
+
'ones shown)')
|
1899
|
+
job_info += '. '
|
1900
|
+
hints.append(
|
1901
|
+
controller_utils.Controllers.JOBS_CONTROLLER.value.
|
1902
|
+
in_progress_hint.format(job_info=job_info))
|
1903
|
+
|
1904
|
+
if show_services:
|
1905
|
+
click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
1906
|
+
f'Services{colorama.Style.RESET_ALL}')
|
1907
|
+
num_services = None
|
1908
|
+
if managed_jobs_query_interrupted:
|
1909
|
+
msg = 'KeyboardInterrupt'
|
1910
|
+
else:
|
1911
|
+
with rich_utils.client_status('[cyan]Checking services[/]'):
|
1912
|
+
try:
|
1913
|
+
num_services, msg = _handle_services_request(
|
1914
|
+
service_status_request_id,
|
1915
|
+
service_names=None,
|
1916
|
+
show_all=False,
|
1917
|
+
show_endpoint=False,
|
1918
|
+
is_called_by_user=False)
|
1919
|
+
except KeyboardInterrupt:
|
1920
|
+
sdk.api_cancel(service_status_request_id, silent=True)
|
1921
|
+
num_services = -1
|
1922
|
+
msg = 'KeyboardInterrupt'
|
1923
|
+
click.echo(msg)
|
1924
|
+
if num_services is not None:
|
1925
|
+
hints.append(controller_utils.Controllers.SKY_SERVE_CONTROLLER.
|
1926
|
+
value.in_progress_hint)
|
1927
|
+
|
1928
|
+
if num_pending_autostop > 0 and not refresh:
|
1929
|
+
# Don't print this hint if there's no pending autostop or user has
|
1930
|
+
# already passed --refresh.
|
1931
|
+
plural_and_verb = ' has'
|
1932
|
+
if num_pending_autostop > 1:
|
1933
|
+
plural_and_verb = 's have'
|
1934
|
+
hints.append(f'* {num_pending_autostop} cluster{plural_and_verb} '
|
1935
|
+
'auto{stop,down} scheduled. Refresh statuses with: '
|
1936
|
+
f'{colorama.Style.BRIGHT}sky status --refresh'
|
1937
|
+
f'{colorama.Style.RESET_ALL}')
|
1938
|
+
if hints:
|
1939
|
+
click.echo('\n' + '\n'.join(hints))
|
1805
1940
|
|
1806
1941
|
|
1807
1942
|
@cli.command()
|
@@ -1810,7 +1945,7 @@ def status(all: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1810
1945
|
default=False,
|
1811
1946
|
is_flag=True,
|
1812
1947
|
required=False,
|
1813
|
-
help='Show all information
|
1948
|
+
help='Show all cluster information.')
|
1814
1949
|
@usage_lib.entrypoint
|
1815
1950
|
def cost_report(all: bool): # pylint: disable=redefined-builtin
|
1816
1951
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
@@ -1831,7 +1966,7 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin
|
|
1831
1966
|
|
1832
1967
|
- Clusters that were terminated/stopped on the cloud console.
|
1833
1968
|
"""
|
1834
|
-
cluster_records =
|
1969
|
+
cluster_records = sdk.get(sdk.cost_report())
|
1835
1970
|
|
1836
1971
|
normal_cluster_records = []
|
1837
1972
|
controllers = dict()
|
@@ -1876,7 +2011,7 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin
|
|
1876
2011
|
|
1877
2012
|
@cli.command()
|
1878
2013
|
@click.option('--all-users',
|
1879
|
-
'-
|
2014
|
+
'-u',
|
1880
2015
|
default=False,
|
1881
2016
|
is_flag=True,
|
1882
2017
|
required=False,
|
@@ -1896,18 +2031,21 @@ def cost_report(all: bool): # pylint: disable=redefined-builtin
|
|
1896
2031
|
def queue(clusters: List[str], skip_finished: bool, all_users: bool):
|
1897
2032
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
1898
2033
|
"""Show the job queue for cluster(s)."""
|
1899
|
-
click.secho('Fetching and parsing job queue...', fg='
|
1900
|
-
if clusters:
|
1901
|
-
|
1902
|
-
|
1903
|
-
|
1904
|
-
clusters = [c['name'] for c in cluster_infos]
|
2034
|
+
click.secho('Fetching and parsing job queue...', fg='cyan')
|
2035
|
+
if not clusters:
|
2036
|
+
cluster_records = _get_cluster_records_and_set_ssh_config(
|
2037
|
+
None, all_users=all_users)
|
2038
|
+
clusters = [cluster['name'] for cluster in cluster_records]
|
1905
2039
|
|
1906
2040
|
unsupported_clusters = []
|
1907
|
-
for
|
2041
|
+
logger.info(f'Fetching job queue for: {", ".join(clusters)}')
|
2042
|
+
job_tables = {}
|
2043
|
+
|
2044
|
+
def _get_job_queue(cluster):
|
1908
2045
|
try:
|
1909
|
-
job_table =
|
1910
|
-
|
2046
|
+
job_table = sdk.stream_and_get(
|
2047
|
+
sdk.queue(cluster, skip_finished, all_users))
|
2048
|
+
except (RuntimeError, exceptions.CommandError, ValueError,
|
1911
2049
|
exceptions.NotSupportedError, exceptions.ClusterNotUpError,
|
1912
2050
|
exceptions.CloudUserIdentityError,
|
1913
2051
|
exceptions.ClusterOwnerIdentityMismatchError) as e:
|
@@ -1916,9 +2054,14 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
|
|
1916
2054
|
click.echo(f'{colorama.Fore.YELLOW}Failed to get the job queue for '
|
1917
2055
|
f'cluster {cluster!r}.{colorama.Style.RESET_ALL}\n'
|
1918
2056
|
f' {common_utils.format_exception(e)}')
|
1919
|
-
|
1920
|
-
|
1921
|
-
|
2057
|
+
return
|
2058
|
+
job_tables[cluster] = job_lib.format_job_queue(job_table)
|
2059
|
+
|
2060
|
+
subprocess_utils.run_in_parallel(_get_job_queue, clusters)
|
2061
|
+
user_str = 'all users' if all_users else 'current user'
|
2062
|
+
for cluster, job_table in job_tables.items():
|
2063
|
+
click.echo(f'\nJob queue of {user_str} on cluster {cluster}\n'
|
2064
|
+
f'{job_table}')
|
1922
2065
|
|
1923
2066
|
if unsupported_clusters:
|
1924
2067
|
click.secho(
|
@@ -1948,6 +2091,12 @@ def queue(clusters: List[str], skip_finished: bool, all_users: bool):
|
|
1948
2091
|
help=('Follow the logs of a job. '
|
1949
2092
|
'If --no-follow is specified, print the log so far and exit. '
|
1950
2093
|
'[default: --follow]'))
|
2094
|
+
@click.option(
|
2095
|
+
'--tail',
|
2096
|
+
default=0,
|
2097
|
+
type=int,
|
2098
|
+
help=('The number of lines to display from the end of the log file. '
|
2099
|
+
'Default is 0, which means print all lines.'))
|
1951
2100
|
@click.argument('cluster',
|
1952
2101
|
required=True,
|
1953
2102
|
type=str,
|
@@ -1961,6 +2110,7 @@ def logs(
|
|
1961
2110
|
sync_down: bool,
|
1962
2111
|
status: bool, # pylint: disable=redefined-outer-name
|
1963
2112
|
follow: bool,
|
2113
|
+
tail: int,
|
1964
2114
|
):
|
1965
2115
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
1966
2116
|
"""Tail the log of a job.
|
@@ -1991,25 +2141,34 @@ def logs(
|
|
1991
2141
|
job_ids = None if not job_ids else job_ids
|
1992
2142
|
|
1993
2143
|
if sync_down:
|
1994
|
-
|
2144
|
+
with rich_utils.client_status(
|
2145
|
+
ux_utils.spinner_message('Downloading logs')):
|
2146
|
+
log_local_path_dict = sdk.download_logs(cluster, job_ids)
|
2147
|
+
style = colorama.Style
|
2148
|
+
fore = colorama.Fore
|
2149
|
+
for job, log_local_path in log_local_path_dict.items():
|
2150
|
+
logger.info(f'{fore.CYAN}Job {job} logs: {log_local_path}'
|
2151
|
+
f'{style.RESET_ALL}')
|
1995
2152
|
return
|
1996
2153
|
|
1997
2154
|
assert job_ids is None or len(job_ids) <= 1, job_ids
|
1998
|
-
job_id = None
|
2155
|
+
job_id: Optional[int] = None
|
1999
2156
|
job_ids_to_query: Optional[List[int]] = None
|
2000
2157
|
if job_ids:
|
2001
2158
|
# Already check that len(job_ids) <= 1. This variable is used later
|
2002
|
-
# in
|
2003
|
-
|
2004
|
-
if not
|
2005
|
-
raise click.UsageError(f'Invalid job ID {
|
2159
|
+
# in sdk.tail_logs.
|
2160
|
+
cur_job_id = job_ids[0]
|
2161
|
+
if not cur_job_id.isdigit():
|
2162
|
+
raise click.UsageError(f'Invalid job ID {cur_job_id}. '
|
2006
2163
|
'Job ID must be integers.')
|
2007
|
-
|
2164
|
+
job_id = int(cur_job_id)
|
2165
|
+
job_ids_to_query = [int(job_ids[0])]
|
2008
2166
|
else:
|
2009
2167
|
# job_ids is either None or empty list, so it is safe to cast it here.
|
2010
2168
|
job_ids_to_query = typing.cast(Optional[List[int]], job_ids)
|
2011
2169
|
if status:
|
2012
|
-
job_statuses =
|
2170
|
+
job_statuses = sdk.stream_and_get(
|
2171
|
+
sdk.job_status(cluster, job_ids_to_query))
|
2013
2172
|
job_id = list(job_statuses.keys())[0]
|
2014
2173
|
# If job_ids is None and no job has been submitted to the cluster,
|
2015
2174
|
# it will return {None: None}.
|
@@ -2027,7 +2186,15 @@ def logs(
|
|
2027
2186
|
click.secho(f'Job {id_str}not found', fg='red')
|
2028
2187
|
sys.exit(1)
|
2029
2188
|
|
2030
|
-
|
2189
|
+
job_str = f'job {job_id}'
|
2190
|
+
if job_id is None:
|
2191
|
+
job_str = 'the last job'
|
2192
|
+
logger.info(f'{colorama.Fore.YELLOW}'
|
2193
|
+
f'Tailing logs of {job_str} on cluster {cluster!r}...'
|
2194
|
+
f'{colorama.Style.RESET_ALL}')
|
2195
|
+
|
2196
|
+
# Stream logs from the server.
|
2197
|
+
sdk.tail_logs(cluster, job_id, follow, tail=tail)
|
2031
2198
|
|
2032
2199
|
|
2033
2200
|
@cli.command()
|
@@ -2040,16 +2207,31 @@ def logs(
|
|
2040
2207
|
default=False,
|
2041
2208
|
is_flag=True,
|
2042
2209
|
required=False,
|
2043
|
-
help='Cancel all jobs on the specified cluster.'
|
2210
|
+
help='Cancel all jobs from current user on the specified cluster.'
|
2211
|
+
)
|
2212
|
+
@click.option('--all-users',
|
2213
|
+
'-u',
|
2214
|
+
default=False,
|
2215
|
+
is_flag=True,
|
2216
|
+
required=False,
|
2217
|
+
help='Cancel all jobs on the specified cluster for all users.')
|
2044
2218
|
@click.option('--yes',
|
2045
2219
|
'-y',
|
2046
2220
|
is_flag=True,
|
2047
2221
|
default=False,
|
2048
2222
|
required=False,
|
2049
2223
|
help='Skip confirmation prompt.')
|
2224
|
+
@_add_click_options(_COMMON_OPTIONS)
|
2050
2225
|
@click.argument('jobs', required=False, type=int, nargs=-1)
|
2051
2226
|
@usage_lib.entrypoint
|
2052
|
-
def cancel(
|
2227
|
+
def cancel(
|
2228
|
+
cluster: str,
|
2229
|
+
all: bool, # pylint: disable=redefined-builtin
|
2230
|
+
all_users: bool,
|
2231
|
+
jobs: List[int], # pylint: disable=redefined-outer-name
|
2232
|
+
yes: bool,
|
2233
|
+
async_call: bool,
|
2234
|
+
): # pylint: disable=redefined-builtin
|
2053
2235
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
2054
2236
|
"""Cancel job(s).
|
2055
2237
|
|
@@ -2062,30 +2244,36 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa
|
|
2062
2244
|
sky cancel cluster_name 1
|
2063
2245
|
sky cancel cluster_name 1 2 3
|
2064
2246
|
\b
|
2065
|
-
# Cancel all jobs on a cluster.
|
2247
|
+
# Cancel all your jobs on a cluster.
|
2066
2248
|
sky cancel cluster_name -a
|
2067
2249
|
\b
|
2250
|
+
# Cancel all users' jobs on a cluster.
|
2251
|
+
sky cancel cluster_name -u
|
2252
|
+
\b
|
2068
2253
|
# Cancel the latest running job on a cluster.
|
2069
2254
|
sky cancel cluster_name
|
2070
2255
|
|
2071
2256
|
Job IDs can be looked up by ``sky queue cluster_name``.
|
2072
2257
|
"""
|
2073
|
-
job_identity_str =
|
2258
|
+
job_identity_str = ''
|
2074
2259
|
job_ids_to_cancel = None
|
2075
|
-
if not jobs and not all:
|
2076
|
-
click.echo(
|
2077
|
-
|
2078
|
-
|
2260
|
+
if not jobs and not all and not all_users:
|
2261
|
+
click.echo(
|
2262
|
+
f'{colorama.Fore.YELLOW}No job IDs or --all/--all-users provided; '
|
2263
|
+
'cancelling the latest running job.'
|
2264
|
+
f'{colorama.Style.RESET_ALL}')
|
2079
2265
|
job_identity_str = 'the latest running job'
|
2266
|
+
elif all_users:
|
2267
|
+
job_identity_str = 'all users\' jobs'
|
2080
2268
|
else:
|
2081
|
-
# Cancelling specific jobs or --all.
|
2082
|
-
job_ids = ' '.join(map(str, jobs))
|
2083
|
-
plural = 's' if len(job_ids) > 1 else ''
|
2084
|
-
job_identity_str = f'job{plural} {job_ids}'
|
2085
|
-
job_ids_to_cancel = jobs
|
2086
2269
|
if all:
|
2087
|
-
job_identity_str = 'all jobs'
|
2088
|
-
|
2270
|
+
job_identity_str = 'all your jobs'
|
2271
|
+
if jobs:
|
2272
|
+
jobs_str = ' '.join(map(str, jobs))
|
2273
|
+
plural = 's' if len(jobs) > 1 else ''
|
2274
|
+
connector = ' and ' if job_identity_str else ''
|
2275
|
+
job_identity_str += f'{connector}job{plural} {jobs_str}'
|
2276
|
+
job_ids_to_cancel = jobs
|
2089
2277
|
job_identity_str += f' on cluster {cluster!r}'
|
2090
2278
|
|
2091
2279
|
if not yes:
|
@@ -2095,7 +2283,11 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa
|
|
2095
2283
|
show_default=True)
|
2096
2284
|
|
2097
2285
|
try:
|
2098
|
-
|
2286
|
+
request_id = sdk.cancel(cluster,
|
2287
|
+
all=all,
|
2288
|
+
all_users=all_users,
|
2289
|
+
job_ids=job_ids_to_cancel)
|
2290
|
+
_async_call_or_wait(request_id, async_call, 'sky.cancel')
|
2099
2291
|
except exceptions.NotSupportedError as e:
|
2100
2292
|
controller = controller_utils.Controllers.from_name(cluster)
|
2101
2293
|
assert controller is not None, cluster
|
@@ -2115,20 +2307,28 @@ def cancel(cluster: str, all: bool, jobs: List[int], yes: bool): # pylint: disa
|
|
2115
2307
|
**_get_shell_complete_args(_complete_cluster_name))
|
2116
2308
|
@click.option('--all',
|
2117
2309
|
'-a',
|
2118
|
-
default=
|
2310
|
+
default=False,
|
2119
2311
|
is_flag=True,
|
2120
2312
|
help='Stop all existing clusters.')
|
2313
|
+
@click.option('--all-users',
|
2314
|
+
'-u',
|
2315
|
+
default=False,
|
2316
|
+
is_flag=True,
|
2317
|
+
help='Stop all existing clusters for all users.')
|
2121
2318
|
@click.option('--yes',
|
2122
2319
|
'-y',
|
2123
2320
|
is_flag=True,
|
2124
2321
|
default=False,
|
2125
2322
|
required=False,
|
2126
2323
|
help='Skip confirmation prompt.')
|
2324
|
+
@_add_click_options(_COMMON_OPTIONS)
|
2127
2325
|
@usage_lib.entrypoint
|
2128
2326
|
def stop(
|
2129
2327
|
clusters: List[str],
|
2130
|
-
all:
|
2328
|
+
all: bool, # pylint: disable=redefined-builtin
|
2329
|
+
all_users: bool,
|
2131
2330
|
yes: bool,
|
2331
|
+
async_call: bool,
|
2132
2332
|
):
|
2133
2333
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
2134
2334
|
"""Stop cluster(s).
|
@@ -2161,8 +2361,10 @@ def stop(
|
|
2161
2361
|
"""
|
2162
2362
|
_down_or_stop_clusters(clusters,
|
2163
2363
|
apply_to_all=all,
|
2364
|
+
all_users=all_users,
|
2164
2365
|
down=False,
|
2165
|
-
no_confirm=yes
|
2366
|
+
no_confirm=yes,
|
2367
|
+
async_call=async_call)
|
2166
2368
|
|
2167
2369
|
|
2168
2370
|
@cli.command(cls=_DocumentedCodeCommand)
|
@@ -2172,9 +2374,14 @@ def stop(
|
|
2172
2374
|
**_get_shell_complete_args(_complete_cluster_name))
|
2173
2375
|
@click.option('--all',
|
2174
2376
|
'-a',
|
2175
|
-
default=
|
2377
|
+
default=False,
|
2378
|
+
is_flag=True,
|
2379
|
+
help='Autostop all existing clusters.')
|
2380
|
+
@click.option('--all-users',
|
2381
|
+
'-u',
|
2382
|
+
default=False,
|
2176
2383
|
is_flag=True,
|
2177
|
-
help='
|
2384
|
+
help='Autostop all existing clusters for all users.')
|
2178
2385
|
@click.option('--idle-minutes',
|
2179
2386
|
'-i',
|
2180
2387
|
type=int,
|
@@ -2202,14 +2409,17 @@ def stop(
|
|
2202
2409
|
default=False,
|
2203
2410
|
required=False,
|
2204
2411
|
help='Skip confirmation prompt.')
|
2412
|
+
@_add_click_options(_COMMON_OPTIONS)
|
2205
2413
|
@usage_lib.entrypoint
|
2206
2414
|
def autostop(
|
2207
2415
|
clusters: List[str],
|
2208
|
-
all:
|
2416
|
+
all: bool, # pylint: disable=redefined-builtin
|
2417
|
+
all_users: bool,
|
2209
2418
|
idle_minutes: Optional[int],
|
2210
2419
|
cancel: bool, # pylint: disable=redefined-outer-name
|
2211
2420
|
down: bool, # pylint: disable=redefined-outer-name
|
2212
2421
|
yes: bool,
|
2422
|
+
async_call: bool,
|
2213
2423
|
):
|
2214
2424
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
2215
2425
|
"""Schedule an autostop or autodown for cluster(s).
|
@@ -2262,9 +2472,11 @@ def autostop(
|
|
2262
2472
|
idle_minutes = 5
|
2263
2473
|
_down_or_stop_clusters(clusters,
|
2264
2474
|
apply_to_all=all,
|
2475
|
+
all_users=all_users,
|
2265
2476
|
down=down,
|
2266
2477
|
no_confirm=yes,
|
2267
|
-
idle_minutes_to_autostop=idle_minutes
|
2478
|
+
idle_minutes_to_autostop=idle_minutes,
|
2479
|
+
async_call=async_call)
|
2268
2480
|
|
2269
2481
|
|
2270
2482
|
@cli.command(cls=_DocumentedCodeCommand)
|
@@ -2327,16 +2539,19 @@ def autostop(
|
|
2327
2539
|
required=False,
|
2328
2540
|
help=('Force start the cluster even if it is already UP. Useful for '
|
2329
2541
|
'upgrading the SkyPilot runtime on the cluster.'))
|
2542
|
+
@_add_click_options(_COMMON_OPTIONS)
|
2330
2543
|
@usage_lib.entrypoint
|
2331
2544
|
# pylint: disable=redefined-builtin
|
2332
2545
|
def start(
|
2333
|
-
|
2334
|
-
|
2335
|
-
|
2336
|
-
|
2337
|
-
|
2338
|
-
|
2339
|
-
|
2546
|
+
clusters: List[str],
|
2547
|
+
all: bool,
|
2548
|
+
yes: bool,
|
2549
|
+
idle_minutes_to_autostop: Optional[int],
|
2550
|
+
down: bool, # pylint: disable=redefined-outer-name
|
2551
|
+
retry_until_up: bool,
|
2552
|
+
force: bool,
|
2553
|
+
async_call: bool,
|
2554
|
+
):
|
2340
2555
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
2341
2556
|
"""Restart cluster(s).
|
2342
2557
|
|
@@ -2370,40 +2585,45 @@ def start(
|
|
2370
2585
|
'--idle-minutes-to-autostop must be set if --down is set.')
|
2371
2586
|
to_start = []
|
2372
2587
|
|
2588
|
+
cluster_records = None
|
2373
2589
|
if not clusters and not all:
|
2374
2590
|
# UX: frequently users may have only 1 cluster. In this case, be smart
|
2375
2591
|
# and default to that unique choice.
|
2376
|
-
|
2377
|
-
|
2378
|
-
|
2592
|
+
all_clusters = _get_cluster_records_and_set_ssh_config(
|
2593
|
+
clusters=None, refresh=common.StatusRefreshMode.AUTO)
|
2594
|
+
if len(all_clusters) <= 1:
|
2595
|
+
cluster_records = all_clusters
|
2379
2596
|
else:
|
2380
2597
|
raise click.UsageError(
|
2381
2598
|
'`sky start` requires either a cluster name or glob '
|
2382
2599
|
'(see `sky status`), or the -a/--all flag.')
|
2383
2600
|
|
2384
2601
|
if all:
|
2385
|
-
if
|
2602
|
+
if clusters:
|
2386
2603
|
click.echo('Both --all and cluster(s) specified for sky start. '
|
2387
2604
|
'Letting --all take effect.')
|
2388
2605
|
|
2606
|
+
all_clusters = _get_cluster_records_and_set_ssh_config(
|
2607
|
+
clusters=None, refresh=common.StatusRefreshMode.AUTO)
|
2608
|
+
|
2389
2609
|
# Get all clusters that are not controllers.
|
2390
|
-
|
2391
|
-
cluster
|
2392
|
-
for cluster in global_user_state.get_clusters()
|
2610
|
+
cluster_records = [
|
2611
|
+
cluster for cluster in all_clusters
|
2393
2612
|
if controller_utils.Controllers.from_name(cluster['name']) is None
|
2394
2613
|
]
|
2614
|
+
if cluster_records is None:
|
2615
|
+
# Get GLOB cluster names
|
2616
|
+
cluster_records = _get_cluster_records_and_set_ssh_config(
|
2617
|
+
clusters, refresh=common.StatusRefreshMode.AUTO)
|
2395
2618
|
|
2396
|
-
if not
|
2619
|
+
if not cluster_records:
|
2397
2620
|
click.echo('Cluster(s) not found (tip: see `sky status`). Do you '
|
2398
2621
|
'mean to use `sky launch` to provision a new cluster?')
|
2399
2622
|
return
|
2400
2623
|
else:
|
2401
|
-
|
2402
|
-
|
2403
|
-
|
2404
|
-
for name in clusters:
|
2405
|
-
cluster_status, _ = backend_utils.refresh_cluster_status_handle(
|
2406
|
-
name)
|
2624
|
+
for cluster in cluster_records:
|
2625
|
+
name = cluster['name']
|
2626
|
+
cluster_status = cluster['status']
|
2407
2627
|
# A cluster may have one of the following states:
|
2408
2628
|
#
|
2409
2629
|
# STOPPED - ok to restart
|
@@ -2461,8 +2681,8 @@ def start(
|
|
2461
2681
|
'is currently not supported.\n'
|
2462
2682
|
'Please start the former independently.')
|
2463
2683
|
if controllers:
|
2464
|
-
bold =
|
2465
|
-
reset_bold =
|
2684
|
+
bold = ux_utils.BOLD
|
2685
|
+
reset_bold = ux_utils.RESET_BOLD
|
2466
2686
|
if len(controllers) != 1:
|
2467
2687
|
raise click.UsageError(
|
2468
2688
|
'Starting multiple controllers is currently not supported.\n'
|
@@ -2483,18 +2703,25 @@ def start(
|
|
2483
2703
|
abort=True,
|
2484
2704
|
show_default=True)
|
2485
2705
|
|
2486
|
-
|
2706
|
+
request_ids = subprocess_utils.run_in_parallel(
|
2707
|
+
lambda name: sdk.start(name,
|
2708
|
+
idle_minutes_to_autostop,
|
2709
|
+
retry_until_up,
|
2710
|
+
down=down,
|
2711
|
+
force=force), to_start)
|
2712
|
+
|
2713
|
+
for name, request_id in zip(to_start, request_ids):
|
2487
2714
|
try:
|
2488
|
-
|
2489
|
-
|
2490
|
-
|
2491
|
-
|
2492
|
-
force=force)
|
2715
|
+
_async_call_or_wait(request_id, async_call, 'sky.start')
|
2716
|
+
if not async_call:
|
2717
|
+
# Add ssh config for the cluster
|
2718
|
+
_get_cluster_records_and_set_ssh_config(clusters=[name])
|
2493
2719
|
except (exceptions.NotSupportedError,
|
2494
2720
|
exceptions.ClusterOwnerIdentityMismatchError) as e:
|
2495
2721
|
click.echo(str(e))
|
2496
2722
|
else:
|
2497
|
-
|
2723
|
+
if not async_call:
|
2724
|
+
click.secho(f'Cluster {name} started.', fg='green')
|
2498
2725
|
|
2499
2726
|
|
2500
2727
|
@cli.command(cls=_DocumentedCodeCommand)
|
@@ -2504,10 +2731,15 @@ def start(
|
|
2504
2731
|
**_get_shell_complete_args(_complete_cluster_name))
|
2505
2732
|
@click.option('--all',
|
2506
2733
|
'-a',
|
2507
|
-
default=
|
2734
|
+
default=False,
|
2508
2735
|
is_flag=True,
|
2509
2736
|
help='Tear down all existing clusters.')
|
2510
|
-
@click.option('--
|
2737
|
+
@click.option('--all-users',
|
2738
|
+
'-u',
|
2739
|
+
default=False,
|
2740
|
+
is_flag=True,
|
2741
|
+
help='Tear down all existing clusters for all users.')
|
2742
|
+
@click.option('--yes',
|
2511
2743
|
'-y',
|
2512
2744
|
is_flag=True,
|
2513
2745
|
default=False,
|
@@ -2525,12 +2757,15 @@ def start(
|
|
2525
2757
|
' in certain manual troubleshooting scenarios; with it set, it is the'
|
2526
2758
|
' user\'s responsibility to ensure there are no leaked instances and '
|
2527
2759
|
'related resources.'))
|
2760
|
+
@_add_click_options(_COMMON_OPTIONS)
|
2528
2761
|
@usage_lib.entrypoint
|
2529
2762
|
def down(
|
2530
2763
|
clusters: List[str],
|
2531
|
-
all:
|
2764
|
+
all: bool, # pylint: disable=redefined-builtin
|
2765
|
+
all_users: bool,
|
2532
2766
|
yes: bool,
|
2533
2767
|
purge: bool,
|
2768
|
+
async_call: bool,
|
2534
2769
|
):
|
2535
2770
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
2536
2771
|
"""Tear down cluster(s).
|
@@ -2562,12 +2797,15 @@ def down(
|
|
2562
2797
|
"""
|
2563
2798
|
_down_or_stop_clusters(clusters,
|
2564
2799
|
apply_to_all=all,
|
2800
|
+
all_users=all_users,
|
2565
2801
|
down=True,
|
2566
2802
|
no_confirm=yes,
|
2567
|
-
purge=purge
|
2803
|
+
purge=purge,
|
2804
|
+
async_call=async_call)
|
2568
2805
|
|
2569
2806
|
|
2570
|
-
def _hint_or_raise_for_down_jobs_controller(controller_name: str
|
2807
|
+
def _hint_or_raise_for_down_jobs_controller(controller_name: str,
|
2808
|
+
purge: bool) -> None:
|
2571
2809
|
"""Helper function to check job controller status before tearing it down.
|
2572
2810
|
|
2573
2811
|
Raises helpful exceptions and errors if the controller is not in a safe
|
@@ -2582,11 +2820,13 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str):
|
|
2582
2820
|
controller = controller_utils.Controllers.from_name(controller_name)
|
2583
2821
|
assert controller is not None, controller_name
|
2584
2822
|
|
2585
|
-
with rich_utils.
|
2823
|
+
with rich_utils.client_status(
|
2586
2824
|
'[bold cyan]Checking for in-progress managed jobs[/]'):
|
2587
2825
|
try:
|
2588
|
-
|
2589
|
-
|
2826
|
+
request_id = managed_jobs.queue(refresh=False,
|
2827
|
+
skip_finished=True,
|
2828
|
+
all_users=True)
|
2829
|
+
managed_jobs_ = sdk.stream_and_get(request_id)
|
2590
2830
|
except exceptions.ClusterNotUpError as e:
|
2591
2831
|
if controller.value.connection_error_hint in str(e):
|
2592
2832
|
with ux_utils.print_exception_no_traceback():
|
@@ -2609,19 +2849,26 @@ def _hint_or_raise_for_down_jobs_controller(controller_name: str):
|
|
2609
2849
|
'jobs (output of `sky jobs queue`) will be lost.')
|
2610
2850
|
click.echo(msg)
|
2611
2851
|
if managed_jobs_:
|
2612
|
-
job_table = managed_jobs.format_job_table(managed_jobs_,
|
2852
|
+
job_table = managed_jobs.format_job_table(managed_jobs_,
|
2853
|
+
show_all=False,
|
2854
|
+
show_user=True)
|
2613
2855
|
msg = controller.value.decline_down_for_dirty_controller_hint
|
2614
2856
|
# Add prefix to each line to align with the bullet point.
|
2615
2857
|
msg += '\n'.join(
|
2616
2858
|
[' ' + line for line in job_table.split('\n') if line != ''])
|
2617
|
-
|
2618
|
-
|
2859
|
+
if purge:
|
2860
|
+
logger.warning('--purge is set, ignoring the in-progress managed '
|
2861
|
+
'jobs. This could cause leaked clusters!')
|
2862
|
+
else:
|
2863
|
+
with ux_utils.print_exception_no_traceback():
|
2864
|
+
raise exceptions.NotSupportedError(msg)
|
2619
2865
|
else:
|
2620
2866
|
click.echo(' * No in-progress managed jobs found. It should be safe to '
|
2621
2867
|
'terminate (see caveats above).')
|
2622
2868
|
|
2623
2869
|
|
2624
|
-
def _hint_or_raise_for_down_sky_serve_controller(controller_name: str
|
2870
|
+
def _hint_or_raise_for_down_sky_serve_controller(controller_name: str,
|
2871
|
+
purge: bool) -> None:
|
2625
2872
|
"""Helper function to check serve controller status before tearing it down.
|
2626
2873
|
|
2627
2874
|
Raises helpful exceptions and errors if the controller is not in a safe
|
@@ -2635,9 +2882,10 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str):
|
|
2635
2882
|
"""
|
2636
2883
|
controller = controller_utils.Controllers.from_name(controller_name)
|
2637
2884
|
assert controller is not None, controller_name
|
2638
|
-
with rich_utils.
|
2885
|
+
with rich_utils.client_status('[bold cyan]Checking for live services[/]'):
|
2639
2886
|
try:
|
2640
|
-
|
2887
|
+
request_id = serve_lib.status(service_names=None)
|
2888
|
+
services = sdk.stream_and_get(request_id)
|
2641
2889
|
except exceptions.ClusterNotUpError as e:
|
2642
2890
|
if controller.value.connection_error_hint in str(e):
|
2643
2891
|
with ux_utils.print_exception_no_traceback():
|
@@ -2654,35 +2902,52 @@ def _hint_or_raise_for_down_sky_serve_controller(controller_name: str):
|
|
2654
2902
|
|
2655
2903
|
if services:
|
2656
2904
|
service_names = [service['name'] for service in services]
|
2657
|
-
|
2658
|
-
|
2659
|
-
|
2660
|
-
|
2661
|
-
|
2905
|
+
if purge:
|
2906
|
+
logger.warning('--purge is set, ignoring the in-progress services. '
|
2907
|
+
'This could cause leaked clusters!')
|
2908
|
+
else:
|
2909
|
+
with ux_utils.print_exception_no_traceback():
|
2910
|
+
msg = (controller.value.decline_down_for_dirty_controller_hint.
|
2911
|
+
format(service_names=', '.join(service_names)))
|
2912
|
+
raise exceptions.NotSupportedError(msg)
|
2662
2913
|
# Do nothing for STOPPED state, as it is safe to terminate the cluster.
|
2663
2914
|
click.echo(f'Terminate sky serve controller: {controller_name}.')
|
2664
2915
|
|
2665
2916
|
|
2666
|
-
|
2667
|
-
|
2668
|
-
|
2669
|
-
controller_utils.Controllers.
|
2670
|
-
|
2671
|
-
|
2917
|
+
def _controller_to_hint_or_raise(
|
2918
|
+
controller: controller_utils.Controllers
|
2919
|
+
) -> Callable[[str, bool], None]:
|
2920
|
+
if controller == controller_utils.Controllers.JOBS_CONTROLLER:
|
2921
|
+
return _hint_or_raise_for_down_jobs_controller
|
2922
|
+
return _hint_or_raise_for_down_sky_serve_controller
|
2672
2923
|
|
2673
2924
|
|
2674
2925
|
def _down_or_stop_clusters(
|
2675
2926
|
names: List[str],
|
2676
|
-
apply_to_all:
|
2677
|
-
|
2678
|
-
|
2927
|
+
apply_to_all: bool = False,
|
2928
|
+
all_users: bool = False,
|
2929
|
+
down: bool = False, # pylint: disable=redefined-outer-name
|
2930
|
+
no_confirm: bool = True,
|
2679
2931
|
purge: bool = False,
|
2680
|
-
idle_minutes_to_autostop: Optional[int] = None
|
2932
|
+
idle_minutes_to_autostop: Optional[int] = None,
|
2933
|
+
async_call: bool = False) -> None:
|
2681
2934
|
"""Tears down or (auto-)stops a cluster (or all clusters).
|
2682
2935
|
|
2683
2936
|
Controllers (jobs controller and sky serve controller) can only be
|
2684
2937
|
terminated if the cluster name is explicitly and uniquely specified (not
|
2685
2938
|
via glob).
|
2939
|
+
|
2940
|
+
Args:
|
2941
|
+
names: The names of the clusters to tear down or stop. If empty,
|
2942
|
+
apply_to_all or all_users must be set.
|
2943
|
+
apply_to_all: If True, apply the operation to all clusters.
|
2944
|
+
all_users: If True, apply the operation to all clusters for all users.
|
2945
|
+
down: If True, tear down the clusters.
|
2946
|
+
no_confirm: If True, skip the confirmation prompt.
|
2947
|
+
purge: If True, forcefully remove the clusters from the cluster table.
|
2948
|
+
idle_minutes_to_autostop: The number of minutes to wait before
|
2949
|
+
automatically stopping the cluster.
|
2950
|
+
async_call: If True, send the request asynchronously.
|
2686
2951
|
"""
|
2687
2952
|
if down:
|
2688
2953
|
command = 'down'
|
@@ -2690,17 +2955,12 @@ def _down_or_stop_clusters(
|
|
2690
2955
|
command = 'autostop'
|
2691
2956
|
else:
|
2692
2957
|
command = 'stop'
|
2693
|
-
if not names and apply_to_all
|
2694
|
-
|
2695
|
-
|
2696
|
-
|
2697
|
-
|
2698
|
-
|
2699
|
-
names = all_cluster_names
|
2700
|
-
else:
|
2701
|
-
raise click.UsageError(
|
2702
|
-
f'`sky {command}` requires either a cluster name or glob '
|
2703
|
-
'(see `sky status`), or the -a/--all flag.')
|
2958
|
+
if not names and not apply_to_all and not all_users:
|
2959
|
+
raise click.UsageError(
|
2960
|
+
f'`sky {command}` requires either a cluster name or glob '
|
2961
|
+
'(see `sky status`), or the -a/--all flag for all your '
|
2962
|
+
'clusters, or the -u/--all-users flag for all clusters in '
|
2963
|
+
'your team.')
|
2704
2964
|
|
2705
2965
|
operation = 'Terminating' if down else 'Stopping'
|
2706
2966
|
if idle_minutes_to_autostop is not None:
|
@@ -2711,21 +2971,23 @@ def _down_or_stop_clusters(
|
|
2711
2971
|
option_str = '{stop,down}'
|
2712
2972
|
operation = f'{verb} auto{option_str} on'
|
2713
2973
|
|
2714
|
-
|
2974
|
+
names = list(names)
|
2975
|
+
if names:
|
2715
2976
|
controllers = [
|
2716
2977
|
name for name in names
|
2717
2978
|
if controller_utils.Controllers.from_name(name) is not None
|
2718
2979
|
]
|
2719
2980
|
controllers_str = ', '.join(map(repr, controllers))
|
2720
2981
|
names = [
|
2721
|
-
name
|
2722
|
-
|
2982
|
+
cluster['name']
|
2983
|
+
for cluster in _get_cluster_records_and_set_ssh_config(names)
|
2984
|
+
if controller_utils.Controllers.from_name(cluster['name']) is None
|
2723
2985
|
]
|
2724
2986
|
|
2725
2987
|
# Make sure the controllers are explicitly specified without other
|
2726
2988
|
# normal clusters.
|
2727
2989
|
if controllers:
|
2728
|
-
if
|
2990
|
+
if names:
|
2729
2991
|
names_str = ', '.join(map(repr, names))
|
2730
2992
|
raise click.UsageError(
|
2731
2993
|
f'{operation} controller(s) '
|
@@ -2746,7 +3008,7 @@ def _down_or_stop_clusters(
|
|
2746
3008
|
controller = controller_utils.Controllers.from_name(
|
2747
3009
|
controller_name)
|
2748
3010
|
assert controller is not None
|
2749
|
-
hint_or_raise =
|
3011
|
+
hint_or_raise = _controller_to_hint_or_raise(controller)
|
2750
3012
|
try:
|
2751
3013
|
# TODO(zhwu): This hint or raise is not transactional, which
|
2752
3014
|
# means even if it passed the check with no in-progress spot
|
@@ -2755,7 +3017,7 @@ def _down_or_stop_clusters(
|
|
2755
3017
|
# `sky serve up` before typing the delete, causing a leaked
|
2756
3018
|
# managed job or service. We should make this check atomic
|
2757
3019
|
# with the termination.
|
2758
|
-
hint_or_raise(controller_name)
|
3020
|
+
hint_or_raise(controller_name, purge)
|
2759
3021
|
except (exceptions.ClusterOwnerIdentityMismatchError,
|
2760
3022
|
RuntimeError) as e:
|
2761
3023
|
if purge:
|
@@ -2776,9 +3038,10 @@ def _down_or_stop_clusters(
|
|
2776
3038
|
no_confirm = True
|
2777
3039
|
names += controllers
|
2778
3040
|
|
2779
|
-
if apply_to_all:
|
2780
|
-
all_clusters =
|
2781
|
-
|
3041
|
+
if apply_to_all or all_users:
|
3042
|
+
all_clusters = _get_cluster_records_and_set_ssh_config(
|
3043
|
+
clusters=None, all_users=all_users)
|
3044
|
+
if names:
|
2782
3045
|
click.echo(
|
2783
3046
|
f'Both --all and cluster(s) specified for `sky {command}`. '
|
2784
3047
|
'Letting --all take effect.')
|
@@ -2790,22 +3053,14 @@ def _down_or_stop_clusters(
|
|
2790
3053
|
if controller_utils.Controllers.from_name(record['name']) is None
|
2791
3054
|
]
|
2792
3055
|
|
2793
|
-
clusters =
|
2794
|
-
for name in names:
|
2795
|
-
handle = global_user_state.get_handle_from_cluster_name(name)
|
2796
|
-
if handle is None:
|
2797
|
-
# This codepath is used for 'sky down -p <controller>' when the
|
2798
|
-
# controller is not in 'sky status'. Cluster-not-found message
|
2799
|
-
# should've been printed by _get_glob_clusters() above.
|
2800
|
-
continue
|
2801
|
-
clusters.append(name)
|
3056
|
+
clusters = names
|
2802
3057
|
usage_lib.record_cluster_name_for_current_operation(clusters)
|
2803
3058
|
|
2804
3059
|
if not clusters:
|
2805
3060
|
click.echo('Cluster(s) not found (tip: see `sky status`).')
|
2806
3061
|
return
|
2807
3062
|
|
2808
|
-
if not no_confirm and
|
3063
|
+
if not no_confirm and clusters:
|
2809
3064
|
cluster_str = 'clusters' if len(clusters) > 1 else 'cluster'
|
2810
3065
|
cluster_list = ', '.join(clusters)
|
2811
3066
|
click.confirm(
|
@@ -2823,11 +3078,17 @@ def _down_or_stop_clusters(
|
|
2823
3078
|
f'[bold cyan]{operation} {len(clusters)} cluster{plural}[/]',
|
2824
3079
|
total=len(clusters))
|
2825
3080
|
|
3081
|
+
request_ids = []
|
3082
|
+
|
2826
3083
|
def _down_or_stop(name: str):
|
2827
3084
|
success_progress = False
|
2828
3085
|
if idle_minutes_to_autostop is not None:
|
2829
3086
|
try:
|
2830
|
-
|
3087
|
+
request_id = sdk.autostop(name, idle_minutes_to_autostop, down)
|
3088
|
+
request_ids.append(request_id)
|
3089
|
+
_async_call_or_wait(
|
3090
|
+
request_id, async_call,
|
3091
|
+
server_constants.REQUEST_NAME_PREFIX + operation)
|
2831
3092
|
except (exceptions.NotSupportedError,
|
2832
3093
|
exceptions.ClusterNotUpError) as e:
|
2833
3094
|
message = str(e)
|
@@ -2850,9 +3111,17 @@ def _down_or_stop_clusters(
|
|
2850
3111
|
else:
|
2851
3112
|
try:
|
2852
3113
|
if down:
|
2853
|
-
|
3114
|
+
request_id = sdk.down(name, purge=purge)
|
2854
3115
|
else:
|
2855
|
-
|
3116
|
+
request_id = sdk.stop(name, purge=purge)
|
3117
|
+
request_ids.append(request_id)
|
3118
|
+
_async_call_or_wait(
|
3119
|
+
request_id, async_call,
|
3120
|
+
server_constants.REQUEST_NAME_PREFIX + operation)
|
3121
|
+
if not async_call:
|
3122
|
+
# Remove the cluster from the SSH config file as soon as it
|
3123
|
+
# is stopped or downed.
|
3124
|
+
cluster_utils.SSHConfigHelper.remove_cluster(name)
|
2856
3125
|
except RuntimeError as e:
|
2857
3126
|
message = (
|
2858
3127
|
f'{colorama.Fore.RED}{operation} cluster {name}...failed. '
|
@@ -2883,6 +3152,10 @@ def _down_or_stop_clusters(
|
|
2883
3152
|
# Make sure the progress bar not mess up the terminal.
|
2884
3153
|
progress.refresh()
|
2885
3154
|
|
3155
|
+
if async_call:
|
3156
|
+
click.secho(f'{operation} requests are sent. Check the requests\' '
|
3157
|
+
'status with `sky request get <request_id>`.')
|
3158
|
+
|
2886
3159
|
|
2887
3160
|
@cli.command(cls=_DocumentedCodeCommand)
|
2888
3161
|
@click.argument('clouds', required=False, type=str, nargs=-1)
|
@@ -2892,6 +3165,7 @@ def _down_or_stop_clusters(
|
|
2892
3165
|
default=False,
|
2893
3166
|
help='Show the activated account for each cloud.')
|
2894
3167
|
@usage_lib.entrypoint
|
3168
|
+
# pylint: disable=redefined-outer-name
|
2895
3169
|
def check(clouds: Tuple[str], verbose: bool):
|
2896
3170
|
"""Check which clouds are available to use.
|
2897
3171
|
|
@@ -2915,7 +3189,12 @@ def check(clouds: Tuple[str], verbose: bool):
|
|
2915
3189
|
sky check aws gcp
|
2916
3190
|
"""
|
2917
3191
|
clouds_arg = clouds if len(clouds) > 0 else None
|
2918
|
-
|
3192
|
+
request_id = sdk.check(clouds=clouds_arg, verbose=verbose)
|
3193
|
+
sdk.stream_and_get(request_id)
|
3194
|
+
api_server_url = server_common.get_server_url()
|
3195
|
+
click.echo()
|
3196
|
+
click.echo(
|
3197
|
+
click.style(f'Using SkyPilot API server: {api_server_url}', fg='green'))
|
2919
3198
|
|
2920
3199
|
|
2921
3200
|
@cli.command()
|
@@ -2972,9 +3251,9 @@ def show_gpus(
|
|
2972
3251
|
and spot instances. There may be multiple regions with the same lowest
|
2973
3252
|
price.
|
2974
3253
|
|
2975
|
-
If ``--cloud kubernetes`` is specified, it will show the
|
2976
|
-
of the GPU available on a single node and the real-time
|
2977
|
-
the GPU across all nodes in the Kubernetes cluster.
|
3254
|
+
If ``--cloud kubernetes`` or ``--cloud k8s`` is specified, it will show the
|
3255
|
+
maximum quantities of the GPU available on a single node and the real-time
|
3256
|
+
availability of the GPU across all nodes in the Kubernetes cluster.
|
2978
3257
|
|
2979
3258
|
Definitions of certain fields:
|
2980
3259
|
|
@@ -3008,49 +3287,45 @@ def show_gpus(
|
|
3008
3287
|
'--all-regions and --region flags cannot be used simultaneously.')
|
3009
3288
|
|
3010
3289
|
# This will validate 'cloud' and raise if not found.
|
3011
|
-
cloud_obj =
|
3012
|
-
|
3290
|
+
cloud_obj = registry.CLOUD_REGISTRY.from_str(cloud)
|
3291
|
+
cloud_name = str(cloud_obj).lower() if cloud is not None else None
|
3013
3292
|
show_all = all
|
3014
3293
|
if show_all and accelerator_str is not None:
|
3015
3294
|
raise click.UsageError('--all is only allowed without a GPU name.')
|
3016
3295
|
|
3017
3296
|
# Kubernetes specific bools
|
3018
|
-
|
3297
|
+
enabled_clouds = sdk.get(sdk.enabled_clouds())
|
3298
|
+
cloud_is_kubernetes = isinstance(cloud_obj, clouds.Kubernetes)
|
3299
|
+
# TODO(romilb): We should move this to the backend.
|
3019
3300
|
kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None
|
3020
|
-
kubernetes_is_enabled =
|
3021
|
-
|
3022
|
-
|
3023
|
-
|
3024
|
-
raise click.UsageError(
|
3025
|
-
'The --region flag cannot be set with --cloud kubernetes.')
|
3301
|
+
kubernetes_is_enabled = clouds.cloud_in_iterable(
|
3302
|
+
clouds.Kubernetes(),
|
3303
|
+
enabled_clouds,
|
3304
|
+
)
|
3026
3305
|
|
3027
3306
|
def _list_to_str(lst):
|
3028
3307
|
return ', '.join([str(e) for e in lst])
|
3029
3308
|
|
3309
|
+
# TODO(zhwu,romilb): We should move most of these kubernetes related
|
3310
|
+
# queries into the backend, especially behind the server.
|
3030
3311
|
def _get_kubernetes_realtime_gpu_table(
|
3312
|
+
context: Optional[str] = None,
|
3031
3313
|
name_filter: Optional[str] = None,
|
3032
3314
|
quantity_filter: Optional[int] = None):
|
3033
3315
|
if quantity_filter:
|
3034
3316
|
qty_header = 'QTY_FILTER'
|
3035
3317
|
free_header = 'FILTERED_FREE_GPUS'
|
3036
3318
|
else:
|
3037
|
-
qty_header = '
|
3319
|
+
qty_header = 'REQUESTABLE_QTY_PER_NODE'
|
3038
3320
|
free_header = 'TOTAL_FREE_GPUS'
|
3039
3321
|
realtime_gpu_table = log_utils.create_table(
|
3040
3322
|
['GPU', qty_header, 'TOTAL_GPUS', free_header])
|
3041
|
-
|
3042
|
-
|
3043
|
-
|
3044
|
-
|
3045
|
-
|
3046
|
-
|
3047
|
-
case_sensitive=False)
|
3048
|
-
assert (set(counts.keys()) == set(capacity.keys()) == set(
|
3049
|
-
available.keys())), (f'Keys of counts ({list(counts.keys())}), '
|
3050
|
-
f'capacity ({list(capacity.keys())}), '
|
3051
|
-
f'and available ({list(available.keys())}) '
|
3052
|
-
'must be same.')
|
3053
|
-
if len(counts) == 0:
|
3323
|
+
realtime_gpu_availability_list = sdk.stream_and_get(
|
3324
|
+
sdk.realtime_kubernetes_gpu_availability(
|
3325
|
+
context=context,
|
3326
|
+
name_filter=name_filter,
|
3327
|
+
quantity_filter=quantity_filter))
|
3328
|
+
if not realtime_gpu_availability_list:
|
3054
3329
|
err_msg = 'No GPUs found in Kubernetes cluster. '
|
3055
3330
|
debug_msg = 'To further debug, run: sky check '
|
3056
3331
|
if name_filter is not None:
|
@@ -3062,17 +3337,43 @@ def show_gpus(
|
|
3062
3337
|
'in Kubernetes cluster. ')
|
3063
3338
|
debug_msg = ('To show available accelerators on kubernetes,'
|
3064
3339
|
' run: sky show-gpus --cloud kubernetes ')
|
3065
|
-
full_err_msg = (err_msg +
|
3340
|
+
full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
|
3066
3341
|
debug_msg)
|
3067
3342
|
raise ValueError(full_err_msg)
|
3068
|
-
|
3343
|
+
no_permissions_str = '<no permissions>'
|
3344
|
+
for realtime_gpu_availability in sorted(realtime_gpu_availability_list):
|
3345
|
+
gpu_availability = models.RealtimeGpuAvailability(
|
3346
|
+
*realtime_gpu_availability)
|
3347
|
+
available_qty = (gpu_availability.available
|
3348
|
+
if gpu_availability.available != -1 else
|
3349
|
+
no_permissions_str)
|
3069
3350
|
realtime_gpu_table.add_row([
|
3070
|
-
gpu,
|
3071
|
-
_list_to_str(counts
|
3351
|
+
gpu_availability.gpu,
|
3352
|
+
_list_to_str(gpu_availability.counts),
|
3353
|
+
gpu_availability.capacity,
|
3354
|
+
available_qty,
|
3072
3355
|
])
|
3073
3356
|
return realtime_gpu_table
|
3074
3357
|
|
3075
|
-
|
3358
|
+
# TODO(zhwu): this needs to run on remote server.
|
3359
|
+
def _get_kubernetes_node_info_table(context: Optional[str]):
|
3360
|
+
node_table = log_utils.create_table(
|
3361
|
+
['NODE_NAME', 'GPU_NAME', 'TOTAL_GPUS', 'FREE_GPUS'])
|
3362
|
+
|
3363
|
+
no_permissions_str = '<no permissions>'
|
3364
|
+
node_info_dict = sdk.stream_and_get(
|
3365
|
+
sdk.kubernetes_node_info(context=context))
|
3366
|
+
for node_name, node_info in node_info_dict.items():
|
3367
|
+
available = node_info.free[
|
3368
|
+
'accelerators_available'] if node_info.free[
|
3369
|
+
'accelerators_available'] != -1 else no_permissions_str
|
3370
|
+
node_table.add_row([
|
3371
|
+
node_name, node_info.accelerator_type,
|
3372
|
+
node_info.total['accelerator_count'], available
|
3373
|
+
])
|
3374
|
+
return node_table
|
3375
|
+
|
3376
|
+
def _output() -> Generator[str, None, None]:
|
3076
3377
|
gpu_table = log_utils.create_table(
|
3077
3378
|
['COMMON_GPU', 'AVAILABLE_QUANTITIES'])
|
3078
3379
|
tpu_table = log_utils.create_table(
|
@@ -3085,8 +3386,8 @@ def show_gpus(
|
|
3085
3386
|
# Optimization - do not poll for Kubernetes API for fetching
|
3086
3387
|
# common GPUs because that will be fetched later for the table after
|
3087
3388
|
# common GPUs.
|
3088
|
-
clouds_to_list =
|
3089
|
-
if
|
3389
|
+
clouds_to_list: Union[Optional[str], List[str]] = cloud_name
|
3390
|
+
if cloud_name is None:
|
3090
3391
|
clouds_to_list = [
|
3091
3392
|
c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes'
|
3092
3393
|
]
|
@@ -3096,12 +3397,16 @@ def show_gpus(
|
|
3096
3397
|
# Collect k8s related messages in k8s_messages and print them at end
|
3097
3398
|
print_section_titles = False
|
3098
3399
|
# If cloud is kubernetes, we want to show real-time capacity
|
3099
|
-
if kubernetes_is_enabled and (
|
3400
|
+
if kubernetes_is_enabled and (cloud_name is None or
|
3401
|
+
cloud_is_kubernetes):
|
3402
|
+
context = region
|
3403
|
+
|
3100
3404
|
try:
|
3101
3405
|
# If --cloud kubernetes is not specified, we want to catch
|
3102
3406
|
# the case where no GPUs are available on the cluster and
|
3103
3407
|
# print the warning at the end.
|
3104
|
-
k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
|
3408
|
+
k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
|
3409
|
+
context)
|
3105
3410
|
except ValueError as e:
|
3106
3411
|
if not cloud_is_kubernetes:
|
3107
3412
|
# Make it a note if cloud is not kubernetes
|
@@ -3109,9 +3414,27 @@ def show_gpus(
|
|
3109
3414
|
k8s_messages += str(e)
|
3110
3415
|
else:
|
3111
3416
|
print_section_titles = True
|
3417
|
+
context_str = f'(Context: {context})' if context else ''
|
3112
3418
|
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
3113
|
-
f'Kubernetes GPUs{
|
3419
|
+
f'Kubernetes GPUs {context_str}'
|
3420
|
+
f'{colorama.Style.RESET_ALL}\n')
|
3114
3421
|
yield from k8s_realtime_table.get_string()
|
3422
|
+
k8s_node_table = _get_kubernetes_node_info_table(context)
|
3423
|
+
yield '\n\n'
|
3424
|
+
# TODO(Doyoung): Update the message with the multi-host TPU
|
3425
|
+
# support.
|
3426
|
+
k8s_per_node_acc_message = (
|
3427
|
+
'Kubernetes per node accelerator availability ')
|
3428
|
+
if kubernetes_utils.multi_host_tpu_exists_in_cluster(
|
3429
|
+
context):
|
3430
|
+
k8s_per_node_acc_message += (
|
3431
|
+
'(Note: Multi-host TPUs are detected and excluded '
|
3432
|
+
'from the display as multi-host TPUs are not '
|
3433
|
+
'supported.)')
|
3434
|
+
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
3435
|
+
f'{k8s_per_node_acc_message}'
|
3436
|
+
f'{colorama.Style.RESET_ALL}\n')
|
3437
|
+
yield from k8s_node_table.get_string()
|
3115
3438
|
if kubernetes_autoscaling:
|
3116
3439
|
k8s_messages += (
|
3117
3440
|
'\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
|
@@ -3129,11 +3452,14 @@ def show_gpus(
|
|
3129
3452
|
yield k8s_messages
|
3130
3453
|
yield '\n\n'
|
3131
3454
|
|
3132
|
-
result =
|
3133
|
-
|
3134
|
-
|
3135
|
-
|
3136
|
-
|
3455
|
+
result = sdk.stream_and_get(
|
3456
|
+
sdk.list_accelerator_counts(
|
3457
|
+
gpus_only=True,
|
3458
|
+
clouds=clouds_to_list,
|
3459
|
+
region_filter=region,
|
3460
|
+
))
|
3461
|
+
# TODO(zhwu): handle the case where no accelerators are found,
|
3462
|
+
# especially when --region specified a non-existent region.
|
3137
3463
|
|
3138
3464
|
if print_section_titles:
|
3139
3465
|
# If section titles were printed above, print again here
|
@@ -3151,7 +3477,7 @@ def show_gpus(
|
|
3151
3477
|
for tpu in service_catalog.get_tpus():
|
3152
3478
|
if tpu in result:
|
3153
3479
|
tpu_table.add_row([tpu, _list_to_str(result.pop(tpu))])
|
3154
|
-
if
|
3480
|
+
if tpu_table.get_string():
|
3155
3481
|
yield '\n\n'
|
3156
3482
|
yield from tpu_table.get_string()
|
3157
3483
|
|
@@ -3192,13 +3518,14 @@ def show_gpus(
|
|
3192
3518
|
name, quantity = accelerator_str, None
|
3193
3519
|
|
3194
3520
|
print_section_titles = False
|
3195
|
-
if (kubernetes_is_enabled and
|
3196
|
-
|
3521
|
+
if (kubernetes_is_enabled and
|
3522
|
+
(cloud_name is None or cloud_is_kubernetes) and not show_all):
|
3197
3523
|
# Print section title if not showing all and instead a specific
|
3198
3524
|
# accelerator is requested
|
3199
3525
|
print_section_titles = True
|
3200
3526
|
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
3201
3527
|
f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
|
3528
|
+
# TODO(romilb): Show filtered per node GPU availability here as well
|
3202
3529
|
try:
|
3203
3530
|
k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
|
3204
3531
|
name_filter=name, quantity_filter=quantity)
|
@@ -3220,16 +3547,17 @@ def show_gpus(
|
|
3220
3547
|
|
3221
3548
|
# For clouds other than Kubernetes, get the accelerator details
|
3222
3549
|
# Case-sensitive
|
3223
|
-
result =
|
3224
|
-
|
3225
|
-
|
3226
|
-
|
3227
|
-
|
3228
|
-
|
3229
|
-
|
3550
|
+
result = sdk.stream_and_get(
|
3551
|
+
sdk.list_accelerators(gpus_only=True,
|
3552
|
+
name_filter=name,
|
3553
|
+
quantity_filter=quantity,
|
3554
|
+
region_filter=region,
|
3555
|
+
clouds=clouds_to_list,
|
3556
|
+
case_sensitive=False,
|
3557
|
+
all_regions=all_regions))
|
3230
3558
|
# Import here to save module load speed.
|
3231
3559
|
# pylint: disable=import-outside-toplevel,line-too-long
|
3232
|
-
from sky.clouds.service_catalog import common
|
3560
|
+
from sky.clouds.service_catalog import common as catalog_common
|
3233
3561
|
|
3234
3562
|
# For each gpu name (count not included):
|
3235
3563
|
# - Group by cloud
|
@@ -3250,7 +3578,7 @@ def show_gpus(
|
|
3250
3578
|
df = df.sort_values(by=['min_price', 'min_spot_price'])
|
3251
3579
|
df = df.drop(columns=['min_price', 'min_spot_price'])
|
3252
3580
|
sorted_dataclasses = [
|
3253
|
-
|
3581
|
+
catalog_common.InstanceTypeInfo(*row)
|
3254
3582
|
for row in df.to_records(index=False)
|
3255
3583
|
]
|
3256
3584
|
new_result[gpu] = sorted_dataclasses
|
@@ -3261,10 +3589,10 @@ def show_gpus(
|
|
3261
3589
|
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
3262
3590
|
f'Cloud GPUs{colorama.Style.RESET_ALL}\n')
|
3263
3591
|
|
3264
|
-
if
|
3592
|
+
if not result:
|
3265
3593
|
quantity_str = (f' with requested quantity {quantity}'
|
3266
3594
|
if quantity else '')
|
3267
|
-
cloud_str = f' on {cloud_obj}.' if
|
3595
|
+
cloud_str = f' on {cloud_obj}.' if cloud_name else ' in cloud catalogs.'
|
3268
3596
|
yield f'Resources \'{name}\'{quantity_str} not found{cloud_str} '
|
3269
3597
|
yield 'To show available accelerators, run: sky show-gpus --all'
|
3270
3598
|
return
|
@@ -3325,10 +3653,11 @@ def show_gpus(
|
|
3325
3653
|
yield '\n\n'
|
3326
3654
|
yield from accelerator_table.get_string()
|
3327
3655
|
|
3656
|
+
outputs = _output()
|
3328
3657
|
if show_all:
|
3329
|
-
click.echo_via_pager(
|
3658
|
+
click.echo_via_pager(outputs)
|
3330
3659
|
else:
|
3331
|
-
for out in
|
3660
|
+
for out in outputs:
|
3332
3661
|
click.echo(out, nl=False)
|
3333
3662
|
click.echo()
|
3334
3663
|
|
@@ -3340,18 +3669,20 @@ def storage():
|
|
3340
3669
|
|
3341
3670
|
|
3342
3671
|
@storage.command('ls', cls=_DocumentedCodeCommand)
|
3343
|
-
@click.option('--
|
3344
|
-
'-
|
3672
|
+
@click.option('--verbose',
|
3673
|
+
'-v',
|
3345
3674
|
default=False,
|
3346
3675
|
is_flag=True,
|
3347
3676
|
required=False,
|
3348
3677
|
help='Show all information in full.')
|
3349
3678
|
@usage_lib.entrypoint
|
3350
3679
|
# pylint: disable=redefined-builtin
|
3351
|
-
def storage_ls(
|
3680
|
+
def storage_ls(verbose: bool):
|
3352
3681
|
"""List storage objects managed by SkyPilot."""
|
3353
|
-
|
3354
|
-
|
3682
|
+
request_id = sdk.storage_ls()
|
3683
|
+
storages = sdk.stream_and_get(request_id)
|
3684
|
+
storage_table = storage_utils.format_storage_table(storages,
|
3685
|
+
show_all=verbose)
|
3355
3686
|
click.echo(storage_table)
|
3356
3687
|
|
3357
3688
|
|
@@ -3373,8 +3704,9 @@ def storage_ls(all: bool):
|
|
3373
3704
|
is_flag=True,
|
3374
3705
|
required=False,
|
3375
3706
|
help='Skip confirmation prompt.')
|
3707
|
+
@_add_click_options(_COMMON_OPTIONS)
|
3376
3708
|
@usage_lib.entrypoint
|
3377
|
-
def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=redefined-builtin
|
3709
|
+
def storage_delete(names: List[str], all: bool, yes: bool, async_call: bool): # pylint: disable=redefined-builtin
|
3378
3710
|
"""Delete storage objects.
|
3379
3711
|
|
3380
3712
|
Examples:
|
@@ -3390,14 +3722,13 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
|
|
3390
3722
|
# Delete all storage objects.
|
3391
3723
|
sky storage delete -a
|
3392
3724
|
"""
|
3393
|
-
if sum([
|
3725
|
+
if sum([bool(names), all]) != 1:
|
3394
3726
|
raise click.UsageError('Either --all or a name must be specified.')
|
3395
3727
|
if all:
|
3396
|
-
storages =
|
3728
|
+
storages = sdk.get(sdk.storage_ls())
|
3397
3729
|
if not storages:
|
3398
3730
|
click.echo('No storage(s) to delete.')
|
3399
3731
|
return
|
3400
|
-
names = [s['name'] for s in storages]
|
3401
3732
|
else:
|
3402
3733
|
names = _get_glob_storages(names)
|
3403
3734
|
if names:
|
@@ -3411,13 +3742,25 @@ def storage_delete(names: List[str], all: bool, yes: bool): # pylint: disable=r
|
|
3411
3742
|
abort=True,
|
3412
3743
|
show_default=True)
|
3413
3744
|
|
3414
|
-
|
3745
|
+
request_ids = {}
|
3746
|
+
# TODO(zhwu): Support all flag for the underlying SDK and API server to
|
3747
|
+
# avoid multiple requests.
|
3748
|
+
for name in names:
|
3749
|
+
request_ids[name] = sdk.storage_delete(name)
|
3415
3750
|
|
3751
|
+
for name, request_id in request_ids.items():
|
3752
|
+
try:
|
3753
|
+
_async_call_or_wait(request_id, async_call, 'sky.storage')
|
3754
|
+
except Exception as e: # pylint: disable=broad-except
|
3755
|
+
logger.error(f'{colorama.Fore.RED}Error deleting storage {name}: '
|
3756
|
+
f'{common_utils.format_exception(e, use_bracket=True)}'
|
3757
|
+
f'{colorama.Style.RESET_ALL}')
|
3416
3758
|
|
3417
|
-
|
3759
|
+
|
3760
|
+
@cli.group(cls=_NaturalOrderGroup, hidden=True)
|
3418
3761
|
def bench():
|
3419
3762
|
"""SkyPilot Benchmark CLI."""
|
3420
|
-
|
3763
|
+
raise click.UsageError('The benchmark CLI is currently disabled.')
|
3421
3764
|
|
3422
3765
|
|
3423
3766
|
@cli.group(cls=_NaturalOrderGroup)
|
@@ -3433,13 +3776,14 @@ def jobs():
|
|
3433
3776
|
nargs=-1,
|
3434
3777
|
**_get_shell_complete_args(_complete_file_name))
|
3435
3778
|
# TODO(zhwu): Add --dryrun option to test the launch command.
|
3436
|
-
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS
|
3779
|
+
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _EXTRA_RESOURCES_OPTIONS +
|
3780
|
+
_COMMON_OPTIONS)
|
3437
3781
|
@click.option('--cluster',
|
3438
3782
|
'-c',
|
3439
3783
|
default=None,
|
3440
3784
|
type=str,
|
3441
3785
|
hidden=True,
|
3442
|
-
help=('Alias for --name, the name of the
|
3786
|
+
help=('Alias for --name, the name of the managed job.'))
|
3443
3787
|
@click.option('--job-recovery',
|
3444
3788
|
default=None,
|
3445
3789
|
type=str,
|
@@ -3451,18 +3795,6 @@ def jobs():
|
|
3451
3795
|
is_flag=True,
|
3452
3796
|
help=('If True, as soon as a job is submitted, return from this call '
|
3453
3797
|
'and do not stream execution logs.'))
|
3454
|
-
@click.option(
|
3455
|
-
'--retry-until-up/--no-retry-until-up',
|
3456
|
-
'-r/-no-r',
|
3457
|
-
default=None,
|
3458
|
-
is_flag=True,
|
3459
|
-
required=False,
|
3460
|
-
help=(
|
3461
|
-
'(Default: True; this flag is deprecated and will be removed in a '
|
3462
|
-
'future release.) Whether to retry provisioning infinitely until the '
|
3463
|
-
'cluster is up, if unavailability errors are encountered. This ' # pylint: disable=bad-docstring-quotes
|
3464
|
-
'applies to launching all managed jobs (both the initial and '
|
3465
|
-
'any recovery attempts), not the jobs controller.'))
|
3466
3798
|
@click.option('--yes',
|
3467
3799
|
'-y',
|
3468
3800
|
is_flag=True,
|
@@ -3493,8 +3825,8 @@ def jobs_launch(
|
|
3493
3825
|
disk_tier: Optional[str],
|
3494
3826
|
ports: Tuple[str],
|
3495
3827
|
detach_run: bool,
|
3496
|
-
retry_until_up: bool,
|
3497
3828
|
yes: bool,
|
3829
|
+
async_call: bool,
|
3498
3830
|
):
|
3499
3831
|
"""Launch a managed job from a YAML or a command.
|
3500
3832
|
|
@@ -3536,19 +3868,6 @@ def jobs_launch(
|
|
3536
3868
|
ports=ports,
|
3537
3869
|
job_recovery=job_recovery,
|
3538
3870
|
)
|
3539
|
-
# Deprecation. We set the default behavior to be retry until up, and the
|
3540
|
-
# flag `--retry-until-up` is deprecated. We can remove the flag in 0.8.0.
|
3541
|
-
if retry_until_up is not None:
|
3542
|
-
flag_str = '--retry-until-up'
|
3543
|
-
if not retry_until_up:
|
3544
|
-
flag_str = '--no-retry-until-up'
|
3545
|
-
click.secho(
|
3546
|
-
f'Flag {flag_str} is deprecated and will be removed in a '
|
3547
|
-
'future release (managed jobs will always be retried). '
|
3548
|
-
'Please file an issue if this does not work for you.',
|
3549
|
-
fg='yellow')
|
3550
|
-
else:
|
3551
|
-
retry_until_up = True
|
3552
3871
|
|
3553
3872
|
if not isinstance(task_or_dag, sky.Dag):
|
3554
3873
|
assert isinstance(task_or_dag, sky.Task), task_or_dag
|
@@ -3564,26 +3883,25 @@ def jobs_launch(
|
|
3564
3883
|
dag_utils.maybe_infer_and_fill_dag_and_task_names(dag)
|
3565
3884
|
dag_utils.fill_default_config_in_dag_for_job_launch(dag)
|
3566
3885
|
|
3886
|
+
common_utils.check_cluster_name_is_valid(name)
|
3887
|
+
|
3567
3888
|
click.secho(f'Managed job {dag.name!r} will be launched on (estimated):',
|
3568
3889
|
fg='yellow')
|
3569
|
-
dag = sky.optimize(dag)
|
3570
|
-
|
3571
|
-
if not yes:
|
3572
|
-
prompt = f'Launching a managed job {dag.name!r}. Proceed?'
|
3573
|
-
if prompt is not None:
|
3574
|
-
click.confirm(prompt, default=True, abort=True, show_default=True)
|
3575
|
-
|
3576
|
-
common_utils.check_cluster_name_is_valid(name)
|
3577
3890
|
|
3578
|
-
managed_jobs.launch(dag,
|
3579
|
-
|
3580
|
-
|
3581
|
-
|
3891
|
+
request_id = managed_jobs.launch(dag, name, _need_confirmation=not yes)
|
3892
|
+
job_id_handle = _async_call_or_wait(request_id, async_call,
|
3893
|
+
'sky.jobs.launch')
|
3894
|
+
if not async_call and not detach_run:
|
3895
|
+
job_id = job_id_handle[0]
|
3896
|
+
managed_jobs.tail_logs(name=None,
|
3897
|
+
job_id=job_id,
|
3898
|
+
follow=True,
|
3899
|
+
controller=False)
|
3582
3900
|
|
3583
3901
|
|
3584
3902
|
@jobs.command('queue', cls=_DocumentedCodeCommand)
|
3585
|
-
@click.option('--
|
3586
|
-
'-
|
3903
|
+
@click.option('--verbose',
|
3904
|
+
'-v',
|
3587
3905
|
default=False,
|
3588
3906
|
is_flag=True,
|
3589
3907
|
required=False,
|
@@ -3602,9 +3920,16 @@ def jobs_launch(
|
|
3602
3920
|
is_flag=True,
|
3603
3921
|
required=False,
|
3604
3922
|
help='Show only pending/running jobs\' information.')
|
3923
|
+
@click.option('--all-users',
|
3924
|
+
'-u',
|
3925
|
+
default=False,
|
3926
|
+
is_flag=True,
|
3927
|
+
required=False,
|
3928
|
+
help='Show jobs from all users.')
|
3605
3929
|
@usage_lib.entrypoint
|
3606
3930
|
# pylint: disable=redefined-builtin
|
3607
|
-
def jobs_queue(
|
3931
|
+
def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
|
3932
|
+
all_users: bool):
|
3608
3933
|
"""Show statuses of managed jobs.
|
3609
3934
|
|
3610
3935
|
Each managed jobs can have one of the following statuses:
|
@@ -3658,12 +3983,14 @@ def jobs_queue(all: bool, refresh: bool, skip_finished: bool):
|
|
3658
3983
|
watch -n60 sky jobs queue
|
3659
3984
|
|
3660
3985
|
"""
|
3661
|
-
click.secho('Fetching managed job statuses...', fg='
|
3662
|
-
with rich_utils.
|
3663
|
-
|
3664
|
-
|
3665
|
-
|
3666
|
-
|
3986
|
+
click.secho('Fetching managed job statuses...', fg='cyan')
|
3987
|
+
with rich_utils.client_status('[cyan]Checking managed jobs[/]'):
|
3988
|
+
managed_jobs_request_id = managed_jobs.queue(
|
3989
|
+
refresh=refresh, skip_finished=skip_finished, all_users=all_users)
|
3990
|
+
_, msg = _handle_jobs_queue_request(managed_jobs_request_id,
|
3991
|
+
show_all=verbose,
|
3992
|
+
show_user=all_users,
|
3993
|
+
is_called_by_user=True)
|
3667
3994
|
if not skip_finished:
|
3668
3995
|
in_progress_only_hint = ''
|
3669
3996
|
else:
|
@@ -3685,16 +4012,23 @@ def jobs_queue(all: bool, refresh: bool, skip_finished: bool):
|
|
3685
4012
|
is_flag=True,
|
3686
4013
|
default=False,
|
3687
4014
|
required=False,
|
3688
|
-
help='Cancel all managed jobs.')
|
4015
|
+
help='Cancel all managed jobs for the current user.')
|
3689
4016
|
@click.option('--yes',
|
3690
4017
|
'-y',
|
3691
4018
|
is_flag=True,
|
3692
4019
|
default=False,
|
3693
4020
|
required=False,
|
3694
4021
|
help='Skip confirmation prompt.')
|
4022
|
+
@click.option('--all-users',
|
4023
|
+
'-u',
|
4024
|
+
is_flag=True,
|
4025
|
+
default=False,
|
4026
|
+
required=False,
|
4027
|
+
help='Cancel all managed jobs from all users.')
|
3695
4028
|
@usage_lib.entrypoint
|
3696
4029
|
# pylint: disable=redefined-builtin
|
3697
|
-
def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool
|
4030
|
+
def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
|
4031
|
+
all_users: bool):
|
3698
4032
|
"""Cancel managed jobs.
|
3699
4033
|
|
3700
4034
|
You can provide either a job name or a list of job IDs to be cancelled.
|
@@ -3710,31 +4044,34 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
|
|
3710
4044
|
# Cancel managed jobs with IDs 1, 2, 3
|
3711
4045
|
$ sky jobs cancel 1 2 3
|
3712
4046
|
"""
|
3713
|
-
backend_utils.is_controller_accessible(
|
3714
|
-
controller=controller_utils.Controllers.JOBS_CONTROLLER,
|
3715
|
-
stopped_message='All managed jobs should have finished.',
|
3716
|
-
exit_if_not_accessible=True)
|
3717
|
-
|
3718
4047
|
job_id_str = ','.join(map(str, job_ids))
|
3719
|
-
if sum([
|
3720
|
-
|
3721
|
-
|
3722
|
-
|
4048
|
+
if sum([bool(job_ids), name is not None, all or all_users]) != 1:
|
4049
|
+
arguments = []
|
4050
|
+
arguments += [f'--job-ids {job_id_str}'] if job_ids else []
|
4051
|
+
arguments += [f'--name {name}'] if name is not None else []
|
4052
|
+
arguments += ['--all'] if all else []
|
4053
|
+
arguments += ['--all-users'] if all_users else []
|
3723
4054
|
raise click.UsageError(
|
3724
|
-
'Can only specify one of JOB_IDS
|
3725
|
-
f'Provided {
|
4055
|
+
'Can only specify one of JOB_IDS, --name, or --all/--all-users. '
|
4056
|
+
f'Provided {" ".join(arguments)!r}.')
|
3726
4057
|
|
3727
4058
|
if not yes:
|
3728
4059
|
job_identity_str = (f'managed jobs with IDs {job_id_str}'
|
3729
4060
|
if job_ids else repr(name))
|
3730
|
-
if
|
4061
|
+
if all_users:
|
4062
|
+
job_identity_str = 'all managed jobs FOR ALL USERS'
|
4063
|
+
elif all:
|
3731
4064
|
job_identity_str = 'all managed jobs'
|
3732
4065
|
click.confirm(f'Cancelling {job_identity_str}. Proceed?',
|
3733
4066
|
default=True,
|
3734
4067
|
abort=True,
|
3735
4068
|
show_default=True)
|
3736
4069
|
|
3737
|
-
|
4070
|
+
sdk.stream_and_get(
|
4071
|
+
managed_jobs.cancel(job_ids=job_ids,
|
4072
|
+
name=name,
|
4073
|
+
all=all,
|
4074
|
+
all_users=all_users))
|
3738
4075
|
|
3739
4076
|
|
3740
4077
|
@jobs.command('logs', cls=_DocumentedCodeCommand)
|
@@ -3755,97 +4092,56 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool):
|
|
3755
4092
|
default=False,
|
3756
4093
|
help=('Show the controller logs of this job; useful for debugging '
|
3757
4094
|
'launching/recoveries, etc.'))
|
4095
|
+
@click.option(
|
4096
|
+
'--refresh',
|
4097
|
+
'-r',
|
4098
|
+
default=False,
|
4099
|
+
is_flag=True,
|
4100
|
+
required=False,
|
4101
|
+
help='Query the latest job logs, restarting the jobs controller if stopped.'
|
4102
|
+
)
|
4103
|
+
@click.option('--sync-down',
|
4104
|
+
'-s',
|
4105
|
+
default=False,
|
4106
|
+
is_flag=True,
|
4107
|
+
required=False,
|
4108
|
+
help='Download logs for all jobs shown in the queue.')
|
3758
4109
|
@click.argument('job_id', required=False, type=int)
|
3759
4110
|
@usage_lib.entrypoint
|
3760
4111
|
def jobs_logs(name: Optional[str], job_id: Optional[int], follow: bool,
|
3761
|
-
controller: bool):
|
3762
|
-
"""Tail the log of a managed job."""
|
4112
|
+
controller: bool, refresh: bool, sync_down: bool):
|
4113
|
+
"""Tail or sync down the log of a managed job."""
|
3763
4114
|
try:
|
3764
|
-
|
3765
|
-
|
3766
|
-
|
3767
|
-
|
4115
|
+
if sync_down:
|
4116
|
+
with rich_utils.client_status(
|
4117
|
+
ux_utils.spinner_message('Downloading jobs logs')):
|
4118
|
+
log_local_path_dict = managed_jobs.download_logs(
|
4119
|
+
name=name,
|
4120
|
+
job_id=job_id,
|
4121
|
+
controller=controller,
|
4122
|
+
refresh=refresh)
|
4123
|
+
style = colorama.Style
|
4124
|
+
fore = colorama.Fore
|
4125
|
+
controller_str = ' (controller)' if controller else ''
|
4126
|
+
for job, log_local_path in log_local_path_dict.items():
|
4127
|
+
logger.info(f'{fore.CYAN}Job {job} logs{controller_str}: '
|
4128
|
+
f'{log_local_path}{style.RESET_ALL}')
|
4129
|
+
else:
|
4130
|
+
managed_jobs.tail_logs(name=name,
|
4131
|
+
job_id=job_id,
|
4132
|
+
follow=follow,
|
4133
|
+
controller=controller,
|
4134
|
+
refresh=refresh)
|
3768
4135
|
except exceptions.ClusterNotUpError:
|
3769
4136
|
with ux_utils.print_exception_no_traceback():
|
3770
4137
|
raise
|
3771
4138
|
|
3772
4139
|
|
3773
4140
|
@jobs.command('dashboard', cls=_DocumentedCodeCommand)
|
3774
|
-
@click.option(
|
3775
|
-
'--port',
|
3776
|
-
'-p',
|
3777
|
-
default=None,
|
3778
|
-
type=int,
|
3779
|
-
required=False,
|
3780
|
-
help=('Local port to use for the dashboard. If None, a free port is '
|
3781
|
-
'automatically chosen.'))
|
3782
4141
|
@usage_lib.entrypoint
|
3783
|
-
def jobs_dashboard(
|
3784
|
-
"""Opens a dashboard for managed jobs
|
3785
|
-
|
3786
|
-
# API perhaps via REST. Then here we would (1) not have to use SSH to try to
|
3787
|
-
# see if the controller is UP first, which is slow; (2) not have to run SSH
|
3788
|
-
# port forwarding first (we'd just launch a local dashboard which would make
|
3789
|
-
# REST API calls to the controller dashboard server).
|
3790
|
-
click.secho('Checking if jobs controller is up...', fg='yellow')
|
3791
|
-
hint = ('Dashboard is not available if jobs controller is not up. Run a '
|
3792
|
-
'managed job first.')
|
3793
|
-
backend_utils.is_controller_accessible(
|
3794
|
-
controller=controller_utils.Controllers.JOBS_CONTROLLER,
|
3795
|
-
stopped_message=hint,
|
3796
|
-
non_existent_message=hint,
|
3797
|
-
exit_if_not_accessible=True)
|
3798
|
-
|
3799
|
-
# SSH forward a free local port to remote's dashboard port.
|
3800
|
-
remote_port = constants.SPOT_DASHBOARD_REMOTE_PORT
|
3801
|
-
if port is None:
|
3802
|
-
free_port = common_utils.find_free_port(remote_port)
|
3803
|
-
else:
|
3804
|
-
free_port = port
|
3805
|
-
ssh_command = (
|
3806
|
-
f'ssh -qNL {free_port}:localhost:{remote_port} '
|
3807
|
-
f'{controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name}')
|
3808
|
-
click.echo('Forwarding port: ', nl=False)
|
3809
|
-
click.secho(f'{ssh_command}', dim=True)
|
3810
|
-
|
3811
|
-
with subprocess.Popen(ssh_command, shell=True,
|
3812
|
-
start_new_session=True) as ssh_process:
|
3813
|
-
time.sleep(3) # Added delay for ssh_command to initialize.
|
3814
|
-
webbrowser.open(f'http://localhost:{free_port}')
|
3815
|
-
click.secho(
|
3816
|
-
f'Dashboard is now available at: http://127.0.0.1:{free_port}',
|
3817
|
-
fg='green')
|
3818
|
-
try:
|
3819
|
-
ssh_process.wait()
|
3820
|
-
except KeyboardInterrupt:
|
3821
|
-
# When user presses Ctrl-C in terminal, exits the previous ssh
|
3822
|
-
# command so that <free local port> is freed up.
|
3823
|
-
try:
|
3824
|
-
os.killpg(os.getpgid(ssh_process.pid), signal.SIGTERM)
|
3825
|
-
except ProcessLookupError:
|
3826
|
-
# This happens if jobs controller is auto-stopped.
|
3827
|
-
pass
|
3828
|
-
finally:
|
3829
|
-
click.echo('Exiting.')
|
3830
|
-
|
3831
|
-
|
3832
|
-
# TODO(zhwu): Backward compatibility for the old `sky spot launch` command.
|
3833
|
-
# It is now renamed to `sky jobs launch` and the old command is deprecated.
|
3834
|
-
# Remove in v0.8.0.
|
3835
|
-
@cli.group(cls=_NaturalOrderGroup)
|
3836
|
-
def spot():
|
3837
|
-
"""Alias for Managed Jobs CLI (default to managed spot jobs)."""
|
3838
|
-
pass
|
3839
|
-
|
3840
|
-
|
3841
|
-
_add_command_alias(jobs,
|
3842
|
-
jobs_launch,
|
3843
|
-
new_group=spot,
|
3844
|
-
override_command_argument={'use_spot': True})
|
3845
|
-
_add_command_alias(jobs, jobs_queue, new_group=spot)
|
3846
|
-
_add_command_alias(jobs, jobs_logs, new_group=spot)
|
3847
|
-
_add_command_alias(jobs, jobs_cancel, new_group=spot)
|
3848
|
-
_add_command_alias(jobs, jobs_dashboard, new_group=spot)
|
4142
|
+
def jobs_dashboard():
|
4143
|
+
"""Opens a dashboard for managed jobs."""
|
4144
|
+
managed_jobs.dashboard()
|
3849
4145
|
|
3850
4146
|
|
3851
4147
|
@cli.group(cls=_NaturalOrderGroup)
|
@@ -3868,7 +4164,7 @@ def _generate_task_with_service(
|
|
3868
4164
|
env: List[Tuple[str, str]],
|
3869
4165
|
gpus: Optional[str],
|
3870
4166
|
instance_type: Optional[str],
|
3871
|
-
ports: Tuple[str],
|
4167
|
+
ports: Optional[Tuple[str]],
|
3872
4168
|
cpus: Optional[str],
|
3873
4169
|
memory: Optional[str],
|
3874
4170
|
disk_size: Optional[int],
|
@@ -3900,7 +4196,6 @@ def _generate_task_with_service(
|
|
3900
4196
|
disk_size=disk_size,
|
3901
4197
|
disk_tier=disk_tier,
|
3902
4198
|
ports=ports,
|
3903
|
-
entrypoint_name='Service',
|
3904
4199
|
)
|
3905
4200
|
if isinstance(task, sky.Dag):
|
3906
4201
|
raise click.UsageError(
|
@@ -3910,32 +4205,64 @@ def _generate_task_with_service(
|
|
3910
4205
|
with ux_utils.print_exception_no_traceback():
|
3911
4206
|
raise ValueError('Service section not found in the YAML file. '
|
3912
4207
|
'To fix, add a valid `service` field.')
|
3913
|
-
|
3914
|
-
|
3915
|
-
|
3916
|
-
|
3917
|
-
|
3918
|
-
|
3919
|
-
|
3920
|
-
|
3921
|
-
|
3922
|
-
|
3923
|
-
|
3924
|
-
|
3925
|
-
|
3926
|
-
|
3927
|
-
|
3928
|
-
|
3929
|
-
|
3930
|
-
|
3931
|
-
|
3932
|
-
|
3933
|
-
|
3934
|
-
|
3935
|
-
|
3936
|
-
|
3937
|
-
|
3938
|
-
|
4208
|
+
|
4209
|
+
# NOTE(yi): we only allow one service port now.
|
4210
|
+
service_port: Optional[int] = int(
|
4211
|
+
task.service.ports) if task.service.ports is not None else None
|
4212
|
+
if service_port is None:
|
4213
|
+
for requested_resources in list(task.resources):
|
4214
|
+
if requested_resources.ports is None:
|
4215
|
+
with ux_utils.print_exception_no_traceback():
|
4216
|
+
raise ValueError(
|
4217
|
+
'Must specify at least one ports in resources. Each '
|
4218
|
+
'replica will use the port specified as application '
|
4219
|
+
'ingress port if only one port is specified in the '
|
4220
|
+
'replica resources. If there are multiple ports opened '
|
4221
|
+
'in the replica, please set the `service.ports` field '
|
4222
|
+
'in the service config.')
|
4223
|
+
requested_ports = list(
|
4224
|
+
resources_utils.port_ranges_to_set(requested_resources.ports))
|
4225
|
+
if len(requested_ports) > 1:
|
4226
|
+
with ux_utils.print_exception_no_traceback():
|
4227
|
+
raise ValueError(
|
4228
|
+
'Multiple ports specified in resources. Please '
|
4229
|
+
'specify the main port in the `service.ports` field.')
|
4230
|
+
# We request all the replicas using the same port for now, but it
|
4231
|
+
# should be fine to allow different replicas to use different ports
|
4232
|
+
# in the future.
|
4233
|
+
resource_port = requested_ports[0]
|
4234
|
+
if service_port is None:
|
4235
|
+
service_port = resource_port
|
4236
|
+
if service_port != resource_port:
|
4237
|
+
with ux_utils.print_exception_no_traceback():
|
4238
|
+
raise ValueError(
|
4239
|
+
f'Got multiple ports: {service_port} and '
|
4240
|
+
f'{resource_port} in different resources. '
|
4241
|
+
'Please specify the same port in all replicas, or '
|
4242
|
+
'explicitly set the service port in the '
|
4243
|
+
'`service.ports` section.')
|
4244
|
+
assert service_port is not None
|
4245
|
+
task.service.set_ports(str(service_port))
|
4246
|
+
else:
|
4247
|
+
for requested_resources in list(task.resources):
|
4248
|
+
if requested_resources.ports is None:
|
4249
|
+
with ux_utils.print_exception_no_traceback():
|
4250
|
+
raise ValueError(
|
4251
|
+
'Must specify at least one ports in every replica '
|
4252
|
+
'resources.')
|
4253
|
+
ports_set = resources_utils.port_ranges_to_set(
|
4254
|
+
requested_resources.ports)
|
4255
|
+
if service_port not in ports_set:
|
4256
|
+
with ux_utils.print_exception_no_traceback():
|
4257
|
+
# TODO(tian): Automatically infer resource port from
|
4258
|
+
# service port if none of them is specified in the
|
4259
|
+
# replica resources.
|
4260
|
+
raise ValueError(
|
4261
|
+
f'The service port {service_port} specified in the '
|
4262
|
+
'service section is not found in some resources. '
|
4263
|
+
'Please check if the service port is correct or add '
|
4264
|
+
'the service port to replica resources.')
|
4265
|
+
|
3939
4266
|
return task
|
3940
4267
|
|
3941
4268
|
|
@@ -3951,7 +4278,7 @@ def _generate_task_with_service(
|
|
3951
4278
|
type=str,
|
3952
4279
|
help='A service name. Unique for each service. If not provided, '
|
3953
4280
|
'a unique name is autogenerated.')
|
3954
|
-
@_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS)
|
4281
|
+
@_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS + _COMMON_OPTIONS)
|
3955
4282
|
@click.option('--yes',
|
3956
4283
|
'-y',
|
3957
4284
|
is_flag=True,
|
@@ -3980,6 +4307,7 @@ def serve_up(
|
|
3980
4307
|
disk_size: Optional[int],
|
3981
4308
|
disk_tier: Optional[str],
|
3982
4309
|
yes: bool,
|
4310
|
+
async_call: bool,
|
3983
4311
|
):
|
3984
4312
|
"""Launch a SkyServe service.
|
3985
4313
|
|
@@ -4033,21 +4361,16 @@ def serve_up(
|
|
4033
4361
|
ports=ports,
|
4034
4362
|
not_supported_cmd='sky serve up',
|
4035
4363
|
)
|
4036
|
-
click.secho('Service
|
4364
|
+
click.secho('Service spec:', fg='cyan')
|
4037
4365
|
click.echo(task.service)
|
4038
4366
|
|
4039
4367
|
click.secho('Each replica will use the following resources (estimated):',
|
4040
4368
|
fg='cyan')
|
4041
4369
|
with sky.Dag() as dag:
|
4042
4370
|
dag.add(task)
|
4043
|
-
sky.optimize(dag)
|
4044
|
-
|
4045
|
-
if not yes:
|
4046
|
-
prompt = f'Launching a new service {service_name!r}. Proceed?'
|
4047
|
-
if prompt is not None:
|
4048
|
-
click.confirm(prompt, default=True, abort=True, show_default=True)
|
4049
4371
|
|
4050
|
-
serve_lib.up(task, service_name)
|
4372
|
+
request_id = serve_lib.up(task, service_name, _need_confirmation=not yes)
|
4373
|
+
_async_call_or_wait(request_id, async_call, 'sky.serve.up')
|
4051
4374
|
|
4052
4375
|
|
4053
4376
|
# TODO(MaoZiming): Update Doc.
|
@@ -4060,7 +4383,7 @@ def serve_up(
|
|
4060
4383
|
type=str,
|
4061
4384
|
nargs=-1,
|
4062
4385
|
**_get_shell_complete_args(_complete_file_name))
|
4063
|
-
@_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS)
|
4386
|
+
@_add_click_options(_TASK_OPTIONS + _EXTRA_RESOURCES_OPTIONS + _COMMON_OPTIONS)
|
4064
4387
|
@click.option('--mode',
|
4065
4388
|
default=serve_lib.DEFAULT_UPDATE_MODE.value,
|
4066
4389
|
type=click.Choice([m.value for m in serve_lib.UpdateMode],
|
@@ -4077,28 +4400,16 @@ def serve_up(
|
|
4077
4400
|
help='Skip confirmation prompt.')
|
4078
4401
|
@timeline.event
|
4079
4402
|
@usage_lib.entrypoint
|
4080
|
-
def serve_update(
|
4081
|
-
|
4082
|
-
|
4083
|
-
|
4084
|
-
|
4085
|
-
|
4086
|
-
|
4087
|
-
|
4088
|
-
|
4089
|
-
|
4090
|
-
env_file: Optional[Dict[str, str]],
|
4091
|
-
env: List[Tuple[str, str]],
|
4092
|
-
gpus: Optional[str],
|
4093
|
-
instance_type: Optional[str],
|
4094
|
-
ports: Tuple[str],
|
4095
|
-
cpus: Optional[str],
|
4096
|
-
memory: Optional[str],
|
4097
|
-
disk_size: Optional[int],
|
4098
|
-
disk_tier: Optional[str],
|
4099
|
-
mode: str,
|
4100
|
-
yes: bool,
|
4101
|
-
):
|
4403
|
+
def serve_update(service_name: str, service_yaml: Tuple[str, ...],
|
4404
|
+
workdir: Optional[str], cloud: Optional[str],
|
4405
|
+
region: Optional[str], zone: Optional[str],
|
4406
|
+
num_nodes: Optional[int], use_spot: Optional[bool],
|
4407
|
+
image_id: Optional[str], env_file: Optional[Dict[str, str]],
|
4408
|
+
env: List[Tuple[str, str]], gpus: Optional[str],
|
4409
|
+
instance_type: Optional[str], ports: Tuple[str],
|
4410
|
+
cpus: Optional[str], memory: Optional[str],
|
4411
|
+
disk_size: Optional[int], disk_tier: Optional[str], mode: str,
|
4412
|
+
yes: bool, async_call: bool):
|
4102
4413
|
"""Update a SkyServe service.
|
4103
4414
|
|
4104
4415
|
service_yaml must point to a valid YAML file.
|
@@ -4149,27 +4460,24 @@ def serve_update(
|
|
4149
4460
|
ports=ports,
|
4150
4461
|
not_supported_cmd='sky serve update',
|
4151
4462
|
)
|
4152
|
-
click.secho('Service
|
4463
|
+
click.secho('Service spec:', fg='cyan')
|
4153
4464
|
click.echo(task.service)
|
4154
4465
|
|
4155
4466
|
click.secho('New replica will use the following resources (estimated):',
|
4156
4467
|
fg='cyan')
|
4157
4468
|
with sky.Dag() as dag:
|
4158
4469
|
dag.add(task)
|
4159
|
-
sky.optimize(dag)
|
4160
4470
|
|
4161
|
-
|
4162
|
-
|
4163
|
-
|
4164
|
-
|
4165
|
-
|
4166
|
-
|
4167
|
-
serve_lib.update(task, service_name, mode=serve_lib.UpdateMode(mode))
|
4471
|
+
request_id = serve_lib.update(task,
|
4472
|
+
service_name,
|
4473
|
+
mode=serve_lib.UpdateMode(mode),
|
4474
|
+
_need_confirmation=not yes)
|
4475
|
+
_async_call_or_wait(request_id, async_call, 'sky.serve.update')
|
4168
4476
|
|
4169
4477
|
|
4170
4478
|
@serve.command('status', cls=_DocumentedCodeCommand)
|
4171
|
-
@click.option('--
|
4172
|
-
'-
|
4479
|
+
@click.option('--verbose',
|
4480
|
+
'-v',
|
4173
4481
|
default=False,
|
4174
4482
|
is_flag=True,
|
4175
4483
|
required=False,
|
@@ -4182,7 +4490,7 @@ def serve_update(
|
|
4182
4490
|
@click.argument('service_names', required=False, type=str, nargs=-1)
|
4183
4491
|
@usage_lib.entrypoint
|
4184
4492
|
# pylint: disable=redefined-builtin
|
4185
|
-
def serve_status(
|
4493
|
+
def serve_status(verbose: bool, endpoint: bool, service_names: List[str]):
|
4186
4494
|
"""Show statuses of SkyServe services.
|
4187
4495
|
|
4188
4496
|
Show detailed statuses of one or more services. If SERVICE_NAME is not
|
@@ -4269,17 +4577,22 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]):
|
|
4269
4577
|
sky serve status
|
4270
4578
|
\b
|
4271
4579
|
# Show detailed status for all services
|
4272
|
-
sky serve status -
|
4580
|
+
sky serve status -v
|
4273
4581
|
\b
|
4274
4582
|
# Only show status of my-service
|
4275
4583
|
sky serve status my-service
|
4276
4584
|
"""
|
4585
|
+
service_names_to_query: Optional[List[str]] = service_names
|
4586
|
+
if not service_names:
|
4587
|
+
service_names_to_query = None
|
4277
4588
|
# This won't pollute the output of --endpoint.
|
4278
|
-
with rich_utils.
|
4279
|
-
|
4280
|
-
|
4281
|
-
|
4282
|
-
|
4589
|
+
with rich_utils.client_status('[cyan]Checking services[/]'):
|
4590
|
+
service_status_request_id = serve_lib.status(service_names_to_query)
|
4591
|
+
_, msg = _handle_services_request(service_status_request_id,
|
4592
|
+
service_names=service_names_to_query,
|
4593
|
+
show_all=verbose,
|
4594
|
+
show_endpoint=endpoint,
|
4595
|
+
is_called_by_user=True)
|
4283
4596
|
|
4284
4597
|
if not endpoint:
|
4285
4598
|
click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
@@ -4305,8 +4618,21 @@ def serve_status(all: bool, endpoint: bool, service_names: List[str]):
|
|
4305
4618
|
default=False,
|
4306
4619
|
required=False,
|
4307
4620
|
help='Skip confirmation prompt.')
|
4621
|
+
@click.option('--replica-id',
|
4622
|
+
default=None,
|
4623
|
+
type=int,
|
4624
|
+
help='Tear down a given replica')
|
4625
|
+
@_add_click_options(_COMMON_OPTIONS)
|
4626
|
+
@usage_lib.entrypoint
|
4308
4627
|
# pylint: disable=redefined-builtin
|
4309
|
-
def serve_down(
|
4628
|
+
def serve_down(
|
4629
|
+
service_names: List[str],
|
4630
|
+
all: bool,
|
4631
|
+
purge: bool,
|
4632
|
+
yes: bool,
|
4633
|
+
replica_id: Optional[int],
|
4634
|
+
async_call: bool,
|
4635
|
+
) -> None:
|
4310
4636
|
"""Teardown service(s).
|
4311
4637
|
|
4312
4638
|
SERVICE_NAMES is the name of the service (or glob pattern) to tear down. If
|
@@ -4333,31 +4659,58 @@ def serve_down(service_names: List[str], all: bool, purge: bool, yes: bool):
|
|
4333
4659
|
\b
|
4334
4660
|
# Forcefully tear down a service in failed status.
|
4335
4661
|
sky serve down failed-service --purge
|
4662
|
+
\b
|
4663
|
+
# Tear down a specific replica
|
4664
|
+
sky serve down my-service --replica-id 1
|
4665
|
+
\b
|
4666
|
+
# Forcefully tear down a specific replica, even in failed status.
|
4667
|
+
sky serve down my-service --replica-id 1 --purge
|
4336
4668
|
"""
|
4337
|
-
if sum([
|
4338
|
-
argument_str = f'SERVICE_NAMES={",".join(service_names)}'
|
4339
|
-
|
4669
|
+
if sum([bool(service_names), all]) != 1:
|
4670
|
+
argument_str = (f'SERVICE_NAMES={",".join(service_names)}'
|
4671
|
+
if service_names else '')
|
4340
4672
|
argument_str += ' --all' if all else ''
|
4341
4673
|
raise click.UsageError(
|
4342
4674
|
'Can only specify one of SERVICE_NAMES or --all. '
|
4343
4675
|
f'Provided {argument_str!r}.')
|
4344
4676
|
|
4345
|
-
|
4346
|
-
|
4347
|
-
|
4348
|
-
|
4349
|
-
|
4350
|
-
|
4351
|
-
|
4352
|
-
service_identity_str = f'service(s) {", ".join(quoted_service_names)}'
|
4677
|
+
replica_id_is_defined = replica_id is not None
|
4678
|
+
if replica_id_is_defined:
|
4679
|
+
if len(service_names) != 1:
|
4680
|
+
service_names_str = ', '.join(service_names)
|
4681
|
+
raise click.UsageError(f'The --replica-id option can only be used '
|
4682
|
+
f'with a single service name. Got: '
|
4683
|
+
f'{service_names_str}.')
|
4353
4684
|
if all:
|
4354
|
-
|
4355
|
-
|
4356
|
-
|
4357
|
-
|
4358
|
-
|
4359
|
-
|
4360
|
-
|
4685
|
+
raise click.UsageError('The --replica-id option cannot be used '
|
4686
|
+
'with the --all option.')
|
4687
|
+
if not yes:
|
4688
|
+
if replica_id_is_defined:
|
4689
|
+
click.confirm(
|
4690
|
+
f'Terminating replica ID {replica_id} in '
|
4691
|
+
f'{service_names[0]!r}. Proceed?',
|
4692
|
+
default=True,
|
4693
|
+
abort=True,
|
4694
|
+
show_default=True)
|
4695
|
+
else:
|
4696
|
+
quoted_service_names = [f'{name!r}' for name in service_names]
|
4697
|
+
list_service_str = ', '.join(quoted_service_names)
|
4698
|
+
service_identity_str = f'service(s) {list_service_str}'
|
4699
|
+
if all:
|
4700
|
+
service_identity_str = 'all services'
|
4701
|
+
click.confirm(f'Terminating {service_identity_str}. Proceed?',
|
4702
|
+
default=True,
|
4703
|
+
abort=True,
|
4704
|
+
show_default=True)
|
4705
|
+
|
4706
|
+
if replica_id_is_defined:
|
4707
|
+
request_id = serve_lib.terminate_replica(service_names[0], replica_id,
|
4708
|
+
purge)
|
4709
|
+
else:
|
4710
|
+
request_id = serve_lib.down(service_names=service_names,
|
4711
|
+
all=all,
|
4712
|
+
purge=purge)
|
4713
|
+
_async_call_or_wait(request_id, async_call, 'sky.serve.down')
|
4361
4714
|
|
4362
4715
|
|
4363
4716
|
@serve.command('logs', cls=_DocumentedCodeCommand)
|
@@ -4484,7 +4837,7 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]:
|
|
4484
4837
|
required=True,
|
4485
4838
|
type=str,
|
4486
4839
|
help='Benchmark name.')
|
4487
|
-
@_add_click_options(_TASK_OPTIONS_WITH_NAME)
|
4840
|
+
@_add_click_options(_TASK_OPTIONS_WITH_NAME + _COMMON_OPTIONS)
|
4488
4841
|
@click.option('--gpus',
|
4489
4842
|
required=False,
|
4490
4843
|
type=str,
|
@@ -4519,26 +4872,27 @@ def _get_candidate_configs(yaml_path: str) -> Optional[List[Dict[str, str]]]:
|
|
4519
4872
|
help='Skip confirmation prompt.')
|
4520
4873
|
@usage_lib.entrypoint
|
4521
4874
|
def benchmark_launch(
|
4522
|
-
|
4523
|
-
|
4524
|
-
|
4525
|
-
|
4526
|
-
|
4527
|
-
|
4528
|
-
|
4529
|
-
|
4530
|
-
|
4531
|
-
|
4532
|
-
|
4533
|
-
|
4534
|
-
|
4535
|
-
|
4536
|
-
|
4537
|
-
|
4538
|
-
|
4539
|
-
|
4540
|
-
|
4541
|
-
|
4875
|
+
entrypoint: str,
|
4876
|
+
benchmark: str,
|
4877
|
+
name: Optional[str],
|
4878
|
+
workdir: Optional[str],
|
4879
|
+
cloud: Optional[str],
|
4880
|
+
region: Optional[str],
|
4881
|
+
zone: Optional[str],
|
4882
|
+
gpus: Optional[str],
|
4883
|
+
num_nodes: Optional[int],
|
4884
|
+
use_spot: Optional[bool],
|
4885
|
+
image_id: Optional[str],
|
4886
|
+
env_file: Optional[Dict[str, str]],
|
4887
|
+
env: List[Tuple[str, str]],
|
4888
|
+
cpus: Optional[str],
|
4889
|
+
memory: Optional[str],
|
4890
|
+
disk_size: Optional[int],
|
4891
|
+
disk_tier: Optional[str],
|
4892
|
+
ports: Tuple[str],
|
4893
|
+
idle_minutes_to_autostop: Optional[int],
|
4894
|
+
yes: bool,
|
4895
|
+
async_call: bool, # pylint: disable=unused-argument
|
4542
4896
|
) -> None:
|
4543
4897
|
"""Benchmark a task on different resources.
|
4544
4898
|
|
@@ -4547,6 +4901,7 @@ def benchmark_launch(
|
|
4547
4901
|
Alternatively, specify the benchmarking resources in your YAML (see doc),
|
4548
4902
|
which allows benchmarking on many more resource fields.
|
4549
4903
|
"""
|
4904
|
+
# TODO(zhwu): move benchmark to SkyPilot API server
|
4550
4905
|
env = _merge_env_vars(env_file, env)
|
4551
4906
|
record = benchmark_state.get_benchmark_from_name(benchmark)
|
4552
4907
|
if record is not None:
|
@@ -4565,7 +4920,7 @@ def benchmark_launch(
|
|
4565
4920
|
'Please provide a YAML file.')
|
4566
4921
|
assert config is not None, (is_yaml, config)
|
4567
4922
|
|
4568
|
-
click.secho('Benchmarking a task from YAML
|
4923
|
+
click.secho('Benchmarking a task from YAML: ', fg='cyan', nl=False)
|
4569
4924
|
click.secho(entrypoint, bold=True)
|
4570
4925
|
|
4571
4926
|
candidates = _get_candidate_configs(entrypoint)
|
@@ -4686,7 +5041,7 @@ def benchmark_launch(
|
|
4686
5041
|
if idle_minutes_to_autostop is None:
|
4687
5042
|
idle_minutes_to_autostop = 5
|
4688
5043
|
commandline_args['idle-minutes-to-autostop'] = idle_minutes_to_autostop
|
4689
|
-
if
|
5044
|
+
if env:
|
4690
5045
|
commandline_args['env'] = [f'{k}={v}' for k, v in env]
|
4691
5046
|
|
4692
5047
|
# Launch the benchmarking clusters in detach mode in parallel.
|
@@ -4699,11 +5054,11 @@ def benchmark_launch(
|
|
4699
5054
|
f'\n{colorama.Fore.CYAN}Benchmark name: '
|
4700
5055
|
f'{colorama.Style.BRIGHT}{benchmark}{colorama.Style.RESET_ALL}'
|
4701
5056
|
'\nTo see the benchmark results: '
|
4702
|
-
f'{
|
4703
|
-
f'{benchmark}{
|
5057
|
+
f'{ux_utils.BOLD}sky bench show '
|
5058
|
+
f'{benchmark}{ux_utils.RESET_BOLD}'
|
4704
5059
|
'\nTo teardown the clusters: '
|
4705
|
-
f'{
|
4706
|
-
f'{benchmark}{
|
5060
|
+
f'{ux_utils.BOLD}sky bench down '
|
5061
|
+
f'{benchmark}{ux_utils.RESET_BOLD}')
|
4707
5062
|
subprocess_utils.run('sky bench ls')
|
4708
5063
|
else:
|
4709
5064
|
logger.error('No benchmarking clusters are created.')
|
@@ -4937,10 +5292,7 @@ def benchmark_down(
|
|
4937
5292
|
continue
|
4938
5293
|
to_stop.append(cluster)
|
4939
5294
|
|
4940
|
-
_down_or_stop_clusters(to_stop,
|
4941
|
-
apply_to_all=False,
|
4942
|
-
down=True,
|
4943
|
-
no_confirm=yes)
|
5295
|
+
_down_or_stop_clusters(to_stop, down=True, no_confirm=yes)
|
4944
5296
|
|
4945
5297
|
|
4946
5298
|
@bench.command('delete', cls=_DocumentedCodeCommand)
|
@@ -4965,7 +5317,7 @@ def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
|
|
4965
5317
|
raise click.BadParameter(
|
4966
5318
|
'Either specify benchmarks or use --all to delete all benchmarks.')
|
4967
5319
|
to_delete = []
|
4968
|
-
if
|
5320
|
+
if benchmarks:
|
4969
5321
|
for benchmark in benchmarks:
|
4970
5322
|
record = benchmark_state.get_benchmark_from_name(benchmark)
|
4971
5323
|
if record is None:
|
@@ -4974,7 +5326,7 @@ def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
|
|
4974
5326
|
to_delete.append(record)
|
4975
5327
|
if all:
|
4976
5328
|
to_delete = benchmark_state.get_benchmarks()
|
4977
|
-
if
|
5329
|
+
if benchmarks:
|
4978
5330
|
print('Both --all and benchmark(s) specified '
|
4979
5331
|
'for sky bench delete. Letting --all take effect.')
|
4980
5332
|
|
@@ -5011,8 +5363,8 @@ def benchmark_delete(benchmarks: Tuple[str], all: Optional[bool],
|
|
5011
5363
|
message = (f'{colorama.Fore.YELLOW}Benchmark {benchmark} '
|
5012
5364
|
f'has {num_clusters} un-terminated cluster{plural}. '
|
5013
5365
|
f'Terminate the cluster{plural} with '
|
5014
|
-
f'{
|
5015
|
-
f'{
|
5366
|
+
f'{ux_utils.BOLD} sky bench down {benchmark} '
|
5367
|
+
f'{ux_utils.RESET_BOLD} '
|
5016
5368
|
'before deleting the benchmark report.')
|
5017
5369
|
success = False
|
5018
5370
|
else:
|
@@ -5051,181 +5403,276 @@ def local():
|
|
5051
5403
|
is_flag=True,
|
5052
5404
|
help='Launch cluster without GPU support even '
|
5053
5405
|
'if GPUs are detected on the host.')
|
5406
|
+
@click.option(
|
5407
|
+
'--ips',
|
5408
|
+
type=str,
|
5409
|
+
required=False,
|
5410
|
+
help='Path to the file containing IP addresses of remote machines.')
|
5411
|
+
@click.option('--ssh-user',
|
5412
|
+
type=str,
|
5413
|
+
required=False,
|
5414
|
+
help='SSH username for accessing remote machines.')
|
5415
|
+
@click.option('--ssh-key-path',
|
5416
|
+
type=str,
|
5417
|
+
required=False,
|
5418
|
+
help='Path to the SSH private key.')
|
5419
|
+
@click.option('--cleanup',
|
5420
|
+
is_flag=True,
|
5421
|
+
help='Clean up the remote cluster instead of deploying it.')
|
5054
5422
|
@local.command('up', cls=_DocumentedCodeCommand)
|
5423
|
+
@_add_click_options(_COMMON_OPTIONS)
|
5055
5424
|
@usage_lib.entrypoint
|
5056
|
-
def local_up(gpus: bool
|
5057
|
-
|
5058
|
-
|
5059
|
-
|
5060
|
-
|
5061
|
-
|
5062
|
-
|
5063
|
-
|
5064
|
-
|
5065
|
-
|
5066
|
-
|
5067
|
-
|
5068
|
-
|
5069
|
-
|
5070
|
-
|
5071
|
-
|
5072
|
-
|
5073
|
-
|
5074
|
-
|
5075
|
-
|
5076
|
-
|
5077
|
-
|
5078
|
-
|
5079
|
-
|
5080
|
-
|
5081
|
-
|
5082
|
-
|
5083
|
-
|
5084
|
-
|
5085
|
-
|
5086
|
-
|
5087
|
-
|
5088
|
-
|
5089
|
-
|
5090
|
-
|
5091
|
-
|
5092
|
-
|
5093
|
-
|
5094
|
-
|
5095
|
-
|
5096
|
-
|
5097
|
-
|
5098
|
-
|
5099
|
-
|
5100
|
-
|
5101
|
-
|
5102
|
-
|
5103
|
-
|
5104
|
-
# Kind always writes to stderr even if it succeeds.
|
5105
|
-
# If the failure happens after the cluster is created, we need
|
5106
|
-
# to strip all stderr of "No kind clusters found.", which is
|
5107
|
-
# printed when querying with kind get clusters.
|
5108
|
-
stderr = stderr.replace('No kind clusters found.\n', '')
|
5109
|
-
|
5110
|
-
if returncode == 0:
|
5111
|
-
cluster_created = True
|
5112
|
-
elif returncode == 100:
|
5113
|
-
click.echo(f'{colorama.Fore.GREEN}Local cluster already '
|
5114
|
-
f'exists.{style.RESET_ALL}\n'
|
5115
|
-
'If you want to delete it instead, run: sky local down')
|
5116
|
-
else:
|
5117
|
-
with ux_utils.print_exception_no_traceback():
|
5118
|
-
raise RuntimeError(
|
5119
|
-
'Failed to create local cluster. '
|
5120
|
-
f'Full log: {log_path}'
|
5121
|
-
f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
|
5122
|
-
# Run sky check
|
5123
|
-
with rich_utils.safe_status('[bold cyan]Running sky check...'):
|
5124
|
-
sky_check.check(clouds=['kubernetes'], quiet=True)
|
5125
|
-
if cluster_created:
|
5126
|
-
# Prepare completion message which shows CPU and GPU count
|
5127
|
-
# Get number of CPUs
|
5128
|
-
p = subprocess_utils.run(
|
5129
|
-
'kubectl get nodes -o jsonpath=\'{.items[0].status.capacity.cpu}\'',
|
5130
|
-
capture_output=True)
|
5131
|
-
num_cpus = int(p.stdout.decode('utf-8'))
|
5132
|
-
|
5133
|
-
# GPU count/type parsing
|
5134
|
-
gpu_message = ''
|
5135
|
-
gpu_hint = ''
|
5136
|
-
if gpus:
|
5137
|
-
# Get GPU model by querying the node labels
|
5138
|
-
label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
|
5139
|
-
gpu_type_cmd = f'kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
|
5140
|
-
try:
|
5141
|
-
# Run the command and capture the output
|
5142
|
-
gpu_count_output = subprocess.check_output(gpu_type_cmd,
|
5143
|
-
shell=True,
|
5144
|
-
text=True)
|
5145
|
-
gpu_type_str = gpu_count_output.strip() + ' '
|
5146
|
-
except subprocess.CalledProcessError as e:
|
5147
|
-
output = str(e.output.decode('utf-8'))
|
5148
|
-
logger.warning(f'Failed to get GPU type: {output}')
|
5149
|
-
gpu_type_str = ''
|
5150
|
-
|
5151
|
-
# Get number of GPUs (sum of nvidia.com/gpu resources)
|
5152
|
-
gpu_count_command = 'kubectl get nodes -o=jsonpath=\'{range .items[*]}{.status.allocatable.nvidia\\.com/gpu}{\"\\n\"}{end}\' | awk \'{sum += $1} END {print sum}\'' # pylint: disable=line-too-long
|
5153
|
-
try:
|
5154
|
-
# Run the command and capture the output
|
5155
|
-
gpu_count_output = subprocess.check_output(gpu_count_command,
|
5156
|
-
shell=True,
|
5157
|
-
text=True)
|
5158
|
-
gpu_count = gpu_count_output.strip(
|
5159
|
-
) # Remove any extra whitespace
|
5160
|
-
gpu_message = f' and {gpu_count} {gpu_type_str}GPUs'
|
5161
|
-
except subprocess.CalledProcessError as e:
|
5162
|
-
output = str(e.output.decode('utf-8'))
|
5163
|
-
logger.warning(f'Failed to get GPU count: {output}')
|
5164
|
-
gpu_message = f' with {gpu_type_str}GPU support'
|
5165
|
-
|
5166
|
-
gpu_hint = (
|
5167
|
-
'\nHint: To see the list of GPUs in the cluster, '
|
5168
|
-
'run \'sky show-gpus --cloud kubernetes\'') if gpus else ''
|
5169
|
-
|
5170
|
-
if num_cpus < 2:
|
5171
|
-
click.echo('Warning: Local cluster has less than 2 CPUs. '
|
5172
|
-
'This may cause issues with running tasks.')
|
5173
|
-
click.echo(
|
5174
|
-
f'\n{colorama.Fore.GREEN}Local Kubernetes cluster created '
|
5175
|
-
'successfully with '
|
5176
|
-
f'{num_cpus} CPUs{gpu_message}.{style.RESET_ALL}\n`sky launch` can '
|
5177
|
-
'now run tasks locally.'
|
5178
|
-
'\nHint: To change the number of CPUs, change your docker '
|
5179
|
-
'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.' # pylint: disable=line-too-long
|
5180
|
-
f'{gpu_hint}')
|
5425
|
+
def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
|
5426
|
+
cleanup: bool, async_call: bool):
|
5427
|
+
"""Creates a local or remote cluster."""
|
5428
|
+
|
5429
|
+
def _validate_args(ips, ssh_user, ssh_key_path, cleanup):
|
5430
|
+
# If any of --ips, --ssh-user, or --ssh-key-path is specified,
|
5431
|
+
# all must be specified
|
5432
|
+
if bool(ips) or bool(ssh_user) or bool(ssh_key_path):
|
5433
|
+
if not (ips and ssh_user and ssh_key_path):
|
5434
|
+
raise click.BadParameter(
|
5435
|
+
'All --ips, --ssh-user, and --ssh-key-path '
|
5436
|
+
'must be specified together.')
|
5437
|
+
|
5438
|
+
# --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
|
5439
|
+
# are all provided
|
5440
|
+
if cleanup and not (ips and ssh_user and ssh_key_path):
|
5441
|
+
raise click.BadParameter('--cleanup can only be used with '
|
5442
|
+
'--ips, --ssh-user and --ssh-key-path.')
|
5443
|
+
|
5444
|
+
_validate_args(ips, ssh_user, ssh_key_path, cleanup)
|
5445
|
+
|
5446
|
+
# If remote deployment arguments are specified, run remote up script
|
5447
|
+
ip_list = None
|
5448
|
+
ssh_key = None
|
5449
|
+
if ips and ssh_user and ssh_key_path:
|
5450
|
+
# Read and validate IP file
|
5451
|
+
try:
|
5452
|
+
with open(os.path.expanduser(ips), 'r', encoding='utf-8') as f:
|
5453
|
+
ip_list = f.read().strip().splitlines()
|
5454
|
+
if not ip_list:
|
5455
|
+
raise click.BadParameter(f'IP file is empty: {ips}')
|
5456
|
+
except (IOError, OSError) as e:
|
5457
|
+
raise click.BadParameter(f'Failed to read IP file {ips}: {str(e)}')
|
5458
|
+
|
5459
|
+
# Read and validate SSH key file
|
5460
|
+
try:
|
5461
|
+
with open(os.path.expanduser(ssh_key_path), 'r',
|
5462
|
+
encoding='utf-8') as f:
|
5463
|
+
ssh_key = f.read()
|
5464
|
+
if not ssh_key:
|
5465
|
+
raise click.BadParameter(
|
5466
|
+
f'SSH key file is empty: {ssh_key_path}')
|
5467
|
+
except (IOError, OSError) as e:
|
5468
|
+
raise click.BadParameter(
|
5469
|
+
f'Failed to read SSH key file {ssh_key_path}: {str(e)}')
|
5470
|
+
|
5471
|
+
request_id = sdk.local_up(gpus, ip_list, ssh_user, ssh_key, cleanup)
|
5472
|
+
_async_call_or_wait(request_id, async_call, request_name='local up')
|
5181
5473
|
|
5182
5474
|
|
5183
5475
|
@local.command('down', cls=_DocumentedCodeCommand)
|
5476
|
+
@_add_click_options(_COMMON_OPTIONS)
|
5184
5477
|
@usage_lib.entrypoint
|
5185
|
-
def local_down():
|
5478
|
+
def local_down(async_call: bool):
|
5186
5479
|
"""Deletes a local cluster."""
|
5187
|
-
|
5480
|
+
request_id = sdk.local_down()
|
5481
|
+
_async_call_or_wait(request_id, async_call, request_name='sky.local.down')
|
5188
5482
|
|
5189
|
-
path_to_package = os.path.dirname(os.path.dirname(__file__))
|
5190
|
-
down_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes',
|
5191
|
-
'delete_cluster.sh')
|
5192
5483
|
|
5193
|
-
|
5194
|
-
|
5484
|
+
@cli.group(cls=_NaturalOrderGroup)
|
5485
|
+
def api():
|
5486
|
+
"""SkyPilot API server commands."""
|
5487
|
+
pass
|
5195
5488
|
|
5196
|
-
# Setup logging paths
|
5197
|
-
run_timestamp = backend_utils.get_run_timestamp()
|
5198
|
-
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
5199
|
-
'local_down.log')
|
5200
|
-
tail_cmd = 'tail -n100 -f ' + log_path
|
5201
5489
|
|
5202
|
-
|
5203
|
-
|
5204
|
-
|
5205
|
-
|
5206
|
-
|
5207
|
-
|
5208
|
-
|
5209
|
-
|
5210
|
-
|
5211
|
-
|
5212
|
-
|
5213
|
-
|
5214
|
-
|
5215
|
-
|
5216
|
-
|
5217
|
-
|
5218
|
-
|
5219
|
-
|
5220
|
-
|
5221
|
-
|
5222
|
-
|
5223
|
-
|
5224
|
-
|
5225
|
-
|
5226
|
-
|
5227
|
-
|
5228
|
-
|
5490
|
+
@api.command('start', cls=_DocumentedCodeCommand)
|
5491
|
+
@click.option('--deploy',
|
5492
|
+
type=bool,
|
5493
|
+
is_flag=True,
|
5494
|
+
default=False,
|
5495
|
+
required=False,
|
5496
|
+
help=('Deploy the SkyPilot API server. When set to True, '
|
5497
|
+
'SkyPilot API server will use all resources on the host '
|
5498
|
+
'machine assuming the machine is dedicated to SkyPilot API '
|
5499
|
+
'server; host will also be set to 0.0.0.0 to allow remote '
|
5500
|
+
'access.'))
|
5501
|
+
@click.option('--host',
|
5502
|
+
default='127.0.0.1',
|
5503
|
+
type=click.Choice(server_common.AVAILBLE_LOCAL_API_SERVER_HOSTS),
|
5504
|
+
required=False,
|
5505
|
+
help=('The host to deploy the SkyPilot API server. To allow '
|
5506
|
+
'remote access, set this to 0.0.0.0'))
|
5507
|
+
@click.option('--foreground',
|
5508
|
+
is_flag=True,
|
5509
|
+
default=False,
|
5510
|
+
required=False,
|
5511
|
+
help='Run the SkyPilot API server in the foreground and output '
|
5512
|
+
'its logs to stdout/stderr. Allowing external systems '
|
5513
|
+
'to manage the process lifecycle and collect logs directly. '
|
5514
|
+
'This is useful when the API server is managed by systems '
|
5515
|
+
'like systemd and Kubernetes.')
|
5516
|
+
@usage_lib.entrypoint
|
5517
|
+
def api_start(deploy: bool, host: Optional[str], foreground: bool):
|
5518
|
+
"""Starts the SkyPilot API server locally."""
|
5519
|
+
sdk.api_start(deploy=deploy, host=host, foreground=foreground)
|
5520
|
+
|
5521
|
+
|
5522
|
+
@api.command('stop', cls=_DocumentedCodeCommand)
|
5523
|
+
@usage_lib.entrypoint
|
5524
|
+
def api_stop():
|
5525
|
+
"""Stops the SkyPilot API server locally."""
|
5526
|
+
sdk.api_stop()
|
5527
|
+
|
5528
|
+
|
5529
|
+
@api.command('logs', cls=_DocumentedCodeCommand)
|
5530
|
+
@click.argument('request_id', required=False, type=str)
|
5531
|
+
@click.option('--server-logs',
|
5532
|
+
is_flag=True,
|
5533
|
+
default=False,
|
5534
|
+
required=False,
|
5535
|
+
help='Stream the server logs.')
|
5536
|
+
@click.option('--log-path',
|
5537
|
+
'-l',
|
5538
|
+
required=False,
|
5539
|
+
type=str,
|
5540
|
+
help='The path to the log file to stream.')
|
5541
|
+
@click.option('--tail',
|
5542
|
+
required=False,
|
5543
|
+
type=int,
|
5544
|
+
help=('Number of lines to show from the end of the logs. '
|
5545
|
+
'(default: None)'))
|
5546
|
+
@click.option('--follow/--no-follow',
|
5547
|
+
is_flag=True,
|
5548
|
+
default=True,
|
5549
|
+
required=False,
|
5550
|
+
help='Follow the logs.')
|
5551
|
+
@usage_lib.entrypoint
|
5552
|
+
def api_logs(request_id: Optional[str], server_logs: bool,
|
5553
|
+
log_path: Optional[str], tail: Optional[int], follow: bool):
|
5554
|
+
"""Stream the logs of a request running on SkyPilot API server."""
|
5555
|
+
if not server_logs and request_id is None and log_path is None:
|
5556
|
+
# TODO(zhwu): get the latest request ID.
|
5557
|
+
raise click.BadParameter('Please provide the request ID or log path.')
|
5558
|
+
if server_logs:
|
5559
|
+
sdk.api_server_logs(follow=follow, tail=tail)
|
5560
|
+
return
|
5561
|
+
|
5562
|
+
if request_id is not None and log_path is not None:
|
5563
|
+
raise click.BadParameter(
|
5564
|
+
'Only one of request ID and log path can be provided.')
|
5565
|
+
sdk.stream_and_get(request_id, log_path, tail)
|
5566
|
+
|
5567
|
+
|
5568
|
+
@api.command('cancel', cls=_DocumentedCodeCommand)
|
5569
|
+
@click.argument('request_ids', required=False, type=str, nargs=-1)
|
5570
|
+
@click.option('--all',
|
5571
|
+
'-a',
|
5572
|
+
is_flag=True,
|
5573
|
+
default=False,
|
5574
|
+
required=False,
|
5575
|
+
help='Cancel all your requests.')
|
5576
|
+
@click.option('--all-users',
|
5577
|
+
'-u',
|
5578
|
+
is_flag=True,
|
5579
|
+
default=False,
|
5580
|
+
required=False,
|
5581
|
+
help='Cancel all requests from all users.')
|
5582
|
+
@usage_lib.entrypoint
|
5583
|
+
# pylint: disable=redefined-builtin
|
5584
|
+
def api_cancel(request_ids: Optional[List[str]], all: bool, all_users: bool):
|
5585
|
+
"""Cancel a request running on SkyPilot API server."""
|
5586
|
+
if all or all_users:
|
5587
|
+
keyword = 'ALL USERS\'' if all_users else 'YOUR'
|
5588
|
+
user_input = click.prompt(
|
5589
|
+
f'This will cancel all {keyword} requests.\n'
|
5590
|
+
f'To proceed, please type {colorama.Style.BRIGHT}'
|
5591
|
+
f'\'cancel all requests\'{colorama.Style.RESET_ALL}',
|
5592
|
+
type=str)
|
5593
|
+
if user_input != 'cancel all requests':
|
5594
|
+
raise click.Abort()
|
5595
|
+
if all:
|
5596
|
+
request_ids = None
|
5597
|
+
cancelled_request_ids = sdk.get(
|
5598
|
+
sdk.api_cancel(request_ids=request_ids, all_users=all_users))
|
5599
|
+
if not cancelled_request_ids:
|
5600
|
+
click.secho('No requests need to be cancelled.', fg='green')
|
5601
|
+
elif len(cancelled_request_ids) == 1:
|
5602
|
+
click.secho(f'Cancelled 1 request: {cancelled_request_ids[0]}',
|
5603
|
+
fg='green')
|
5604
|
+
else:
|
5605
|
+
click.secho(f'Cancelled {len(cancelled_request_ids)} requests.',
|
5606
|
+
fg='green')
|
5607
|
+
|
5608
|
+
|
5609
|
+
@api.command('status', cls=_DocumentedCodeCommand)
|
5610
|
+
@click.argument('request_ids', required=False, type=str, nargs=-1)
|
5611
|
+
@click.option('--all-status',
|
5612
|
+
'-a',
|
5613
|
+
is_flag=True,
|
5614
|
+
default=False,
|
5615
|
+
required=False,
|
5616
|
+
help='Show requests of all statuses.')
|
5617
|
+
@click.option('--verbose',
|
5618
|
+
'-v',
|
5619
|
+
is_flag=True,
|
5620
|
+
default=False,
|
5621
|
+
required=False,
|
5622
|
+
help='Show more details.')
|
5623
|
+
@usage_lib.entrypoint
|
5624
|
+
# pylint: disable=redefined-builtin
|
5625
|
+
def api_status(request_ids: Optional[List[str]], all_status: bool,
|
5626
|
+
verbose: bool):
|
5627
|
+
"""List requests on SkyPilot API server."""
|
5628
|
+
if not request_ids:
|
5629
|
+
request_ids = None
|
5630
|
+
request_list = sdk.api_status(request_ids, all_status)
|
5631
|
+
columns = ['ID', 'User', 'Name']
|
5632
|
+
if verbose:
|
5633
|
+
columns.append('Cluster')
|
5634
|
+
columns.extend(['Created', 'Status'])
|
5635
|
+
table = log_utils.create_table(columns)
|
5636
|
+
for request in request_list:
|
5637
|
+
r_id = request.request_id
|
5638
|
+
if not verbose:
|
5639
|
+
r_id = common_utils.truncate_long_string(r_id, 36)
|
5640
|
+
req_status = requests.RequestStatus(request.status)
|
5641
|
+
row = [r_id, request.user_name, request.name]
|
5642
|
+
if verbose:
|
5643
|
+
row.append(request.cluster_name)
|
5644
|
+
row.extend([
|
5645
|
+
log_utils.readable_time_duration(request.created_at),
|
5646
|
+
req_status.colored_str()
|
5647
|
+
])
|
5648
|
+
table.add_row(row)
|
5649
|
+
click.echo(table)
|
5650
|
+
|
5651
|
+
|
5652
|
+
@api.command('login', cls=_DocumentedCodeCommand)
|
5653
|
+
@click.option('--endpoint',
|
5654
|
+
'-e',
|
5655
|
+
required=False,
|
5656
|
+
help='The SkyPilot API server endpoint.')
|
5657
|
+
@usage_lib.entrypoint
|
5658
|
+
def api_login(endpoint: Optional[str]):
|
5659
|
+
"""Logs into a SkyPilot API server."""
|
5660
|
+
sdk.api_login(endpoint)
|
5661
|
+
|
5662
|
+
|
5663
|
+
@api.command('info', cls=_DocumentedCodeCommand)
|
5664
|
+
@usage_lib.entrypoint
|
5665
|
+
def api_info():
|
5666
|
+
"""Shows the SkyPilot API server URL."""
|
5667
|
+
url = server_common.get_server_url()
|
5668
|
+
api_server_info = sdk.api_info()
|
5669
|
+
user_name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
|
5670
|
+
user_hash = common_utils.get_user_hash()
|
5671
|
+
click.echo(f'Using SkyPilot API server: {url}\n'
|
5672
|
+
f'{ux_utils.INDENT_SYMBOL}Status: {api_server_info["status"]}, '
|
5673
|
+
f'commit: {api_server_info["commit"]}, '
|
5674
|
+
f'version: {api_server_info["version"]}\n'
|
5675
|
+
f'{ux_utils.INDENT_LAST_SYMBOL}User: {user_name} ({user_hash})')
|
5229
5676
|
|
5230
5677
|
|
5231
5678
|
def main():
|