skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/client/sdk.py
ADDED
@@ -0,0 +1,1765 @@
|
|
1
|
+
"""Client-side Python SDK for SkyPilot.
|
2
|
+
|
3
|
+
All functions will return a future that can be awaited on with the `get` method.
|
4
|
+
|
5
|
+
Usage example:
|
6
|
+
|
7
|
+
.. code-block:: python
|
8
|
+
|
9
|
+
request_id = sky.status()
|
10
|
+
statuses = sky.get(request_id)
|
11
|
+
|
12
|
+
"""
|
13
|
+
import getpass
|
14
|
+
import json
|
15
|
+
import logging
|
16
|
+
import os
|
17
|
+
import pathlib
|
18
|
+
import subprocess
|
19
|
+
import typing
|
20
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
21
|
+
|
22
|
+
import click
|
23
|
+
import colorama
|
24
|
+
import filelock
|
25
|
+
import psutil
|
26
|
+
import requests
|
27
|
+
|
28
|
+
from sky import backends
|
29
|
+
from sky import exceptions
|
30
|
+
from sky import sky_logging
|
31
|
+
from sky import skypilot_config
|
32
|
+
from sky.client import common as client_common
|
33
|
+
from sky.server import common as server_common
|
34
|
+
from sky.server import constants as server_constants
|
35
|
+
from sky.server.requests import payloads
|
36
|
+
from sky.server.requests import requests as requests_lib
|
37
|
+
from sky.skylet import constants
|
38
|
+
from sky.usage import usage_lib
|
39
|
+
from sky.utils import annotations
|
40
|
+
from sky.utils import cluster_utils
|
41
|
+
from sky.utils import common
|
42
|
+
from sky.utils import common_utils
|
43
|
+
from sky.utils import dag_utils
|
44
|
+
from sky.utils import env_options
|
45
|
+
from sky.utils import rich_utils
|
46
|
+
from sky.utils import status_lib
|
47
|
+
from sky.utils import subprocess_utils
|
48
|
+
from sky.utils import ux_utils
|
49
|
+
|
50
|
+
if typing.TYPE_CHECKING:
|
51
|
+
import io
|
52
|
+
|
53
|
+
import sky
|
54
|
+
|
55
|
+
logger = sky_logging.init_logger(__name__)
|
56
|
+
logging.getLogger('httpx').setLevel(logging.CRITICAL)
|
57
|
+
|
58
|
+
|
59
|
+
def stream_response(request_id: Optional[str],
                    response: requests.Response,
                    output_stream: Optional['io.TextIOBase'] = None) -> Any:
    """Stream a server response's decoded status lines, then fetch the result.

    Each decoded line is written out as it arrives. Once the stream is
    exhausted, the final result of the request is retrieved and returned.

    Args:
        request_id: The request ID whose final result is returned.
        response: The HTTP response to decode and stream.
        output_stream: Stream to write decoded lines to; ``None`` means the
            console (stdout).
    """
    try:
        for chunk in rich_utils.decode_rich_status(response):
            if chunk is None:
                continue
            print(chunk, flush=True, end='', file=output_stream)
        return get(request_id)
    except Exception:  # pylint: disable=broad-except
        # Point the user at the server-side log stream before re-raising.
        logger.debug(f'To stream request logs: sky api logs {request_id}')
        raise
|
79
|
+
|
80
|
+
|
81
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def check(clouds: Optional[Tuple[str]],
          verbose: bool) -> server_common.RequestId:
    """Checks the credentials to enable clouds.

    Args:
        clouds: The clouds to check.
        verbose: Whether to show verbose output.

    Returns:
        The request ID of the check request.

    Request Returns:
        None
    """
    body = payloads.CheckBody(clouds=clouds, verbose=verbose)
    # Round-trip through JSON so the server receives plain JSON types.
    payload = json.loads(body.model_dump_json())
    url = f'{server_common.get_server_url()}/check'
    response = requests.post(url, json=payload)
    return server_common.get_request_id(response)
|
102
|
+
|
103
|
+
|
104
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def enabled_clouds() -> server_common.RequestId:
    """Gets the enabled clouds.

    Returns:
        The request ID of the enabled clouds request.

    Request Returns:
        A list of enabled clouds in string format.
    """
    url = f'{server_common.get_server_url()}/enabled_clouds'
    response = requests.get(url)
    return server_common.get_request_id(response)
|
118
|
+
|
119
|
+
|
120
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def list_accelerators(gpus_only: bool = True,
                      name_filter: Optional[str] = None,
                      region_filter: Optional[str] = None,
                      quantity_filter: Optional[int] = None,
                      clouds: Optional[Union[List[str], str]] = None,
                      all_regions: bool = False,
                      require_price: bool = True,
                      case_sensitive: bool = True) -> server_common.RequestId:
    """Lists the names of all accelerators offered by Sky.

    This will include all accelerators offered by Sky, including those that
    may not be available in the user's account.

    Args:
        gpus_only: Whether to only list GPU accelerators.
        name_filter: The name filter.
        region_filter: The region filter.
        quantity_filter: The quantity filter.
        clouds: The clouds to list.
        all_regions: Whether to list all regions.
        require_price: Whether to require price.
        case_sensitive: Whether to case sensitive.

    Returns:
        The request ID of the list accelerator counts request.

    Request Returns:
        acc_to_instance_type_dict (Dict[str, List[InstanceTypeInfo]]): A
            dictionary of canonical accelerator names mapped to a list of
            instance type offerings. See usage in cli.py.
    """
    body = payloads.ListAcceleratorsBody(gpus_only=gpus_only,
                                         name_filter=name_filter,
                                         region_filter=region_filter,
                                         quantity_filter=quantity_filter,
                                         clouds=clouds,
                                         all_regions=all_regions,
                                         require_price=require_price,
                                         case_sensitive=case_sensitive)
    # Serialize via JSON so the server receives plain JSON types.
    payload = json.loads(body.model_dump_json())
    url = f'{server_common.get_server_url()}/list_accelerators'
    response = requests.post(url, json=payload)
    return server_common.get_request_id(response)
|
168
|
+
|
169
|
+
|
170
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def list_accelerator_counts(
    gpus_only: bool = True,
    name_filter: Optional[str] = None,
    region_filter: Optional[str] = None,
    quantity_filter: Optional[int] = None,
    clouds: Optional[Union[List[str],
                           str]] = None) -> server_common.RequestId:
    """Lists all accelerators offered by Sky and available counts.

    Args:
        gpus_only: Whether to only list GPU accelerators.
        name_filter: The name filter.
        region_filter: The region filter.
        quantity_filter: The quantity filter.
        clouds: The clouds to list.

    Returns:
        The request ID of the list accelerator counts request.

    Request Returns:
        acc_to_acc_num_dict (Dict[str, List[int]]): A dictionary of canonical
            accelerator names mapped to a list of available counts. See usage
            in cli.py.
    """
    body = payloads.ListAcceleratorsBody(gpus_only=gpus_only,
                                         name_filter=name_filter,
                                         region_filter=region_filter,
                                         quantity_filter=quantity_filter,
                                         clouds=clouds)
    # Serialize via JSON so the server receives plain JSON types.
    payload = json.loads(body.model_dump_json())
    url = f'{server_common.get_server_url()}/list_accelerator_counts'
    response = requests.post(url, json=payload)
    return server_common.get_request_id(response)
|
208
|
+
|
209
|
+
|
210
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def optimize(
    dag: 'sky.Dag',
    minimize: common.OptimizeTarget = common.OptimizeTarget.COST
) -> server_common.RequestId:
    """Finds the best execution plan for the given DAG.

    Args:
        dag: the DAG to optimize.
        minimize: whether to minimize cost or time.

    Returns:
        The request ID of the optimize request.

    Request Returns:
        optimized_dag (str): The optimized DAG in YAML format.

    Request Raises:
        exceptions.ResourcesUnavailableError: if no resources are available
            for a task.
        exceptions.NoCloudAccessError: if no public clouds are enabled.
    """
    # The DAG is shipped to the server as a YAML string.
    serialized_dag = dag_utils.dump_chain_dag_to_yaml_str(dag)
    body = payloads.OptimizeBody(dag=serialized_dag, minimize=minimize)
    payload = json.loads(body.model_dump_json())
    url = f'{server_common.get_server_url()}/optimize'
    response = requests.post(url, json=payload)
    return server_common.get_request_id(response)
|
240
|
+
|
241
|
+
|
242
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def validate(dag: 'sky.Dag', workdir_only: bool = False) -> None:
    """Validates the tasks.

    The file paths (workdir and file_mounts) are validated on the client side
    while the rest (e.g. resource) are validated on server side.

    Raises exceptions if the DAG is invalid.

    Args:
        dag: the DAG to validate.
        workdir_only: whether to only validate the workdir. This is used for
            `exec` as it does not need other files/folders in file_mounts.
    """
    # Client-side validation: paths only exist on this machine.
    for task in dag.tasks:
        task.expand_and_validate_workdir()
        if not workdir_only:
            task.expand_and_validate_file_mounts()
    # Server-side validation of everything else (resources, etc.).
    serialized_dag = dag_utils.dump_chain_dag_to_yaml_str(dag)
    body = payloads.ValidateBody(dag=serialized_dag)
    payload = json.loads(body.model_dump_json())
    url = f'{server_common.get_server_url()}/validate'
    response = requests.post(url, json=payload)
    if response.status_code == 400:
        # A 400 carries a serialized exception in the 'detail' field;
        # re-raise it locally without a noisy traceback.
        with ux_utils.print_exception_no_traceback():
            raise exceptions.deserialize_exception(
                response.json().get('detail'))
|
270
|
+
|
271
|
+
|
272
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def launch(
    task: Union['sky.Task', 'sky.Dag'],
    cluster_name: Optional[str] = None,
    retry_until_up: bool = False,
    idle_minutes_to_autostop: Optional[int] = None,
    dryrun: bool = False,
    down: bool = False,  # pylint: disable=redefined-outer-name
    backend: Optional[backends.Backend] = None,
    optimize_target: common.OptimizeTarget = common.OptimizeTarget.COST,
    no_setup: bool = False,
    clone_disk_from: Optional[str] = None,
    fast: bool = False,
    # Internal only:
    # pylint: disable=invalid-name
    _need_confirmation: bool = False,
    _is_launched_by_jobs_controller: bool = False,
    _is_launched_by_sky_serve_controller: bool = False,
    _disable_controller_check: bool = False,
) -> server_common.RequestId:
    """Launches a cluster or task.

    The task's setup and run commands are executed under the task's workdir
    (when specified, it is synced to remote cluster). The task undergoes job
    queue scheduling on the cluster.

    Currently, the first argument must be a sky.Task, or (EXPERIMENTAL advanced
    usage) a sky.Dag. In the latter case, currently it must contain a single
    task; support for pipelines/general DAGs are in experimental branches.

    Example:
        .. code-block:: python

            import sky
            task = sky.Task(run='echo hello SkyPilot')
            task.set_resources(
                sky.Resources(cloud=sky.AWS(), accelerators='V100:4'))
            sky.launch(task, cluster_name='my-cluster')


    Args:
        task: sky.Task, or sky.Dag (experimental; 1-task only) to launch.
        cluster_name: name of the cluster to create/reuse. If None,
            auto-generate a name.
        retry_until_up: whether to retry launching the cluster until it is
            up.
        idle_minutes_to_autostop: automatically stop the cluster after this
            many minute of idleness, i.e., no running or pending jobs in the
            cluster's job queue. Idleness gets reset whenever setting-up/
            running/pending jobs are found in the job queue. Setting this
            flag is equivalent to running ``sky.launch()`` and then
            ``sky.autostop(idle_minutes=<minutes>)``. If not set, the cluster
            will not be autostopped.
        dryrun: if True, do not actually launch the cluster.
        down: Tear down the cluster after all jobs finish (successfully or
            abnormally). If --idle-minutes-to-autostop is also set, the
            cluster will be torn down after the specified idle time.
            Note that if errors occur during provisioning/data syncing/setting
            up, the cluster will not be torn down for debugging purposes.
        backend: backend to use. If None, use the default backend
            (CloudVMRayBackend).
        optimize_target: target to optimize for. Choices: OptimizeTarget.COST,
            OptimizeTarget.TIME.
        no_setup: if True, do not re-run setup commands.
        clone_disk_from: [Experimental] if set, clone the disk from the
            specified cluster. This is useful to migrate the cluster to a
            different availability zone or region.
        fast: [Experimental] If the cluster is already up and available,
            skip provisioning and setup steps.
        _need_confirmation: (Internal only) If True, show the confirmation
            prompt.

    Returns:
        The request ID of the launch request.

    Request Returns:
        job_id (Optional[int]): the job ID of the submitted job. None if the
            backend is not ``CloudVmRayBackend``, or no job is submitted to the
            cluster.
        handle (Optional[backends.ResourceHandle]): the handle to the cluster.
            None if dryrun.

    Request Raises:
        exceptions.ClusterOwnerIdentityMismatchError: if the cluster is owned
            by another user.
        exceptions.InvalidClusterNameError: if the cluster name is invalid.
        exceptions.ResourcesMismatchError: if the requested resources
            do not match the existing cluster.
        exceptions.NotSupportedError: if required features are not supported
            by the backend/cloud/cluster.
        exceptions.ResourcesUnavailableError: if the requested resources
            cannot be satisfied. The failover_history of the exception will be set
            as:

            1. Empty: iff the first-ever sky.optimize() fails to find a feasible
            resource; no pre-check or actual launch is attempted.

            2. Non-empty: iff at least 1 exception from either our pre-checks
            (e.g., cluster name invalid) or a region/zone throwing resource
            unavailability.

        exceptions.CommandError: any ssh command error.
        exceptions.NoCloudAccessError: if all clouds are disabled.

    Other exceptions may be raised depending on the backend.
    """
    # No name given: auto-generate one so the request always targets a
    # concrete cluster name.
    if cluster_name is None:
        cluster_name = cluster_utils.generate_cluster_name()

    if clone_disk_from is not None:
        with ux_utils.print_exception_no_traceback():
            raise NotImplementedError('clone_disk_from is not implemented yet. '
                                      'Please contact the SkyPilot team if you '
                                      'need this feature at slack.skypilot.co.')
    # Normalize the entrypoint to a DAG and run client-side path validation
    # (workdir/file_mounts) plus server-side validation of the rest.
    dag = dag_utils.convert_entrypoint_to_dag(task)
    validate(dag)

    confirm_shown = False
    if _need_confirmation:
        cluster_status = None
        # TODO(SKY-998): we should reduce RTTs before launching the cluster.
        request_id = status([cluster_name], all_users=True)
        clusters = get(request_id)
        # Defaults describe the local user; overwritten below if the cluster
        # already exists and carries its creator's identity.
        cluster_user_hash = common_utils.get_user_hash()
        cluster_user_hash_str = ''
        cluster_user_name = getpass.getuser()
        if not clusters:
            # Show the optimize log before the prompt if the cluster does not
            # exist.
            request_id = optimize(dag)
            stream_and_get(request_id)
        else:
            cluster_record = clusters[0]
            cluster_status = cluster_record['status']
            cluster_user_hash = cluster_record['user_hash']
            cluster_user_name = cluster_record['user_name']
            if cluster_user_name == getpass.getuser():
                # Only show the hash if the username is the same as the local
                # username, to avoid confusion.
                cluster_user_hash_str = f' (hash: {cluster_user_hash})'

        # Prompt if (1) --cluster is None, or (2) cluster doesn't exist, or (3)
        # it exists but is STOPPED.
        prompt = None
        if cluster_status is None:
            prompt = (
                f'Launching a new cluster {cluster_name!r}. '
                # '{clone_source_str}. '
                'Proceed?')
        elif cluster_status == status_lib.ClusterStatus.STOPPED:
            user_name_str = ''
            if cluster_user_hash != common_utils.get_user_hash():
                user_name_str = (' created by another user '
                                 f'{cluster_user_name!r}'
                                 f'{cluster_user_hash_str}')
            prompt = (f'Restarting the stopped cluster {cluster_name!r}'
                      f'{user_name_str}. Proceed?')
        elif cluster_user_hash != common_utils.get_user_hash():
            # Prompt if the cluster was created by a different user.
            prompt = (f'Cluster {cluster_name!r} was created by another user '
                      f'{cluster_user_name!r}{cluster_user_hash_str}. '
                      'Reusing the cluster. Proceed?')
        if prompt is not None:
            confirm_shown = True
            click.confirm(prompt, default=True, abort=True, show_default=True)

    if not confirm_shown:
        click.secho('Running on cluster: ', fg='cyan', nl=False)
        click.secho(cluster_name)

    # Ship local workdir/file_mounts to the API server before submitting.
    dag = client_common.upload_mounts_to_api_server(dag)

    dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)

    body = payloads.LaunchBody(
        task=dag_str,
        cluster_name=cluster_name,
        retry_until_up=retry_until_up,
        idle_minutes_to_autostop=idle_minutes_to_autostop,
        dryrun=dryrun,
        down=down,
        backend=backend.NAME if backend else None,
        optimize_target=optimize_target,
        no_setup=no_setup,
        clone_disk_from=clone_disk_from,
        fast=fast,
        # For internal use
        quiet_optimizer=_need_confirmation,
        is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
        is_launched_by_sky_serve_controller=(
            _is_launched_by_sky_serve_controller),
        disable_controller_check=_disable_controller_check,
    )
    response = requests.post(
        f'{server_common.get_server_url()}/launch',
        json=json.loads(body.model_dump_json()),
        # NOTE(review): short 5s timeout — presumably the server only enqueues
        # the request and replies with a request ID immediately; confirm that
        # slow submissions cannot legitimately exceed this.
        timeout=5,
    )
    return server_common.get_request_id(response)
|
473
|
+
|
474
|
+
|
475
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def exec(  # pylint: disable=redefined-builtin
    task: Union['sky.Task', 'sky.Dag'],
    cluster_name: Optional[str] = None,
    dryrun: bool = False,
    down: bool = False,  # pylint: disable=redefined-outer-name
    backend: Optional[backends.Backend] = None,
) -> server_common.RequestId:
    """Executes a task on an existing cluster.

    Two actions are performed: (1) the task's workdir is synced, if one is
    specified; (2) the task's ``run`` commands are executed.

    Every other step (provisioning, setup commands, file mounts syncing) is
    skipped, so changes to those specifications in the task are not picked up
    here. To bring a cluster's setup up to date, use ``sky.launch()``
    instead.

    Execution and scheduling behavior:

    - The task goes through job queue scheduling, respecting any specified
      resource requirement, and may run on any node of the cluster with
      enough resources.
    - The task runs under the workdir (if specified).
    - The task runs non-interactively (without a pseudo-terminal or pty), so
      interactive commands such as ``htop`` do not work; use
      ``ssh my_cluster`` for those.

    Args:
        task: sky.Task, or sky.Dag (experimental; 1-task only) containing the
            task to execute.
        cluster_name: name of an existing cluster to execute the task.
        dryrun: if True, do not actually execute the task.
        down: Tear down the cluster after all jobs finish (successfully or
            abnormally). If --idle-minutes-to-autostop is also set, the
            cluster will be torn down after the specified idle time.
            Note that if errors occur during provisioning/data syncing/setting
            up, the cluster will not be torn down for debugging purposes.
        backend: backend to use. If None, use the default backend
            (CloudVMRayBackend).

    Returns:
        The request ID of the exec request.

    Request Returns:
        job_id (Optional[int]): the job ID of the submitted job. None if the
            backend is not CloudVmRayBackend, or no job is submitted to
            the cluster.
        handle (Optional[backends.ResourceHandle]): the handle to the cluster.
            None if dryrun.

    Request Raises:
        ValueError: if the specified cluster is not in UP status.
        sky.exceptions.ClusterDoesNotExist: if the specified cluster does not
            exist.
        sky.exceptions.NotSupportedError: if the specified cluster is a
            controller that does not support this operation.
    """
    dag = dag_utils.convert_entrypoint_to_dag(task)
    validate(dag, workdir_only=True)
    # exec only syncs the workdir; other file mounts are skipped by design.
    dag = client_common.upload_mounts_to_api_server(dag, workdir_only=True)

    body = payloads.ExecBody(
        task=dag_utils.dump_chain_dag_to_yaml_str(dag),
        cluster_name=cluster_name,
        dryrun=dryrun,
        down=down,
        backend=backend.NAME if backend else None,
    )
    response = requests.post(f'{server_common.get_server_url()}/exec',
                             json=json.loads(body.model_dump_json()),
                             timeout=5)
    return server_common.get_request_id(response)
|
556
|
+
|
557
|
+
|
558
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def tail_logs(cluster_name: str,
              job_id: Optional[int],
              follow: bool,
              tail: int = 0,
              output_stream: Optional['io.TextIOBase'] = None) -> None:
    """Tails the logs of a job.

    Args:
        cluster_name: name of the cluster.
        job_id: job id.
        follow: if True, follow the logs; otherwise, return the logs
            immediately.
        tail: if > 0, tail the last N lines of the logs.
        output_stream: the stream to write the logs to. If None, print to
            the console.

    Returns:
        None

    Request Raises:
        ValueError: if arguments are invalid or the cluster is not supported.
        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
        sky.exceptions.ClusterNotUpError: if the cluster is not UP.
        sky.exceptions.NotSupportedError: if the cluster is not based on
            CloudVmRayBackend.
        sky.exceptions.ClusterOwnerIdentityMismatchError: if the current user
            is not the same as the user who created the cluster.
        sky.exceptions.CloudUserIdentityError: if we fail to get the current
            user identity.
    """
    body = payloads.ClusterJobBody(cluster_name=cluster_name,
                                   job_id=job_id,
                                   follow=follow,
                                   tail=tail)
    # Log tailing can legitimately run for a long time, so only the
    # connection phase is bounded; the read timeout is left unbounded (None).
    response = requests.post(
        f'{server_common.get_server_url()}/logs',
        json=json.loads(body.model_dump_json()),
        stream=True,
        timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
                 None))
    stream_response(server_common.get_request_id(response), response,
                    output_stream)
|
605
|
+
|
606
|
+
|
607
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def download_logs(cluster_name: str,
                  job_ids: Optional[List[str]]) -> Dict[str, str]:
    """Downloads the logs of jobs.

    Unlike most other client APIs in this module, this call blocks until the
    request finishes and the log files have been fetched locally.

    Args:
        cluster_name: (str) name of the cluster.
        job_ids: (List[str]) job ids.

    Returns:
        job_log_paths (Dict[str, str]): a mapping of job_id to local log path.

    Request Raises:
        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
        sky.exceptions.ClusterNotUpError: if the cluster is not UP.
        sky.exceptions.NotSupportedError: if the cluster is not based on
            CloudVmRayBackend.
        sky.exceptions.ClusterOwnerIdentityMismatchError: if the current user
            is not the same as the user who created the cluster.
        sky.exceptions.CloudUserIdentityError: if we fail to get the current
            user identity.
    """
    body = payloads.ClusterJobsDownloadLogsBody(
        cluster_name=cluster_name,
        job_ids=job_ids,
    )
    # Bound the request submission like the sibling endpoints in this module;
    # without a timeout, requests.post() may hang indefinitely.
    response = requests.post(f'{server_common.get_server_url()}/download_logs',
                             json=json.loads(body.model_dump_json()),
                             timeout=5)
    # The server returns job_id -> remote log path; wait for the request to
    # complete, then pull the log files from the API server to this machine.
    job_id_remote_path_dict = stream_and_get(
        server_common.get_request_id(response))
    remote2local_path_dict = client_common.download_logs_from_api_server(
        job_id_remote_path_dict.values())
    return {
        job_id: remote2local_path_dict[remote_path]
        for job_id, remote_path in job_id_remote_path_dict.items()
    }
|
648
|
+
|
649
|
+
|
650
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def start(
    cluster_name: str,
    idle_minutes_to_autostop: Optional[int] = None,
    retry_until_up: bool = False,
    down: bool = False,  # pylint: disable=redefined-outer-name
    force: bool = False,
) -> server_common.RequestId:
    """Restart a cluster.

    For a cluster that was previously stopped (status STOPPED) or that failed
    in provisioning/runtime installation (status INIT), this attempts to
    start the cluster; in the INIT case, provisioning and runtime
    installation are retried.

    A stopped cluster is restarted without auto-failover provisioning: the
    same cloud, region, and zone chosen before are reused.

    Calling this on a cluster already in the UP status has no effect.

    Args:
        cluster_name: name of the cluster to start.
        idle_minutes_to_autostop: automatically stop the cluster after this
            many minutes of idleness, i.e., no running or pending jobs in the
            cluster's job queue. Idleness gets reset whenever setting-up/
            running/pending jobs are found in the job queue. Setting this
            flag is equivalent to running ``sky.launch()`` and then
            ``sky.autostop(idle_minutes=<minutes>)``. If not set, the
            cluster will not be autostopped.
        retry_until_up: whether to retry launching the cluster until it is
            up.
        down: Autodown the cluster: tear down the cluster after specified
            minutes of idle time after all jobs finish (successfully or
            abnormally). Requires ``idle_minutes_to_autostop`` to be set.
        force: whether to force start the cluster even if it is already up.
            Useful for upgrading SkyPilot runtime.

    Returns:
        The request ID of the start request.

    Request Returns:
        None

    Request Raises:
        ValueError: argument values are invalid: (1) if ``down`` is set to
            True but ``idle_minutes_to_autostop`` is None; (2) if the
            specified cluster is the managed jobs controller, and either
            ``idle_minutes_to_autostop`` is not None or ``down`` is True
            (omit them to use the default autostop settings).
        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
            exist.
        sky.exceptions.NotSupportedError: if the cluster to restart was
            launched using a non-default backend that does not support this
            operation.
        sky.exceptions.ClusterOwnerIdentitiesMismatchError: if the cluster to
            restart was launched by a different user.
    """
    body = payloads.StartBody(
        cluster_name=cluster_name,
        idle_minutes_to_autostop=idle_minutes_to_autostop,
        retry_until_up=retry_until_up,
        down=down,
        force=force)
    resp = requests.post(f'{server_common.get_server_url()}/start',
                         json=json.loads(body.model_dump_json()),
                         timeout=5)
    return server_common.get_request_id(resp)
|
723
|
+
|
724
|
+
|
725
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def down(cluster_name: str, purge: bool = False) -> server_common.RequestId:
    """Tears down a cluster.

    All resources associated with the cluster are deleted (all billing
    stops), and any data on its attached disks is lost. Accelerators that are
    part of the cluster (e.g., TPUs) are deleted as well.

    Args:
        cluster_name: name of the cluster to down.
        purge: (Advanced) Forcefully remove the cluster from SkyPilot's
            cluster table, even if the actual cluster termination failed on
            the cloud. WARNING: This flag should only be set sparingly in
            certain manual troubleshooting scenarios; with it set, it is the
            user's responsibility to ensure there are no leaked instances
            and related resources.

    Returns:
        The request ID of the down request.

    Request Returns:
        None

    Request Raises:
        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
            exist.
        RuntimeError: failed to tear down the cluster.
        sky.exceptions.NotSupportedError: the specified cluster is the
            managed jobs controller.
    """
    body = payloads.StopOrDownBody(cluster_name=cluster_name, purge=purge)
    resp = requests.post(f'{server_common.get_server_url()}/down',
                         json=json.loads(body.model_dump_json()),
                         timeout=5)
    return server_common.get_request_id(resp)
|
768
|
+
|
769
|
+
|
770
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def stop(cluster_name: str, purge: bool = False) -> server_common.RequestId:
    """Stops a cluster.

    Stopping does not lose data on attached disks: instance billing stops
    while the disks remain (and continue to be charged), and the disks are
    reattached when the cluster is restarted.

    Currently, spot instance clusters cannot be stopped (except for GCP,
    which does allow disk contents to be preserved when stopping spot VMs).

    Args:
        cluster_name: name of the cluster to stop.
        purge: (Advanced) Forcefully mark the cluster as stopped in
            SkyPilot's cluster table, even if the actual cluster stop
            operation failed on the cloud. WARNING: This flag should only be
            set sparingly in certain manual troubleshooting scenarios; with
            it set, it is the user's responsibility to ensure there are no
            leaked instances and related resources.

    Returns:
        The request ID of the stop request.

    Request Returns:
        None

    Request Raises:
        sky.exceptions.ClusterDoesNotExist: the specified cluster does not
            exist.
        RuntimeError: failed to stop the cluster.
        sky.exceptions.NotSupportedError: if the specified cluster is a spot
            cluster, or a TPU VM Pod cluster, or the managed jobs controller.
    """
    body = payloads.StopOrDownBody(cluster_name=cluster_name, purge=purge)
    resp = requests.post(f'{server_common.get_server_url()}/stop',
                         json=json.loads(body.model_dump_json()),
                         timeout=5)
    return server_common.get_request_id(resp)
|
816
|
+
|
817
|
+
|
818
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def autostop(
    cluster_name: str,
    idle_minutes: int,
    down: bool = False  # pylint: disable=redefined-outer-name
) -> server_common.RequestId:
    """Schedules an autostop/autodown for a cluster.

    Autostop/autodown automatically stops or tears down a cluster once it
    has been idle for the given duration. A cluster counts as idle when its
    job queue has no in-progress (pending/running) jobs.

    A cluster's idleness timer resets to zero whenever:

    - A job is submitted (``sky.launch()`` or ``sky.exec()``).

    - The cluster has restarted.

    - An autostop is set when there is no active setting. (Namely, either
      there's never any autostop setting set, or the previous autostop
      setting was canceled.) This is useful for restarting the autostop
      timer.

    Example: say a cluster without any autostop set has been idle for 1
    hour, then an autostop of 30 minutes is set. The cluster will not be
    immediately autostopped. Instead, the idleness timer only starts
    counting after the autostop setting was set.

    When multiple autostop settings are specified for the same cluster, the
    last setting takes precedence.

    Args:
        cluster_name: name of the cluster.
        idle_minutes: the number of minutes of idleness (no pending/running
            jobs) after which the cluster will be stopped automatically.
            Setting to a negative number cancels any autostop/autodown
            setting.
        down: if true, use autodown (tear down the cluster;
            non-restartable), rather than autostop (restartable).

    Returns:
        The request ID of the autostop request.

    Request Returns:
        None

    Request Raises:
        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
        sky.exceptions.ClusterNotUpError: if the cluster is not UP.
        sky.exceptions.NotSupportedError: if the cluster is not based on
            CloudVmRayBackend or the cluster is TPU VM Pod.
        sky.exceptions.ClusterOwnerIdentityMismatchError: if the current user
            is not the same as the user who created the cluster.
        sky.exceptions.CloudUserIdentityError: if we fail to get the current
            user identity.
    """
    body = payloads.AutostopBody(cluster_name=cluster_name,
                                 idle_minutes=idle_minutes,
                                 down=down)
    resp = requests.post(f'{server_common.get_server_url()}/autostop',
                         json=json.loads(body.model_dump_json()),
                         timeout=5)
    return server_common.get_request_id(resp)
|
885
|
+
|
886
|
+
|
887
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def queue(cluster_name: str,
          skip_finished: bool = False,
          all_users: bool = False) -> server_common.RequestId:
    """Gets the job queue of a cluster.

    Args:
        cluster_name: name of the cluster.
        skip_finished: if True, skip finished jobs.
        all_users: if True, return jobs from all users.

    Returns:
        The request ID of the queue request.

    Request Returns:
        job_records (List[Dict[str, Any]]): A list of dicts for each job in
            the queue.

        .. code-block:: python

            [
                {
                    'job_id': (int) job id,
                    'job_name': (str) job name,
                    'username': (str) username,
                    'user_hash': (str) user hash,
                    'submitted_at': (int) timestamp of submitted,
                    'start_at': (int) timestamp of started,
                    'end_at': (int) timestamp of ended,
                    'resources': (str) resources,
                    'status': (job_lib.JobStatus) job status,
                    'log_path': (str) log path,
                }
            ]

    Request Raises:
        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
        sky.exceptions.ClusterNotUpError: if the cluster is not UP.
        sky.exceptions.NotSupportedError: if the cluster is not based on
            ``CloudVmRayBackend``.
        sky.exceptions.ClusterOwnerIdentityMismatchError: if the current user
            is not the same as the user who created the cluster.
        sky.exceptions.CloudUserIdentityError: if we fail to get the current
            user identity.
        sky.exceptions.CommandError: if failed to get the job queue with ssh.
    """
    body = payloads.QueueBody(
        cluster_name=cluster_name,
        skip_finished=skip_finished,
        all_users=all_users,
    )
    # This call only submits the request and returns a request ID, so bound
    # it with the same timeout the sibling endpoints use; without a timeout,
    # requests.post() may hang indefinitely.
    response = requests.post(f'{server_common.get_server_url()}/queue',
                             json=json.loads(body.model_dump_json()),
                             timeout=5)
    return server_common.get_request_id(response)
|
944
|
+
|
945
|
+
|
946
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def job_status(cluster_name: str,
               job_ids: Optional[List[int]] = None) -> server_common.RequestId:
    """Gets the status of jobs on a cluster.

    Args:
        cluster_name: name of the cluster.
        job_ids: job ids. If None, get the status of the last job.

    Returns:
        The request ID of the job status request.

    Request Returns:
        job_statuses (Dict[Optional[int], Optional[job_lib.JobStatus]]): A
            mapping of job_id to job statuses. The status will be None if the
            job does not exist. If job_ids is None and there is no job on the
            cluster, it will return {None: None}.

    Request Raises:
        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
        sky.exceptions.ClusterNotUpError: if the cluster is not UP.
        sky.exceptions.NotSupportedError: if the cluster is not based on
            ``CloudVmRayBackend``.
        sky.exceptions.ClusterOwnerIdentityMismatchError: if the current user
            is not the same as the user who created the cluster.
        sky.exceptions.CloudUserIdentityError: if we fail to get the current
            user identity.
    """
    # TODO: merge this into the queue endpoint, i.e., let the queue endpoint
    # take job_ids to filter the returned jobs.
    body = payloads.JobStatusBody(
        cluster_name=cluster_name,
        job_ids=job_ids,
    )
    # Bound the request submission like the sibling endpoints; without a
    # timeout, requests.post() may hang indefinitely.
    response = requests.post(f'{server_common.get_server_url()}/job_status',
                             json=json.loads(body.model_dump_json()),
                             timeout=5)
    return server_common.get_request_id(response)
|
985
|
+
|
986
|
+
|
987
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def cancel(
    cluster_name: str,
    all: bool = False,  # pylint: disable=redefined-builtin
    all_users: bool = False,
    job_ids: Optional[List[int]] = None,
    # pylint: disable=invalid-name
    _try_cancel_if_cluster_is_init: bool = False
) -> server_common.RequestId:
    """Cancels jobs on a cluster.

    Args:
        cluster_name: name of the cluster.
        all: if True, cancel all jobs.
        all_users: if True, cancel all jobs from all users.
        job_ids: a list of job IDs to cancel.
        _try_cancel_if_cluster_is_init: (bool) whether to try cancelling the
            job even if the cluster is not UP, but the head node is still
            alive. This is used by the jobs controller to cancel the job
            when the worker node is preempted in the spot cluster.

    Returns:
        The request ID of the cancel request.

    Request Returns:
        None

    Request Raises:
        ValueError: if arguments are invalid.
        sky.exceptions.ClusterDoesNotExist: if the cluster does not exist.
        sky.exceptions.ClusterNotUpError: if the cluster is not UP.
        sky.exceptions.NotSupportedError: if the specified cluster is a
            controller that does not support this operation.
        sky.exceptions.ClusterOwnerIdentityMismatchError: if the current user
            is not the same as the user who created the cluster.
        sky.exceptions.CloudUserIdentityError: if we fail to get the current
            user identity.
    """
    body = payloads.CancelBody(
        cluster_name=cluster_name,
        all=all,
        all_users=all_users,
        job_ids=job_ids,
        try_cancel_if_cluster_is_init=_try_cancel_if_cluster_is_init,
    )
    # Bound the request submission like the sibling endpoints; without a
    # timeout, requests.post() may hang indefinitely.
    response = requests.post(f'{server_common.get_server_url()}/cancel',
                             json=json.loads(body.model_dump_json()),
                             timeout=5)
    return server_common.get_request_id(response)
|
1038
|
+
|
1039
|
+
|
1040
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def status(
    cluster_names: Optional[List[str]] = None,
    refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
    all_users: bool = False,
) -> server_common.RequestId:
    """Gets cluster statuses.

    If cluster_names is given, return those clusters. Otherwise, return all
    clusters.

    Each cluster can have one of the following statuses:

    - ``INIT``: The cluster may be live or down. It can happen in the
      following cases:

      - Ongoing provisioning or runtime setup. (A ``sky.launch()`` has
        started but has not completed.)
      - Or, the cluster is in an abnormal state, e.g., some cluster nodes are
        down, or the SkyPilot runtime is unhealthy. (To recover the cluster,
        try ``sky launch`` again on it.)

    - ``UP``: Provisioning and runtime setup have succeeded and the cluster
      is live. (The most recent ``sky.launch()`` has completed successfully.)

    - ``STOPPED``: The cluster is stopped and the storage is persisted. Use
      ``sky.start()`` to restart the cluster.

    Autostop column:

    - The autostop column indicates how long the cluster will be autostopped
      after minutes of idling (no jobs running). If ``to_down`` is True, the
      cluster will be autodowned, rather than autostopped.

    Getting up-to-date cluster statuses:

    - In normal cases where clusters are entirely managed by SkyPilot (i.e.,
      no manual operations in cloud consoles) and no autostopping is used,
      the table returned by this command will accurately reflect the cluster
      statuses.

    - In cases where the clusters are changed outside of SkyPilot (e.g.,
      manual operations in cloud consoles; unmanaged spot clusters getting
      preempted) or for autostop-enabled clusters, use ``refresh=True`` to
      query the latest cluster statuses from the cloud providers.

    Args:
        cluster_names: a list of cluster names to query. If not
            provided, all clusters will be queried.
        refresh: whether to query the latest cluster statuses from the cloud
            provider(s).
        all_users: whether to include all users' clusters. By default, only
            the current user's clusters are included.

    Returns:
        The request ID of the status request.

    Request Returns:
        cluster_records (List[Dict[str, Any]]): A list of dicts, with each
            dict containing the information of a cluster. If a cluster is
            found to be terminated or not found, it will be omitted from the
            returned list.

        .. code-block:: python

            {
                'name': (str) cluster name,
                'launched_at': (int) timestamp of last launch on this cluster,
                'handle': (ResourceHandle) an internal handle to the cluster,
                'last_use': (str) the last command/entrypoint that affected
                    this cluster,
                'status': (sky.ClusterStatus) cluster status,
                'autostop': (int) idle time before autostop,
                'to_down': (bool) whether autodown is used instead of
                    autostop,
                'metadata': (dict) metadata of the cluster,
                'user_hash': (str) user hash of the cluster owner,
                'user_name': (str) user name of the cluster owner,
                'resources_str': (str) the resource string representation of
                    the cluster,
            }

    """
    # TODO(zhwu): this does not stream the logs output by logger back to the
    # user, due to the rich progress implementation.
    body = payloads.StatusBody(
        cluster_names=cluster_names,
        refresh=refresh,
        all_users=all_users,
    )
    # Bound the request submission like the sibling endpoints; without a
    # timeout, requests.post() may hang indefinitely.
    response = requests.post(f'{server_common.get_server_url()}/status',
                             json=json.loads(body.model_dump_json()),
                             timeout=5)
    return server_common.get_request_id(response)
|
1133
|
+
|
1134
|
+
|
1135
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def endpoints(
        cluster: str,
        port: Optional[Union[int, str]] = None) -> server_common.RequestId:
    """Gets the endpoint for a given cluster and port number (endpoint).

    Args:
        cluster: The name of the cluster.
        port: The port number to get the endpoint for. If None, endpoints
            for all ports are returned.

    Returns:
        The request ID of the endpoints request.

    Request Returns:
        A dictionary of port numbers to endpoints. If port is None,
        the dictionary will contain all ports:endpoints exposed on the
        cluster.

    Request Raises:
        ValueError: if the cluster is not UP or the endpoint is not exposed.
        RuntimeError: if the cluster has no ports to be exposed or no
            endpoints are exposed yet.
    """
    body = payloads.EndpointsBody(
        cluster=cluster,
        port=port,
    )
    # Bound the request submission like the sibling endpoints; without a
    # timeout, requests.post() may hang indefinitely.
    response = requests.post(f'{server_common.get_server_url()}/endpoints',
                             json=json.loads(body.model_dump_json()),
                             timeout=5)
    return server_common.get_request_id(response)
|
1167
|
+
|
1168
|
+
|
1169
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def cost_report() -> server_common.RequestId:
    """Gets all cluster cost reports, including those that have been downed.

    The estimated cost column indicates price for the cluster based on the
    type of resources being used and the duration of use up until the call
    to status. This means if the cluster is UP, successive calls to report
    will show increasing price. The estimated cost is calculated based on
    the local cache of the cluster status, and may not be accurate for the
    cluster with autostop/use_spot set or terminated/stopped on the cloud
    console.

    Returns:
        The request ID of the cost report request.

    Request Returns:
        cluster_cost_records (List[Dict[str, Any]]): A list of dicts, with
            each dict containing the cost information of a cluster.

        .. code-block:: python

            {
                'name': (str) cluster name,
                'launched_at': (int) timestamp of last launch on this cluster,
                'duration': (int) total seconds that cluster was up and
                    running,
                'last_use': (str) the last command/entrypoint that affected
                    this cluster,
                'num_nodes': (int) number of nodes launched for cluster,
                'resources': (resources.Resources) type of resource launched,
                'cluster_hash': (str) unique hash identifying cluster,
                'usage_intervals': (List[Tuple[int, int]]) cluster usage
                    times,
                'total_cost': (float) cost given resources and usage
                    intervals,
            }
    """
    # Bound the request like the sibling endpoints; without a timeout,
    # requests.get() may hang indefinitely.
    response = requests.get(f'{server_common.get_server_url()}/cost_report',
                            timeout=5)
    return server_common.get_request_id(response)
|
1205
|
+
|
1206
|
+
|
1207
|
+
# === Storage APIs ===
|
1208
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def storage_ls() -> server_common.RequestId:
    """Gets the storages.

    Returns:
        The request ID of the storage list request.

    Request Returns:
        storage_records (List[Dict[str, Any]]): A list of dicts, with each
            dict containing the information of a storage.

        .. code-block:: python

            [
                {
                    'name': (str) storage name,
                    'launched_at': (int) timestamp of creation,
                    'store': (List[sky.StoreType]) storage type,
                    'last_use': (int) timestamp of last use,
                    'status': (sky.StorageStatus) storage status,
                }
            ]
    """
    # Bound the request like the sibling endpoints; without a timeout,
    # requests.get() may hang indefinitely.
    response = requests.get(f'{server_common.get_server_url()}/storage/ls',
                            timeout=5)
    return server_common.get_request_id(response)
|
1234
|
+
|
1235
|
+
|
1236
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def storage_delete(name: str) -> server_common.RequestId:
    """Deletes a storage.

    Args:
        name: The name of the storage to delete.

    Returns:
        The request ID of the storage delete request.

    Request Returns:
        None

    Request Raises:
        ValueError: If the storage does not exist.
    """
    payload = payloads.StorageBody(name=name)
    url = f'{server_common.get_server_url()}/storage/delete'
    response = requests.post(url, json=json.loads(payload.model_dump_json()))
    return server_common.get_request_id(response)
|
1258
|
+
|
1259
|
+
|
1260
|
+
# === Kubernetes ===
|
1261
|
+
|
1262
|
+
|
1263
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def local_up(gpus: bool, ips: Optional[List[str]], ssh_user: Optional[str],
             ssh_key: Optional[str], cleanup: bool) -> server_common.RequestId:
    """Launches a Kubernetes cluster on local machines.

    Returns:
        request_id: The request ID of the local up request.
    """
    # Local up mutates the kubeconfig, so it is only allowed when the API
    # server is running on this machine.
    # TODO: move this check to server.
    if not server_common.is_api_server_local():
        with ux_utils.print_exception_no_traceback():
            raise ValueError(
                'sky local up is only supported when running SkyPilot locally.')

    payload = payloads.LocalUpBody(gpus=gpus,
                                   ips=ips,
                                   ssh_user=ssh_user,
                                   ssh_key=ssh_key,
                                   cleanup=cleanup)
    url = f'{server_common.get_server_url()}/local_up'
    response = requests.post(url, json=json.loads(payload.model_dump_json()))
    return server_common.get_request_id(response)
|
1289
|
+
|
1290
|
+
|
1291
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def local_down() -> server_common.RequestId:
    """Tears down the Kubernetes cluster started by local_up."""
    # Like local_up, this mutates the kubeconfig and is therefore only
    # permitted when the API server runs on this machine.
    # TODO: move this check to remote server.
    if not server_common.is_api_server_local():
        with ux_utils.print_exception_no_traceback():
            raise ValueError('sky local down is only supported when running '
                             'SkyPilot locally.')
    url = f'{server_common.get_server_url()}/local_down'
    return server_common.get_request_id(requests.post(url))
|
1305
|
+
|
1306
|
+
|
1307
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def realtime_kubernetes_gpu_availability(
        context: Optional[str] = None,
        name_filter: Optional[str] = None,
        quantity_filter: Optional[int] = None) -> server_common.RequestId:
    """Gets the real-time Kubernetes GPU availability.

    Returns:
        The request ID of the real-time Kubernetes GPU availability request.
    """
    payload = payloads.RealtimeGpuAvailabilityRequestBody(
        context=context,
        name_filter=name_filter,
        quantity_filter=quantity_filter,
    )
    url = (f'{server_common.get_server_url()}/'
           'realtime_kubernetes_gpu_availability')
    response = requests.post(url, json=json.loads(payload.model_dump_json()))
    return server_common.get_request_id(response)
|
1329
|
+
|
1330
|
+
|
1331
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def kubernetes_node_info(
        context: Optional[str] = None) -> server_common.RequestId:
    """Gets the resource information for all the nodes in the cluster.

    Currently only GPU resources are supported. The function returns the total
    number of GPUs available on the node and the number of free GPUs on the
    node.

    If the user does not have sufficient permissions to list pods in all
    namespaces, the function will return free GPUs as -1.

    Args:
        context: The Kubernetes context. If None, the default context is used.

    Returns:
        The request ID of the Kubernetes node info request.

    Request Returns:
        Dict[str, KubernetesNodeInfo]: Dictionary containing the node name as
          key and the KubernetesNodeInfo object as value
    """
    payload = payloads.KubernetesNodeInfoRequestBody(context=context)
    url = f'{server_common.get_server_url()}/kubernetes_node_info'
    response = requests.post(url, json=json.loads(payload.model_dump_json()))
    return server_common.get_request_id(response)
|
1360
|
+
|
1361
|
+
|
1362
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def status_kubernetes() -> server_common.RequestId:
    """Gets all SkyPilot clusters and jobs in the Kubernetes cluster.

    Managed jobs and services are also included in the clusters returned.
    The caller must parse the controllers to identify which clusters are run
    as managed jobs or services.

    Returns:
        The request ID of the status request.

    Request Returns:
        A tuple containing:
        - all_clusters: List of KubernetesSkyPilotClusterInfoPayload with info
          for all clusters, including managed jobs, services and controllers.
        - unmanaged_clusters: List of KubernetesSkyPilotClusterInfoPayload with
          info for all clusters excluding managed jobs and services.
          Controllers are included.
        - all_jobs: List of managed jobs from all controllers. Each entry is a
          dictionary job info, see jobs.queue_from_kubernetes_pod for details.
        - context: Kubernetes context used to fetch the cluster information.
    """
    url = f'{server_common.get_server_url()}/status_kubernetes'
    return server_common.get_request_id(requests.get(url))
|
1389
|
+
|
1390
|
+
|
1391
|
+
# === API request APIs ===
|
1392
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def get(request_id: str) -> Any:
    """Waits for and gets the result of a request.

    Args:
        request_id: The request ID of the request to get.

    Returns:
        The ``Request Returns`` of the specified request. See the documentation
        of the specific requests above for more details.

    Raises:
        Exception: It raises the same exceptions as the specific requests,
            see ``Request Raises`` in the documentation of the specific requests
            above.
    """
    # Block until the server finishes the request; only the connection phase
    # is bounded by a timeout, the read waits indefinitely (timeout=None).
    response = requests.get(
        f'{server_common.get_server_url()}/api/get?request_id={request_id}',
        timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
                 None))
    request_task = None
    if response.status_code == 200:
        # Success: the body is the request record itself.
        request_task = requests_lib.Request.decode(
            requests_lib.RequestPayload(**response.json()))
    elif response.status_code == 500:
        # A failed request is reported as 500 with the record embedded under
        # 'detail'; best-effort decode so we can surface the original error.
        try:
            request_task = requests_lib.Request.decode(
                requests_lib.RequestPayload(**response.json().get('detail')))
            logger.debug(f'Got request with error: {request_task.name}')
        except Exception:  # pylint: disable=broad-except
            # Body was not a decodable request record; fall through to the
            # generic RuntimeError below.
            request_task = None
    if request_task is None:
        # Any other status code, or an undecodable 500 body.
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError(f'Failed to get request {request_id}: '
                               f'{response.status_code} {response.text}')
    # Re-raise the exception recorded on the server side, if any, so callers
    # see the same exception type the underlying request raised.
    error = request_task.get_error()
    if error is not None:
        error_obj = error['object']
        if env_options.Options.SHOW_DEBUG_INFO.get():
            stacktrace = getattr(error_obj, 'stacktrace', str(error_obj))
            logger.error('=== Traceback on SkyPilot API Server ===\n'
                         f'{stacktrace}')
        with ux_utils.print_exception_no_traceback():
            raise error_obj
    if request_task.status == requests_lib.RequestStatus.CANCELLED:
        with ux_utils.print_exception_no_traceback():
            raise exceptions.RequestCancelled(
                f'{colorama.Fore.YELLOW}Current {request_task.name!r} request '
                f'({request_task.request_id}) is cancelled by another process.'
                f'{colorama.Style.RESET_ALL}')
    return request_task.get_return_value()
|
1445
|
+
|
1446
|
+
|
1447
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def stream_and_get(
    request_id: Optional[str] = None,
    log_path: Optional[str] = None,
    tail: Optional[int] = None,
    follow: bool = True,
    output_stream: Optional['io.TextIOBase'] = None,
) -> Any:
    """Streams the logs of a request or a log file and gets the final result.

    This will block until the request is finished. The request id can be a
    prefix of the full request id.

    Args:
        request_id: The prefix of the request ID of the request to stream.
        log_path: The path to the log file to stream.
        tail: The number of lines to show from the end of the logs.
            If None, show all logs.
        follow: Whether to follow the logs.
        output_stream: The output stream to write to. If None, print to the
            console.

    Returns:
        The ``Request Returns`` of the specified request. See the documentation
        of the specific requests above for more details.

    Raises:
        Exception: It raises the same exceptions as the specific requests,
            see ``Request Raises`` in the documentation of the specific requests
            above.
    """
    query = {
        'request_id': request_id,
        'log_path': log_path,
        'tail': str(tail) if tail is not None else None,
        'follow': follow,
        'format': 'console',
    }
    # Only the connection is bounded by a timeout; the streamed read is not.
    response = requests.get(
        f'{server_common.get_server_url()}/api/stream',
        params=query,
        timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
                 None),
        stream=True)
    status = response.status_code
    if status in (400, 404):
        detail = response.json().get('detail')
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError(f'Failed to stream logs: {detail}')
    if status != 200:
        # Streaming unavailable for some other reason; fall back to polling
        # for the final result.
        return get(request_id)
    return stream_response(request_id, response, output_stream)
|
1500
|
+
|
1501
|
+
|
1502
|
+
@usage_lib.entrypoint
@annotations.client_api
def api_cancel(request_ids: Optional[Union[str, List[str]]] = None,
               all_users: bool = False,
               silent: bool = False) -> server_common.RequestId:
    """Aborts a request or all requests.

    Args:
        request_ids: The request ID(s) to abort. Can be a single string or a
            list of strings.
        all_users: Whether to abort all requests from all users.
        silent: Whether to suppress the output.

    Returns:
        The request ID of the abort request itself.

    Request Returns:
        A list of request IDs that were cancelled.

    Raises:
        click.BadParameter: If no request ID is specified and not all or
            all_users is not set.
    """
    echo = logger.debug if silent else logger.info
    # Scope the cancellation to the current user unless all_users is set.
    user_id = None if all_users else common_utils.get_user_hash()

    # Normalize a single request ID into a list.
    if isinstance(request_ids, str):
        request_ids = [request_ids]

    payload = payloads.RequestCancelBody(request_ids=request_ids,
                                         user_id=user_id)
    if all_users:
        echo('Cancelling all users\' requests...')
    elif request_ids is None:
        echo(f'Cancelling all requests for user {user_id!r}...')
    else:
        ids_repr = ', '.join(repr(rid) for rid in request_ids)
        plural = 's' if len(request_ids) > 1 else ''
        echo(f'Cancelling {len(request_ids)} request{plural}: '
             f'{ids_repr}...')

    response = requests.post(f'{server_common.get_server_url()}/api/cancel',
                             json=json.loads(payload.model_dump_json()),
                             timeout=5)
    return server_common.get_request_id(response)
|
1550
|
+
|
1551
|
+
|
1552
|
+
@usage_lib.entrypoint
@annotations.client_api
def api_status(
    request_ids: Optional[List[str]] = None,
    all_status: bool = False
) -> List[requests_lib.RequestPayload]:
    """Lists all requests.

    Args:
        request_ids: The prefixes of the request IDs of the requests to query.
            If None, all requests are queried.
        all_status: Whether to list all finished requests as well. This argument
            is ignored if request_ids is not None.

    Returns:
        A list of request payloads.
    """
    payload = payloads.RequestStatusBody(request_ids=request_ids,
                                         all_status=all_status)
    response = requests.get(
        f'{server_common.get_server_url()}/api/status',
        params=server_common.request_body_to_params(payload),
        timeout=(client_common.API_SERVER_REQUEST_CONNECTION_TIMEOUT_SECONDS,
                 None))
    server_common.handle_request_error(response)
    return [requests_lib.RequestPayload(**entry) for entry in response.json()]
|
1581
|
+
|
1582
|
+
|
1583
|
+
# === API server management APIs ===
|
1584
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def api_info() -> Dict[str, str]:
    """Gets the server's status, commit and version.

    Returns:
        A dictionary containing the server's status, commit and version.

        .. code-block:: python

            {
                'status': 'healthy',
                'api_version': '1',
                'commit': 'abc1234567890',
                'version': '1.0.0',
            }

    """
    url = f'{server_common.get_server_url()}/api/health'
    response = requests.get(url)
    response.raise_for_status()
    return response.json()
|
1606
|
+
|
1607
|
+
|
1608
|
+
@usage_lib.entrypoint
@annotations.client_api
def api_start(
    *,
    deploy: bool = False,
    host: str = '127.0.0.1',
    foreground: bool = False,
) -> None:
    """Starts the API server.

    It checks the existence of the API server and starts it if it does not
    exist.

    Args:
        deploy: Whether to deploy the API server, i.e. fully utilize the
            resources of the machine.
        host: The host to deploy the API server. It will be set to 0.0.0.0
            if deploy is True, to allow remote access.
        foreground: Whether to run the API server in the foreground (run in
            the current process).
    Returns:
        None
    """
    # Deploy mode always listens on all interfaces for remote access.
    if deploy:
        host = '0.0.0.0'
    if host not in server_common.AVAILBLE_LOCAL_API_SERVER_HOSTS:
        raise ValueError(f'Invalid host: {host}. Should be one of: '
                         f'{server_common.AVAILBLE_LOCAL_API_SERVER_HOSTS}')
    # A remote endpoint being configured means we must not spawn a local
    # server that would silently shadow it.
    if not server_common.is_api_server_local():
        server_url = server_common.get_server_url()
        with ux_utils.print_exception_no_traceback():
            raise ValueError(f'Unable to start local API server: '
                             f'server endpoint is set to {server_url}. '
                             'To start a local API server, remove the endpoint '
                             'from the config file and/or unset the '
                             'SKYPILOT_API_SERVER_ENDPOINT environment '
                             'variable.')
    server_common.check_server_healthy_or_start_fn(deploy, host, foreground)
    if foreground:
        # Explain why current process exited
        logger.info('API server is already running:')
        logger.info(f'{ux_utils.INDENT_SYMBOL}SkyPilot API server: '
                    f'{server_common.get_server_url(host)}\n'
                    f'{ux_utils.INDENT_LAST_SYMBOL}'
                    f'View API server logs at: {constants.API_SERVER_LOGS}')
|
1654
|
+
|
1655
|
+
|
1656
|
+
@usage_lib.entrypoint
@annotations.client_api
def api_stop() -> None:
    """Stops the API server.

    It will do nothing if the API server is remotely hosted.

    Returns:
        None
    """
    # Kill the uvicorn process by name: uvicorn sky.server.server:app
    server_url = server_common.get_server_url()
    if not server_common.is_api_server_local():
        with ux_utils.print_exception_no_traceback():
            raise RuntimeError(
                f'Cannot kill the API server at {server_url} because it is not '
                f'the default SkyPilot API server started locally.')

    killed_any = False
    for proc in psutil.process_iter(attrs=['pid', 'cmdline']):
        cmdline = proc.info['cmdline']
        if cmdline and server_common.API_SERVER_CMD in ' '.join(cmdline):
            subprocess_utils.kill_children_processes(parent_pids=[proc.pid],
                                                     force=True)
            killed_any = True

    # Remove the database for requests including any files starting with
    # api.constants.API_SERVER_REQUEST_DB_PATH
    db_path = os.path.expanduser(server_constants.API_SERVER_REQUEST_DB_PATH)
    for suffix in ('', '-shm', '-wal'):
        try:
            os.remove(f'{db_path}{suffix}')
        except FileNotFoundError:
            logger.debug(f'Database file {db_path}{suffix} not found.')

    if killed_any:
        logger.info(f'{colorama.Fore.GREEN}SkyPilot API server stopped.'
                    f'{colorama.Style.RESET_ALL}')
    else:
        logger.info('SkyPilot API server is not running.')
|
1696
|
+
|
1697
|
+
|
1698
|
+
# Use the same args as `docker logs`
|
1699
|
+
@usage_lib.entrypoint
@annotations.client_api
def api_server_logs(follow: bool = True, tail: Optional[int] = None) -> None:
    """Streams the API server logs.

    Args:
        follow: Whether to follow the logs.
        tail: the number of lines to show from the end of the logs.
            If None, show all logs.

    Returns:
        None
    """
    if not server_common.is_api_server_local():
        # Remote server: stream the log file through the API instead of
        # tailing a local file.
        stream_and_get(log_path=constants.API_SERVER_LOGS, tail=tail)
        return
    args = ['tail']
    if follow:
        args.append('-f')
    # '-n +1' prints the file from the beginning; '-n <tail>' the last lines.
    args.extend(['-n', '+1' if tail is None else str(tail)])
    args.append(os.path.expanduser(constants.API_SERVER_LOGS))
    subprocess.run(args, check=False)
|
1722
|
+
|
1723
|
+
|
1724
|
+
@usage_lib.entrypoint
@annotations.client_api
def api_login(endpoint: Optional[str] = None) -> None:
    """Logs into a SkyPilot API server.

    This sets the endpoint globally, i.e., all SkyPilot CLI and SDK calls will
    use this endpoint.

    To temporarily override the endpoint, use the environment variable
    `SKYPILOT_API_SERVER_ENDPOINT` instead.

    Args:
        endpoint: The endpoint of the SkyPilot API server, e.g.,
            http://1.2.3.4:46580 or https://skypilot.mydomain.com.

    Returns:
        None
    """
    # TODO(zhwu): this SDK sets global endpoint, which may not be the best
    # design as a user may expect this is only effective for the current
    # session. We should consider using env var for specifying endpoint.
    if endpoint is None:
        endpoint = click.prompt('Enter your SkyPilot API server endpoint')
    # endpoint is guaranteed to be a str here (either passed in or prompted),
    # so the previous redundant `is not None` re-check is dropped.
    # Check endpoint is a valid URL.
    if not endpoint.startswith(('http://', 'https://')):
        raise click.BadParameter('Endpoint must be a valid URL.')

    server_common.check_server_healthy(endpoint)

    # Set the endpoint in the config file, under a file lock to avoid
    # concurrent writers clobbering each other.
    config_path = pathlib.Path(skypilot_config.CONFIG_PATH).expanduser()
    with filelock.FileLock(config_path.with_suffix('.lock')):
        if not skypilot_config.loaded():
            # No config file yet: create it with just the endpoint entry.
            config_path.touch()
            config = {'api_server': {'endpoint': endpoint}}
        else:
            config = skypilot_config.set_nested(('api_server', 'endpoint'),
                                                endpoint)
        common_utils.dump_yaml(str(config_path), config)
    click.secho(f'Logged in to SkyPilot API server at {endpoint}',
                fg='green')
|