skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
File without changes
|
sky/serve/client/sdk.py
ADDED
@@ -0,0 +1,366 @@
|
|
1
|
+
"""SDK for SkyServe."""
|
2
|
+
import json
|
3
|
+
import typing
|
4
|
+
from typing import List, Optional, Union
|
5
|
+
|
6
|
+
import click
|
7
|
+
import requests
|
8
|
+
|
9
|
+
from sky.client import common as client_common
|
10
|
+
from sky.server import common as server_common
|
11
|
+
from sky.server.requests import payloads
|
12
|
+
from sky.usage import usage_lib
|
13
|
+
from sky.utils import dag_utils
|
14
|
+
|
15
|
+
if typing.TYPE_CHECKING:
|
16
|
+
import io
|
17
|
+
|
18
|
+
import sky
|
19
|
+
from sky.serve import serve_utils
|
20
|
+
|
21
|
+
|
22
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def up(
    task: Union['sky.Task', 'sky.Dag'],
    service_name: str,
    # Internal only:
    # pylint: disable=invalid-name
    _need_confirmation: bool = False
) -> server_common.RequestId:
    """Spins up a service.

    Please refer to the sky.cli.serve_up for the document.

    The entrypoint is converted to a DAG, then validated and optimized. Both
    steps run before the optional confirmation prompt and before any mounts
    are uploaded to the API server.

    Args:
        task: sky.Task or sky.Dag to serve up.
        service_name: Name of the service.
        _need_confirmation: (Internal only) Whether to show a confirmation
            prompt before spinning up the service.

    Returns:
        The request ID of the up request.

    Request Returns:
        service_name (str): The name of the service. Same if passed in as an
            argument.
        endpoint (str): The service endpoint.
    """
    # Avoid circular import.
    from sky.client import sdk  # pylint: disable=import-outside-toplevel

    dag = dag_utils.convert_entrypoint_to_dag(task)
    sdk.validate(dag)
    request_id = sdk.optimize(dag)
    sdk.stream_and_get(request_id)
    if _need_confirmation:
        # With abort=True, click raises click.Abort when the user declines,
        # so nothing below runs in that case.
        click.confirm(f'Launching a new service {service_name!r}. Proceed?',
                      default=True,
                      abort=True,
                      show_default=True)

    dag = client_common.upload_mounts_to_api_server(dag)
    dag_str = dag_utils.dump_chain_dag_to_yaml_str(dag)

    body = payloads.ServeUpBody(
        task=dag_str,
        service_name=service_name,
    )
    response = requests.post(
        f'{server_common.get_server_url()}/serve/up',
        # Round-trip through JSON text so the payload is plain JSON types.
        json=json.loads(body.model_dump_json()),
        # (connect timeout, read timeout): 5s to connect, no read timeout.
        timeout=(5, None),
    )
    return server_common.get_request_id(response)
|
75
|
+
|
76
|
+
|
77
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def update(
    task: Union['sky.Task', 'sky.Dag'],
    service_name: str,
    mode: 'serve_utils.UpdateMode',
    # Internal only:
    # pylint: disable=invalid-name
    _need_confirmation: bool = False
) -> server_common.RequestId:
    """Updates an existing service.

    Please refer to the sky.cli.serve_update for the document.

    Args:
        task: The task (or DAG) describing the new version of the service.
        service_name: Name of the service.
        mode: Update mode, including:
            - sky.serve.UpdateMode.ROLLING
            - sky.serve.UpdateMode.BLUE_GREEN
        _need_confirmation: (Internal only) Whether to show a confirmation
            prompt before updating the service.

    Returns:
        The request ID of the update request.

    Request Returns:
        None
    """
    # Avoid circular import.
    from sky.client import sdk  # pylint: disable=import-outside-toplevel

    service_dag = dag_utils.convert_entrypoint_to_dag(task)
    sdk.validate(service_dag)
    # Optimize first and stream the optimizer's output to the caller.
    sdk.stream_and_get(sdk.optimize(service_dag))

    if _need_confirmation:
        question = f'Updating service {service_name!r}. Proceed?'
        click.confirm(question, default=True, abort=True, show_default=True)

    service_dag = client_common.upload_mounts_to_api_server(service_dag)
    body = payloads.ServeUpdateBody(
        task=dag_utils.dump_chain_dag_to_yaml_str(service_dag),
        service_name=service_name,
        mode=mode,
    )

    endpoint_url = f'{server_common.get_server_url()}/serve/update'
    payload = json.loads(body.model_dump_json())
    response = requests.post(endpoint_url, json=payload, timeout=(5, None))
    return server_common.get_request_id(response)
|
133
|
+
|
134
|
+
|
135
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def down(
    service_names: Optional[Union[str, List[str]]],
    all: bool = False,  # pylint: disable=redefined-builtin
    purge: bool = False
) -> server_common.RequestId:
    """Tears down a service.

    Please refer to the sky.cli.serve_down for the docs.

    Args:
        service_names: Name of the service(s).
        all: Whether to terminate all services.
        purge: Whether to terminate services in a failed status. These
            services may potentially lead to resource leaks.

    Returns:
        The request ID of the down request.

    Request Returns:
        None

    Request Raises:
        sky.exceptions.ClusterNotUpError: if the sky serve controller is not
            up.
        ValueError: if the arguments are invalid.
        RuntimeError: if failed to terminate the service.
    """
    request_fields = {
        'service_names': service_names,
        'all': all,
        'purge': purge,
    }
    body = payloads.ServeDownBody(**request_fields)

    endpoint_url = f'{server_common.get_server_url()}/serve/down'
    payload = json.loads(body.model_dump_json())
    response = requests.post(endpoint_url, json=payload, timeout=(5, None))
    return server_common.get_request_id(response)
|
174
|
+
|
175
|
+
|
176
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def terminate_replica(service_name: str, replica_id: int,
                      purge: bool) -> server_common.RequestId:
    """Tears down a specific replica for the given service.

    Args:
        service_name: Name of the service.
        replica_id: ID of replica to terminate.
        purge: Whether to terminate replicas in a failed status. These
            replicas may lead to resource leaks, so we require the user to
            explicitly specify this flag to make sure they are aware of this
            potential resource leak.

    Returns:
        The request ID of the terminate replica request.

    Request Raises:
        sky.exceptions.ClusterNotUpError: if the sky serve controller is not
            up.
        RuntimeError: if failed to terminate the replica.
    """
    body = payloads.ServeTerminateReplicaBody(service_name=service_name,
                                              replica_id=replica_id,
                                              purge=purge)

    endpoint_url = f'{server_common.get_server_url()}/serve/terminate-replica'
    payload = json.loads(body.model_dump_json())
    response = requests.post(endpoint_url, json=payload, timeout=(5, None))
    return server_common.get_request_id(response)
|
208
|
+
|
209
|
+
|
210
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def status(
    service_names: Optional[Union[str, List[str]]]
) -> server_common.RequestId:
    """Gets service statuses.

    If service_names is given, return those services. Otherwise, return all
    services.

    Each returned value has the following fields:

    .. code-block:: python

        {
            'name': (str) service name,
            'active_versions': (List[int]) a list of versions that are active,
            'controller_job_id': (int) the job id of the controller,
            'uptime': (int) uptime in seconds,
            'status': (sky.ServiceStatus) service status,
            'controller_port': (Optional[int]) controller port,
            'load_balancer_port': (Optional[int]) load balancer port,
            'endpoint': (Optional[str]) endpoint of the service,
            'policy': (Optional[str]) autoscaling policy description,
            'requested_resources_str': (str) str representation of
              requested resources,
            'load_balancing_policy': (str) load balancing policy name,
            'replica_info': (List[Dict[str, Any]]) replica information,
        }

    Each entry in replica_info has the following fields:

    .. code-block:: python

        {
            'replica_id': (int) replica id,
            'name': (str) replica name,
            'status': (sky.serve.ReplicaStatus) replica status,
            'version': (int) replica version,
            'launched_at': (int) timestamp of launched,
            'handle': (ResourceHandle) handle of the replica cluster,
            'endpoint': (str) endpoint of the replica,
        }

    For possible service statuses and replica statuses, please refer to
    sky.cli.serve_status.

    Args:
        service_names: a single or a list of service names to query. If None,
            query all services.

    Returns:
        The request ID of the status request.

    Request Returns:
        service_records (List[Dict[str, Any]]): A list of dicts, with each
            dict containing the information of a service. If a service is not
            found, it will be omitted from the returned list.

    Request Raises:
        RuntimeError: if failed to get the service status.
        exceptions.ClusterNotUpError: if the sky serve controller is not up.
    """
    body = payloads.ServeStatusBody(service_names=service_names)

    endpoint_url = f'{server_common.get_server_url()}/serve/status'
    payload = json.loads(body.model_dump_json())
    response = requests.post(endpoint_url, json=payload, timeout=(5, None))
    return server_common.get_request_id(response)
|
280
|
+
|
281
|
+
|
282
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
def tail_logs(service_name: str,
              target: Union[str, 'serve_utils.ServiceComponent'],
              replica_id: Optional[int] = None,
              follow: bool = True,
              output_stream: Optional['io.TextIOBase'] = None) -> None:
    """Tails logs for a service.

    Usage:

    .. code-block:: python

        sky.serve.tail_logs(
            service_name,
            target=<component>,
            follow=False,  # Optionally, default to True
            # replica_id=3,  # Must be specified when target is REPLICA.
        )


    ``target`` is an enum of ``sky.serve.ServiceComponent``, which can be one
    of:

    - ``sky.serve.ServiceComponent.CONTROLLER``

    - ``sky.serve.ServiceComponent.LOAD_BALANCER``

    - ``sky.serve.ServiceComponent.REPLICA``

    Passing target as a lower-case string is also supported, e.g.
    ``target='controller'``.
    To use ``sky.serve.ServiceComponent.REPLICA``, you must specify
    ``replica_id``.

    To tail controller logs:

    .. code-block:: python

        # follow default to True
        sky.serve.tail_logs(
            service_name, target=sky.serve.ServiceComponent.CONTROLLER
        )

    To print replica 3 logs:

    .. code-block:: python

        # Passing target as a lower-case string is also supported.
        sky.serve.tail_logs(
            service_name, target='replica',
            follow=False, replica_id=3
        )

    Args:
        service_name: Name of the service.
        target: The component to tail logs.
        replica_id: The ID of the replica to tail logs.
        follow: Whether to follow the logs.
        output_stream: The stream to write the logs to. If None, print to the
            console.

    Returns:
        None. Unlike the other functions in this module, no request ID is
        returned: the response is handed to ``sdk.stream_response`` together
        with ``output_stream`` instead.

    Request Raises:
        sky.exceptions.ClusterNotUpError: the sky serve controller is not up.
        ValueError: arguments not valid, or failed to tail the logs.
    """
    # Avoid circular import.
    from sky.client import sdk  # pylint: disable=import-outside-toplevel

    body = payloads.ServeLogsBody(
        service_name=service_name,
        target=target,
        replica_id=replica_id,
        follow=follow,
    )
    response = requests.post(
        f'{server_common.get_server_url()}/serve/logs',
        json=json.loads(body.model_dump_json()),
        # (connect timeout, read timeout): 5s to connect, no read timeout.
        timeout=(5, None),
        # Do not download the whole body up front; it is consumed
        # incrementally by sdk.stream_response below.
        stream=True,
    )
    request_id = server_common.get_request_id(response)
    sdk.stream_response(request_id, response, output_stream)
|
sky/serve/constants.py
CHANGED
@@ -12,6 +12,9 @@ PORT_SELECTION_FILE_LOCK_PATH = f'{SKYSERVE_METADATA_DIR}/port_selection.lock'
|
|
12
12
|
# Signal file path for controller to handle signals.
|
13
13
|
SIGNAL_FILE_PATH = '/tmp/sky_serve_controller_signal_{}'
|
14
14
|
|
15
|
+
# Time to wait in seconds for controller to setup, this involves the time to run
|
16
|
+
# cloud dependencies installation.
|
17
|
+
CONTROLLER_SETUP_TIMEOUT_SECONDS = 300
|
15
18
|
# Time to wait in seconds for service to register on the controller.
|
16
19
|
SERVICE_REGISTER_TIMEOUT_SECONDS = 60
|
17
20
|
|
@@ -39,8 +42,7 @@ ENDPOINT_PROBE_INTERVAL_SECONDS = 10
|
|
39
42
|
# The default timeout in seconds for a readiness probe request. We set the
|
40
43
|
# timeout to 15s since using actual generation in LLM services as readiness
|
41
44
|
# probe is very time-consuming (33B, 70B, ...).
|
42
|
-
|
43
|
-
READINESS_PROBE_TIMEOUT_SECONDS = 15
|
45
|
+
DEFAULT_READINESS_PROBE_TIMEOUT_SECONDS = 15
|
44
46
|
|
45
47
|
# Autoscaler window size in seconds for query per second. We calculate qps by
|
46
48
|
# divide the number of queries in last window size by this window size.
|
@@ -93,4 +95,11 @@ REPLICA_ID_ENV_VAR = 'SKYPILOT_SERVE_REPLICA_ID'
|
|
93
95
|
# change for the serve_utils.ServeCodeGen, we need to bump this version, so that
|
94
96
|
# the user can be notified to update their SkyPilot serve version on the remote
|
95
97
|
# cluster.
|
96
|
-
|
98
|
+
# Changelog:
|
99
|
+
# v1.0 - Introduce rolling update.
|
100
|
+
# v2.0 - Added template-replica feature.
|
101
|
+
SERVE_VERSION = 2
|
102
|
+
|
103
|
+
TERMINATE_REPLICA_VERSION_MISMATCH_ERROR = (
|
104
|
+
'The version of service is outdated and does not support manually '
|
105
|
+
'terminating replicas. Please terminate the service and spin up again.')
|
sky/serve/controller.py
CHANGED
@@ -2,14 +2,16 @@
|
|
2
2
|
|
3
3
|
Responsible for autoscaling and replica management.
|
4
4
|
"""
|
5
|
+
import contextlib
|
5
6
|
import logging
|
6
|
-
import os
|
7
7
|
import threading
|
8
8
|
import time
|
9
9
|
import traceback
|
10
10
|
from typing import Any, Dict, List
|
11
11
|
|
12
|
+
import colorama
|
12
13
|
import fastapi
|
14
|
+
from fastapi import responses
|
13
15
|
import uvicorn
|
14
16
|
|
15
17
|
from sky import serve
|
@@ -50,7 +52,14 @@ class SkyServeController:
|
|
50
52
|
autoscalers.Autoscaler.from_spec(service_name, service_spec))
|
51
53
|
self._host = host
|
52
54
|
self._port = port
|
53
|
-
self._app = fastapi.FastAPI()
|
55
|
+
self._app = fastapi.FastAPI(lifespan=self.lifespan)
|
56
|
+
|
57
|
+
@contextlib.asynccontextmanager
|
58
|
+
async def lifespan(self, _: fastapi.FastAPI):
|
59
|
+
uvicorn_access_logger = logging.getLogger('uvicorn.access')
|
60
|
+
for handler in uvicorn_access_logger.handlers:
|
61
|
+
handler.setFormatter(sky_logging.FORMATTER)
|
62
|
+
yield
|
54
63
|
|
55
64
|
def _run_autoscaler(self):
|
56
65
|
logger.info('Starting autoscaler.')
|
@@ -58,9 +67,16 @@ class SkyServeController:
|
|
58
67
|
try:
|
59
68
|
replica_infos = serve_state.get_replica_infos(
|
60
69
|
self._service_name)
|
70
|
+
# Use the active versions set by replica manager to make
|
71
|
+
# sure we only scale down the outdated replicas that are
|
72
|
+
# not used by the load balancer.
|
73
|
+
record = serve_state.get_service_from_name(self._service_name)
|
74
|
+
assert record is not None, ('No service record found for '
|
75
|
+
f'{self._service_name}')
|
76
|
+
active_versions = record['active_versions']
|
61
77
|
logger.info(f'All replica info: {replica_infos}')
|
62
|
-
scaling_options = self._autoscaler.
|
63
|
-
replica_infos)
|
78
|
+
scaling_options = self._autoscaler.generate_scaling_decisions(
|
79
|
+
replica_infos, active_versions)
|
64
80
|
for scaling_option in scaling_options:
|
65
81
|
logger.info(f'Scaling option received: {scaling_option}')
|
66
82
|
if (scaling_option.operator ==
|
@@ -68,15 +84,10 @@ class SkyServeController:
|
|
68
84
|
assert (scaling_option.target is None or isinstance(
|
69
85
|
scaling_option.target, dict)), scaling_option
|
70
86
|
self._replica_manager.scale_up(scaling_option.target)
|
71
|
-
|
72
|
-
autoscalers.AutoscalerDecisionOperator.SCALE_DOWN):
|
87
|
+
else:
|
73
88
|
assert isinstance(scaling_option.target,
|
74
89
|
int), scaling_option
|
75
90
|
self._replica_manager.scale_down(scaling_option.target)
|
76
|
-
else:
|
77
|
-
with ux_utils.enable_traceback():
|
78
|
-
logger.error('Error in scaling_option.operator: '
|
79
|
-
f'{scaling_option.operator}')
|
80
91
|
except Exception as e: # pylint: disable=broad-except
|
81
92
|
# No matter what error happens, we should keep the
|
82
93
|
# monitor running.
|
@@ -89,7 +100,8 @@ class SkyServeController:
|
|
89
100
|
def run(self) -> None:
|
90
101
|
|
91
102
|
@self._app.post('/controller/load_balancer_sync')
|
92
|
-
async def load_balancer_sync(
|
103
|
+
async def load_balancer_sync(
|
104
|
+
request: fastapi.Request) -> fastapi.Response:
|
93
105
|
request_data = await request.json()
|
94
106
|
# TODO(MaoZiming): Check aggregator type.
|
95
107
|
request_aggregator: Dict[str, Any] = request_data.get(
|
@@ -97,18 +109,21 @@ class SkyServeController:
|
|
97
109
|
timestamps: List[int] = request_aggregator.get('timestamps', [])
|
98
110
|
logger.info(f'Received {len(timestamps)} inflight requests.')
|
99
111
|
self._autoscaler.collect_request_information(request_aggregator)
|
100
|
-
return {
|
112
|
+
return responses.JSONResponse(content={
|
101
113
|
'ready_replica_urls':
|
102
114
|
self._replica_manager.get_active_replica_urls()
|
103
|
-
}
|
115
|
+
},
|
116
|
+
status_code=200)
|
104
117
|
|
105
118
|
@self._app.post('/controller/update_service')
|
106
|
-
async def update_service(request: fastapi.Request):
|
119
|
+
async def update_service(request: fastapi.Request) -> fastapi.Response:
|
107
120
|
request_data = await request.json()
|
108
121
|
try:
|
109
122
|
version = request_data.get('version', None)
|
110
123
|
if version is None:
|
111
|
-
return
|
124
|
+
return responses.JSONResponse(
|
125
|
+
content={'message': 'Error: version is not specified.'},
|
126
|
+
status_code=400)
|
112
127
|
update_mode_str = request_data.get(
|
113
128
|
'mode', serve_utils.DEFAULT_UPDATE_MODE.value)
|
114
129
|
update_mode = serve_utils.UpdateMode(update_mode_str)
|
@@ -137,40 +152,95 @@ class SkyServeController:
|
|
137
152
|
self._autoscaler.update_version(version,
|
138
153
|
service,
|
139
154
|
update_mode=update_mode)
|
140
|
-
return {'message': 'Success'}
|
155
|
+
return responses.JSONResponse(content={'message': 'Success'},
|
156
|
+
status_code=200)
|
141
157
|
except Exception as e: # pylint: disable=broad-except
|
142
158
|
logger.error(f'Error in update_service: '
|
143
159
|
f'{common_utils.format_exception(e)}')
|
144
|
-
return {'message': 'Error'}
|
160
|
+
return responses.JSONResponse(content={'message': 'Error'},
|
161
|
+
status_code=500)
|
145
162
|
|
146
|
-
@self._app.
|
147
|
-
def
|
148
|
-
|
149
|
-
|
150
|
-
|
163
|
+
@self._app.post('/controller/terminate_replica')
|
164
|
+
async def terminate_replica(
|
165
|
+
request: fastapi.Request) -> fastapi.Response:
|
166
|
+
request_data = await request.json()
|
167
|
+
replica_id = request_data['replica_id']
|
168
|
+
assert isinstance(replica_id,
|
169
|
+
int), 'Error: replica ID must be an integer.'
|
170
|
+
purge = request_data['purge']
|
171
|
+
assert isinstance(purge, bool), 'Error: purge must be a boolean.'
|
172
|
+
replica_info = serve_state.get_replica_info_from_id(
|
173
|
+
self._service_name, replica_id)
|
174
|
+
assert replica_info is not None, (f'Error: replica '
|
175
|
+
f'{replica_id} does not exist.')
|
176
|
+
replica_status = replica_info.status
|
177
|
+
|
178
|
+
if replica_status == serve_state.ReplicaStatus.SHUTTING_DOWN:
|
179
|
+
return responses.JSONResponse(
|
180
|
+
status_code=409,
|
181
|
+
content={
|
182
|
+
'message':
|
183
|
+
f'Replica {replica_id} of service '
|
184
|
+
f'{self._service_name!r} is already in the process '
|
185
|
+
f'of terminating. Skip terminating now.'
|
186
|
+
})
|
187
|
+
|
188
|
+
if (replica_status in serve_state.ReplicaStatus.failed_statuses()
|
189
|
+
and not purge):
|
190
|
+
return responses.JSONResponse(
|
191
|
+
status_code=409,
|
192
|
+
content={
|
193
|
+
'message': f'{colorama.Fore.YELLOW}Replica '
|
194
|
+
f'{replica_id} of service '
|
195
|
+
f'{self._service_name!r} is in failed '
|
196
|
+
f'status ({replica_info.status}). '
|
197
|
+
f'Skipping its termination as it could '
|
198
|
+
f'lead to a resource leak. '
|
199
|
+
f'(Use `sky serve down '
|
200
|
+
f'{self._service_name!r} --replica-id '
|
201
|
+
f'{replica_id} --purge` to '
|
202
|
+
'forcefully terminate the replica.)'
|
203
|
+
f'{colorama.Style.RESET_ALL}'
|
204
|
+
})
|
205
|
+
|
206
|
+
self._replica_manager.scale_down(replica_id, purge=purge)
|
207
|
+
|
208
|
+
action = 'terminated' if not purge else 'purged'
|
209
|
+
message = (f'{colorama.Fore.GREEN}Replica {replica_id} of service '
|
210
|
+
f'{self._service_name!r} is scheduled to be '
|
211
|
+
f'{action}.{colorama.Style.RESET_ALL}\n'
|
212
|
+
f'Please use {ux_utils.BOLD}sky serve status '
|
213
|
+
f'{self._service_name}{ux_utils.RESET_BOLD} '
|
214
|
+
f'to check the latest status.')
|
215
|
+
return responses.JSONResponse(status_code=200,
|
216
|
+
content={'message': message})
|
217
|
+
|
218
|
+
@self._app.exception_handler(Exception)
|
219
|
+
async def validation_exception_handler(
|
220
|
+
request: fastapi.Request, exc: Exception) -> fastapi.Response:
|
221
|
+
with ux_utils.enable_traceback():
|
222
|
+
logger.error(f'Error in controller: {exc!r}')
|
223
|
+
return responses.JSONResponse(
|
224
|
+
status_code=500,
|
225
|
+
content={
|
226
|
+
'message':
|
227
|
+
(f'Failed method {request.method} at URL {request.url}.'
|
228
|
+
f' Exception message is {exc!r}.')
|
229
|
+
},
|
230
|
+
)
|
151
231
|
|
152
232
|
threading.Thread(target=self._run_autoscaler).start()
|
153
233
|
|
154
234
|
logger.info('SkyServe Controller started on '
|
155
235
|
f'http://{self._host}:{self._port}')
|
156
236
|
|
157
|
-
uvicorn.run(self._app, host=
|
237
|
+
uvicorn.run(self._app, host=self._host, port=self._port)
|
158
238
|
|
159
239
|
|
160
240
|
# TODO(tian): Probably we should support service that will stop the VM in
|
161
241
|
# specific time period.
|
162
242
|
def run_controller(service_name: str, service_spec: serve.SkyServiceSpec,
|
163
|
-
task_yaml: str, controller_port: int):
|
164
|
-
|
165
|
-
|
166
|
-
# high availability load balancers) to communicate with the controller.
|
167
|
-
def _get_host():
|
168
|
-
if 'KUBERNETES_SERVICE_HOST' in os.environ:
|
169
|
-
return '0.0.0.0'
|
170
|
-
else:
|
171
|
-
return 'localhost'
|
172
|
-
|
173
|
-
host = _get_host()
|
174
|
-
controller = SkyServeController(service_name, service_spec, task_yaml, host,
|
175
|
-
controller_port)
|
243
|
+
task_yaml: str, controller_host: str, controller_port: int):
|
244
|
+
controller = SkyServeController(service_name, service_spec, task_yaml,
|
245
|
+
controller_host, controller_port)
|
176
246
|
controller.run()
|