skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,112 @@
|
|
1
|
+
"""Rest APIs for SkyServe."""
|
2
|
+
|
3
|
+
import fastapi
|
4
|
+
|
5
|
+
from sky import sky_logging
|
6
|
+
from sky.serve.server import core
|
7
|
+
from sky.server import stream_utils
|
8
|
+
from sky.server.requests import executor
|
9
|
+
from sky.server.requests import payloads
|
10
|
+
from sky.server.requests import requests as api_requests
|
11
|
+
from sky.utils import common
|
12
|
+
|
13
|
+
logger = sky_logging.init_logger(__name__)
|
14
|
+
router = fastapi.APIRouter()
|
15
|
+
|
16
|
+
|
17
|
+
@router.post('/up')
|
18
|
+
async def up(
|
19
|
+
request: fastapi.Request,
|
20
|
+
up_body: payloads.ServeUpBody,
|
21
|
+
) -> None:
|
22
|
+
executor.schedule_request(
|
23
|
+
request_id=request.state.request_id,
|
24
|
+
request_name='serve.up',
|
25
|
+
request_body=up_body,
|
26
|
+
func=core.up,
|
27
|
+
schedule_type=api_requests.ScheduleType.LONG,
|
28
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
29
|
+
)
|
30
|
+
|
31
|
+
|
32
|
+
@router.post('/update')
|
33
|
+
async def update(
|
34
|
+
request: fastapi.Request,
|
35
|
+
update_body: payloads.ServeUpdateBody,
|
36
|
+
) -> None:
|
37
|
+
executor.schedule_request(
|
38
|
+
request_id=request.state.request_id,
|
39
|
+
request_name='serve.update',
|
40
|
+
request_body=update_body,
|
41
|
+
func=core.update,
|
42
|
+
schedule_type=api_requests.ScheduleType.SHORT,
|
43
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
44
|
+
)
|
45
|
+
|
46
|
+
|
47
|
+
@router.post('/down')
|
48
|
+
async def down(
|
49
|
+
request: fastapi.Request,
|
50
|
+
down_body: payloads.ServeDownBody,
|
51
|
+
) -> None:
|
52
|
+
executor.schedule_request(
|
53
|
+
request_id=request.state.request_id,
|
54
|
+
request_name='serve.down',
|
55
|
+
request_body=down_body,
|
56
|
+
func=core.down,
|
57
|
+
schedule_type=api_requests.ScheduleType.SHORT,
|
58
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
59
|
+
)
|
60
|
+
|
61
|
+
|
62
|
+
@router.post('/terminate-replica')
|
63
|
+
async def terminate_replica(
|
64
|
+
request: fastapi.Request,
|
65
|
+
terminate_replica_body: payloads.ServeTerminateReplicaBody,
|
66
|
+
) -> None:
|
67
|
+
executor.schedule_request(
|
68
|
+
request_id=request.state.request_id,
|
69
|
+
request_name='serve.terminate_replica',
|
70
|
+
request_body=terminate_replica_body,
|
71
|
+
func=core.terminate_replica,
|
72
|
+
schedule_type=api_requests.ScheduleType.SHORT,
|
73
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
74
|
+
)
|
75
|
+
|
76
|
+
|
77
|
+
@router.post('/status')
|
78
|
+
async def status(
|
79
|
+
request: fastapi.Request,
|
80
|
+
status_body: payloads.ServeStatusBody,
|
81
|
+
) -> None:
|
82
|
+
executor.schedule_request(
|
83
|
+
request_id=request.state.request_id,
|
84
|
+
request_name='serve.status',
|
85
|
+
request_body=status_body,
|
86
|
+
func=core.status,
|
87
|
+
schedule_type=api_requests.ScheduleType.SHORT,
|
88
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
89
|
+
)
|
90
|
+
|
91
|
+
|
92
|
+
@router.post('/logs')
|
93
|
+
async def tail_logs(
|
94
|
+
request: fastapi.Request, log_body: payloads.ServeLogsBody,
|
95
|
+
background_tasks: fastapi.BackgroundTasks
|
96
|
+
) -> fastapi.responses.StreamingResponse:
|
97
|
+
executor.schedule_request(
|
98
|
+
request_id=request.state.request_id,
|
99
|
+
request_name='serve.logs',
|
100
|
+
request_body=log_body,
|
101
|
+
func=core.tail_logs,
|
102
|
+
schedule_type=api_requests.ScheduleType.SHORT,
|
103
|
+
request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
|
104
|
+
)
|
105
|
+
|
106
|
+
request_task = api_requests.get_request(request.state.request_id)
|
107
|
+
|
108
|
+
return stream_utils.stream_response(
|
109
|
+
request_id=request_task.request_id,
|
110
|
+
logs_path=request_task.log_path,
|
111
|
+
background_tasks=background_tasks,
|
112
|
+
)
|
sky/serve/service.py
CHANGED
@@ -9,7 +9,7 @@ import pathlib
|
|
9
9
|
import shutil
|
10
10
|
import time
|
11
11
|
import traceback
|
12
|
-
from typing import Dict
|
12
|
+
from typing import Dict
|
13
13
|
|
14
14
|
import filelock
|
15
15
|
|
@@ -73,6 +73,12 @@ def cleanup_storage(task_yaml: str) -> bool:
|
|
73
73
|
try:
|
74
74
|
task = task_lib.Task.from_yaml(task_yaml)
|
75
75
|
backend = cloud_vm_ray_backend.CloudVmRayBackend()
|
76
|
+
# Need to re-construct storage object in the controller process
|
77
|
+
# because when SkyPilot API server machine sends the yaml config to the
|
78
|
+
# controller machine, only storage metadata is sent, not the storage
|
79
|
+
# object itself.
|
80
|
+
for storage in task.storage_mounts.values():
|
81
|
+
storage.construct()
|
76
82
|
backend.teardown_ephemeral_storage(task)
|
77
83
|
except Exception as e: # pylint: disable=broad-except
|
78
84
|
logger.error('Failed to clean up storage: '
|
@@ -116,15 +122,17 @@ def _cleanup(service_name: str) -> bool:
|
|
116
122
|
logger.error(f'Replica {info.replica_id} failed to terminate.')
|
117
123
|
versions = serve_state.get_service_versions(service_name)
|
118
124
|
serve_state.remove_service_versions(service_name)
|
119
|
-
|
120
|
-
|
125
|
+
|
126
|
+
def cleanup_version_storage(version: int) -> bool:
|
121
127
|
task_yaml: str = serve_utils.generate_task_yaml_file_name(
|
122
128
|
service_name, version)
|
123
129
|
logger.info(f'Cleaning up storage for version {version}, '
|
124
130
|
f'task_yaml: {task_yaml}')
|
125
|
-
|
126
|
-
|
131
|
+
return cleanup_storage(task_yaml)
|
132
|
+
|
133
|
+
if not all(map(cleanup_version_storage, versions)):
|
127
134
|
failed = True
|
135
|
+
|
128
136
|
return failed
|
129
137
|
|
130
138
|
|
@@ -148,7 +156,9 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
148
156
|
controller_job_id=job_id,
|
149
157
|
policy=service_spec.autoscaling_policy_str(),
|
150
158
|
requested_resources_str=backend_utils.get_task_resources_str(task),
|
151
|
-
|
159
|
+
load_balancing_policy=service_spec.load_balancing_policy,
|
160
|
+
status=serve_state.ServiceStatus.CONTROLLER_INIT,
|
161
|
+
tls_encrypted=service_spec.tls_credential is not None)
|
152
162
|
# Directly throw an error here. See sky/serve/api.py::up
|
153
163
|
# for more details.
|
154
164
|
if not success:
|
@@ -156,6 +166,10 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
156
166
|
with ux_utils.print_exception_no_traceback():
|
157
167
|
raise ValueError(f'Service {service_name} already exists.')
|
158
168
|
|
169
|
+
# Add initial version information to the service state.
|
170
|
+
serve_state.add_or_update_version(service_name, constants.INITIAL_VERSION,
|
171
|
+
service_spec)
|
172
|
+
|
159
173
|
# Create the service working directory.
|
160
174
|
service_dir = os.path.expanduser(
|
161
175
|
serve_utils.generate_remote_service_dir_name(service_name))
|
@@ -182,19 +196,39 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
182
196
|
os.path.expanduser(constants.PORT_SELECTION_FILE_LOCK_PATH)):
|
183
197
|
controller_port = common_utils.find_free_port(
|
184
198
|
constants.CONTROLLER_PORT_START)
|
199
|
+
|
200
|
+
# We expose the controller to the public network when running
|
201
|
+
# inside a kubernetes cluster to allow external load balancers
|
202
|
+
# (example, for high availability load balancers) to communicate
|
203
|
+
# with the controller.
|
204
|
+
def _get_host():
|
205
|
+
if 'KUBERNETES_SERVICE_HOST' in os.environ:
|
206
|
+
return '0.0.0.0'
|
207
|
+
# Not using localhost to avoid using ipv6 address and causing
|
208
|
+
# the following error:
|
209
|
+
# ERROR: [Errno 99] error while attempting to bind on address
|
210
|
+
# ('::1', 20001, 0, 0): cannot assign requested address
|
211
|
+
return '127.0.0.1'
|
212
|
+
|
213
|
+
controller_host = _get_host()
|
214
|
+
|
185
215
|
# Start the controller.
|
186
216
|
controller_process = multiprocessing.Process(
|
187
217
|
target=controller.run_controller,
|
188
|
-
args=(service_name, service_spec, task_yaml,
|
218
|
+
args=(service_name, service_spec, task_yaml, controller_host,
|
219
|
+
controller_port))
|
189
220
|
controller_process.start()
|
190
221
|
serve_state.set_service_controller_port(service_name,
|
191
222
|
controller_port)
|
192
223
|
|
193
|
-
|
194
|
-
|
224
|
+
controller_addr = f'http://{controller_host}:{controller_port}'
|
225
|
+
|
195
226
|
load_balancer_port = common_utils.find_free_port(
|
196
227
|
constants.LOAD_BALANCER_PORT_START)
|
197
228
|
|
229
|
+
# Extract the load balancing policy from the service spec
|
230
|
+
policy_name = service_spec.load_balancing_policy
|
231
|
+
|
198
232
|
# Start the load balancer.
|
199
233
|
# TODO(tian): Probably we could enable multiple ports specified in
|
200
234
|
# service spec and we could start multiple load balancers.
|
@@ -203,7 +237,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
203
237
|
target=ux_utils.RedirectOutputForProcess(
|
204
238
|
load_balancer.run_load_balancer,
|
205
239
|
load_balancer_log_file).run,
|
206
|
-
args=(controller_addr, load_balancer_port
|
240
|
+
args=(controller_addr, load_balancer_port, policy_name,
|
241
|
+
service_spec.tls_credential))
|
207
242
|
load_balancer_process.start()
|
208
243
|
serve_state.set_service_load_balancer_port(service_name,
|
209
244
|
load_balancer_port)
|
@@ -215,15 +250,15 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
215
250
|
serve_state.set_service_status_and_active_versions(
|
216
251
|
service_name, serve_state.ServiceStatus.SHUTTING_DOWN)
|
217
252
|
finally:
|
218
|
-
process_to_kill: List[multiprocessing.Process] = []
|
219
|
-
if load_balancer_process is not None:
|
220
|
-
process_to_kill.append(load_balancer_process)
|
221
|
-
if controller_process is not None:
|
222
|
-
process_to_kill.append(controller_process)
|
223
253
|
# Kill load balancer process first since it will raise errors if failed
|
224
254
|
# to connect to the controller. Then the controller process.
|
255
|
+
process_to_kill = [
|
256
|
+
proc for proc in [load_balancer_process, controller_process]
|
257
|
+
if proc is not None
|
258
|
+
]
|
225
259
|
subprocess_utils.kill_children_processes(
|
226
|
-
[process.pid for process in process_to_kill],
|
260
|
+
parent_pids=[process.pid for process in process_to_kill],
|
261
|
+
force=True)
|
227
262
|
for process in process_to_kill:
|
228
263
|
process.join()
|
229
264
|
failed = _cleanup(service_name)
|
@@ -234,6 +269,7 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
|
|
234
269
|
else:
|
235
270
|
shutil.rmtree(service_dir)
|
236
271
|
serve_state.remove_service(service_name)
|
272
|
+
serve_state.delete_all_versions(service_name)
|
237
273
|
logger.info(f'Service {service_name} terminated successfully.')
|
238
274
|
|
239
275
|
|
sky/serve/service_spec.py
CHANGED
@@ -2,11 +2,14 @@
|
|
2
2
|
import json
|
3
3
|
import os
|
4
4
|
import textwrap
|
5
|
-
from typing import Any, Dict, Optional
|
5
|
+
from typing import Any, Dict, List, Optional
|
6
6
|
|
7
7
|
import yaml
|
8
8
|
|
9
|
+
from sky import serve
|
9
10
|
from sky.serve import constants
|
11
|
+
from sky.serve import load_balancing_policies as lb_policies
|
12
|
+
from sky.serve import serve_utils
|
10
13
|
from sky.utils import common_utils
|
11
14
|
from sky.utils import schemas
|
12
15
|
from sky.utils import ux_utils
|
@@ -19,22 +22,19 @@ class SkyServiceSpec:
|
|
19
22
|
self,
|
20
23
|
readiness_path: str,
|
21
24
|
initial_delay_seconds: int,
|
25
|
+
readiness_timeout_seconds: int,
|
22
26
|
min_replicas: int,
|
23
27
|
max_replicas: Optional[int] = None,
|
28
|
+
ports: Optional[str] = None,
|
24
29
|
target_qps_per_replica: Optional[float] = None,
|
25
30
|
post_data: Optional[Dict[str, Any]] = None,
|
31
|
+
tls_credential: Optional[serve_utils.TLSCredential] = None,
|
26
32
|
readiness_headers: Optional[Dict[str, str]] = None,
|
27
33
|
dynamic_ondemand_fallback: Optional[bool] = None,
|
28
34
|
base_ondemand_fallback_replicas: Optional[int] = None,
|
29
35
|
upscale_delay_seconds: Optional[int] = None,
|
30
36
|
downscale_delay_seconds: Optional[int] = None,
|
31
|
-
|
32
|
-
# TODO(ziming): remove this after 2 minor release, i.e. 0.6.0.
|
33
|
-
# Deprecated: Always be True
|
34
|
-
auto_restart: Optional[bool] = None,
|
35
|
-
# Deprecated: replaced by the target_qps_per_replica.
|
36
|
-
qps_upper_threshold: Optional[float] = None,
|
37
|
-
qps_lower_threshold: Optional[float] = None,
|
37
|
+
load_balancing_policy: Optional[str] = None,
|
38
38
|
) -> None:
|
39
39
|
if max_replicas is not None and max_replicas < min_replicas:
|
40
40
|
with ux_utils.print_exception_no_traceback():
|
@@ -61,27 +61,23 @@ class SkyServiceSpec:
|
|
61
61
|
raise ValueError('readiness_path must start with a slash (/). '
|
62
62
|
f'Got: {readiness_path}')
|
63
63
|
|
64
|
-
#
|
65
|
-
|
66
|
-
|
64
|
+
# Add the check for unknown load balancing policies
|
65
|
+
if (load_balancing_policy is not None and
|
66
|
+
load_balancing_policy not in serve.LB_POLICIES):
|
67
67
|
with ux_utils.print_exception_no_traceback():
|
68
68
|
raise ValueError(
|
69
|
-
'
|
70
|
-
'
|
71
|
-
'Please use target_qps_per_replica instead.')
|
72
|
-
if auto_restart is not None:
|
73
|
-
with ux_utils.print_exception_no_traceback():
|
74
|
-
raise ValueError(
|
75
|
-
'Field `auto_restart` under `replica_policy` is deprecated.'
|
76
|
-
'Currently, SkyServe will cleanup failed replicas'
|
77
|
-
'and auto restart it to keep the service running.')
|
78
|
-
|
69
|
+
f'Unknown load balancing policy: {load_balancing_policy}. '
|
70
|
+
f'Available policies: {list(serve.LB_POLICIES.keys())}')
|
79
71
|
self._readiness_path: str = readiness_path
|
80
72
|
self._initial_delay_seconds: int = initial_delay_seconds
|
73
|
+
self._readiness_timeout_seconds: int = readiness_timeout_seconds
|
81
74
|
self._min_replicas: int = min_replicas
|
82
75
|
self._max_replicas: Optional[int] = max_replicas
|
76
|
+
self._ports: Optional[str] = ports
|
83
77
|
self._target_qps_per_replica: Optional[float] = target_qps_per_replica
|
84
78
|
self._post_data: Optional[Dict[str, Any]] = post_data
|
79
|
+
self._tls_credential: Optional[serve_utils.TLSCredential] = (
|
80
|
+
tls_credential)
|
85
81
|
self._readiness_headers: Optional[Dict[str, str]] = readiness_headers
|
86
82
|
self._dynamic_ondemand_fallback: Optional[
|
87
83
|
bool] = dynamic_ondemand_fallback
|
@@ -89,6 +85,7 @@ class SkyServiceSpec:
|
|
89
85
|
int] = base_ondemand_fallback_replicas
|
90
86
|
self._upscale_delay_seconds: Optional[int] = upscale_delay_seconds
|
91
87
|
self._downscale_delay_seconds: Optional[int] = downscale_delay_seconds
|
88
|
+
self._load_balancing_policy: Optional[str] = load_balancing_policy
|
92
89
|
|
93
90
|
self._use_ondemand_fallback: bool = (
|
94
91
|
self.dynamic_ondemand_fallback is not None and
|
@@ -113,16 +110,23 @@ class SkyServiceSpec:
|
|
113
110
|
service_config['readiness_path'] = readiness_section
|
114
111
|
initial_delay_seconds = None
|
115
112
|
post_data = None
|
113
|
+
readiness_timeout_seconds = None
|
116
114
|
readiness_headers = None
|
117
115
|
else:
|
118
116
|
service_config['readiness_path'] = readiness_section['path']
|
119
117
|
initial_delay_seconds = readiness_section.get(
|
120
118
|
'initial_delay_seconds', None)
|
121
119
|
post_data = readiness_section.get('post_data', None)
|
120
|
+
readiness_timeout_seconds = readiness_section.get(
|
121
|
+
'timeout_seconds', None)
|
122
122
|
readiness_headers = readiness_section.get('headers', None)
|
123
123
|
if initial_delay_seconds is None:
|
124
124
|
initial_delay_seconds = constants.DEFAULT_INITIAL_DELAY_SECONDS
|
125
125
|
service_config['initial_delay_seconds'] = initial_delay_seconds
|
126
|
+
if readiness_timeout_seconds is None:
|
127
|
+
readiness_timeout_seconds = (
|
128
|
+
constants.DEFAULT_READINESS_PROBE_TIMEOUT_SECONDS)
|
129
|
+
service_config['readiness_timeout_seconds'] = readiness_timeout_seconds
|
126
130
|
if isinstance(post_data, str):
|
127
131
|
try:
|
128
132
|
post_data = json.loads(post_data)
|
@@ -135,6 +139,14 @@ class SkyServiceSpec:
|
|
135
139
|
service_config['post_data'] = post_data
|
136
140
|
service_config['readiness_headers'] = readiness_headers
|
137
141
|
|
142
|
+
ports = config.get('ports', None)
|
143
|
+
if ports is not None:
|
144
|
+
assert isinstance(ports, int)
|
145
|
+
if not 1 <= ports <= 65535:
|
146
|
+
with ux_utils.print_exception_no_traceback():
|
147
|
+
raise ValueError('Port must be between 1 and 65535.')
|
148
|
+
service_config['ports'] = str(ports) if ports is not None else None
|
149
|
+
|
138
150
|
policy_section = config.get('replica_policy', None)
|
139
151
|
simplified_policy_section = config.get('replicas', None)
|
140
152
|
if policy_section is None or simplified_policy_section is not None:
|
@@ -151,14 +163,8 @@ class SkyServiceSpec:
|
|
151
163
|
service_config['min_replicas'] = policy_section['min_replicas']
|
152
164
|
service_config['max_replicas'] = policy_section.get(
|
153
165
|
'max_replicas', None)
|
154
|
-
service_config['qps_upper_threshold'] = policy_section.get(
|
155
|
-
'qps_upper_threshold', None)
|
156
|
-
service_config['qps_lower_threshold'] = policy_section.get(
|
157
|
-
'qps_lower_threshold', None)
|
158
166
|
service_config['target_qps_per_replica'] = policy_section.get(
|
159
167
|
'target_qps_per_replica', None)
|
160
|
-
service_config['auto_restart'] = policy_section.get(
|
161
|
-
'auto_restart', None)
|
162
168
|
service_config['upscale_delay_seconds'] = policy_section.get(
|
163
169
|
'upscale_delay_seconds', None)
|
164
170
|
service_config['downscale_delay_seconds'] = policy_section.get(
|
@@ -169,6 +175,16 @@ class SkyServiceSpec:
|
|
169
175
|
service_config['dynamic_ondemand_fallback'] = policy_section.get(
|
170
176
|
'dynamic_ondemand_fallback', None)
|
171
177
|
|
178
|
+
service_config['load_balancing_policy'] = config.get(
|
179
|
+
'load_balancing_policy', None)
|
180
|
+
|
181
|
+
tls_section = config.get('tls', None)
|
182
|
+
if tls_section is not None:
|
183
|
+
service_config['tls_credential'] = serve_utils.TLSCredential(
|
184
|
+
keyfile=tls_section.get('keyfile', None),
|
185
|
+
certfile=tls_section.get('certfile', None),
|
186
|
+
)
|
187
|
+
|
172
188
|
return SkyServiceSpec(**service_config)
|
173
189
|
|
174
190
|
@staticmethod
|
@@ -192,9 +208,12 @@ class SkyServiceSpec:
|
|
192
208
|
return SkyServiceSpec.from_yaml_config(config['service'])
|
193
209
|
|
194
210
|
def to_yaml_config(self) -> Dict[str, Any]:
|
195
|
-
config =
|
211
|
+
config: Dict[str, Any] = {}
|
196
212
|
|
197
|
-
def add_if_not_none(section
|
213
|
+
def add_if_not_none(section: str,
|
214
|
+
key: Optional[str],
|
215
|
+
value: Any,
|
216
|
+
no_empty: bool = False):
|
198
217
|
if no_empty and not value:
|
199
218
|
return
|
200
219
|
if value is not None:
|
@@ -209,6 +228,8 @@ class SkyServiceSpec:
|
|
209
228
|
add_if_not_none('readiness_probe', 'initial_delay_seconds',
|
210
229
|
self.initial_delay_seconds)
|
211
230
|
add_if_not_none('readiness_probe', 'post_data', self.post_data)
|
231
|
+
add_if_not_none('readiness_probe', 'timeout_seconds',
|
232
|
+
self.readiness_timeout_seconds)
|
212
233
|
add_if_not_none('readiness_probe', 'headers', self._readiness_headers)
|
213
234
|
add_if_not_none('replica_policy', 'min_replicas', self.min_replicas)
|
214
235
|
add_if_not_none('replica_policy', 'max_replicas', self.max_replicas)
|
@@ -222,6 +243,12 @@ class SkyServiceSpec:
|
|
222
243
|
self.upscale_delay_seconds)
|
223
244
|
add_if_not_none('replica_policy', 'downscale_delay_seconds',
|
224
245
|
self.downscale_delay_seconds)
|
246
|
+
add_if_not_none('load_balancing_policy', None,
|
247
|
+
self._load_balancing_policy)
|
248
|
+
add_if_not_none('ports', None, int(self.ports) if self.ports else None)
|
249
|
+
if self.tls_credential is not None:
|
250
|
+
add_if_not_none('tls', 'keyfile', self.tls_credential.keyfile)
|
251
|
+
add_if_not_none('tls', 'certfile', self.tls_credential.certfile)
|
225
252
|
return config
|
226
253
|
|
227
254
|
def probe_str(self):
|
@@ -233,8 +260,8 @@ class SkyServiceSpec:
|
|
233
260
|
' with custom headers')
|
234
261
|
return f'{method}{headers}'
|
235
262
|
|
236
|
-
def spot_policy_str(self):
|
237
|
-
policy_strs = []
|
263
|
+
def spot_policy_str(self) -> str:
|
264
|
+
policy_strs: List[str] = []
|
238
265
|
if (self.dynamic_ondemand_fallback is not None and
|
239
266
|
self.dynamic_ondemand_fallback):
|
240
267
|
policy_strs.append('Dynamic on-demand fallback')
|
@@ -249,7 +276,9 @@ class SkyServiceSpec:
|
|
249
276
|
policy_strs.append('Static spot mixture with '
|
250
277
|
f'{self.base_ondemand_fallback_replicas} '
|
251
278
|
f'base on-demand replica{plural}')
|
252
|
-
|
279
|
+
if not policy_strs:
|
280
|
+
return 'No spot fallback policy'
|
281
|
+
return ' '.join(policy_strs)
|
253
282
|
|
254
283
|
def autoscaling_policy_str(self):
|
255
284
|
# TODO(MaoZiming): Update policy_str
|
@@ -264,12 +293,24 @@ class SkyServiceSpec:
|
|
264
293
|
f'replica{max_plural} (target QPS per replica: '
|
265
294
|
f'{self.target_qps_per_replica})')
|
266
295
|
|
296
|
+
def set_ports(self, ports: str) -> None:
|
297
|
+
self._ports = ports
|
298
|
+
|
299
|
+
def tls_str(self):
|
300
|
+
if self.tls_credential is None:
|
301
|
+
return 'No TLS Enabled'
|
302
|
+
return (f'Keyfile: {self.tls_credential.keyfile}, '
|
303
|
+
f'Certfile: {self.tls_credential.certfile}')
|
304
|
+
|
267
305
|
def __repr__(self) -> str:
|
268
306
|
return textwrap.dedent(f"""\
|
269
307
|
Readiness probe method: {self.probe_str()}
|
270
308
|
Readiness initial delay seconds: {self.initial_delay_seconds}
|
309
|
+
Readiness probe timeout seconds: {self.readiness_timeout_seconds}
|
271
310
|
Replica autoscaling policy: {self.autoscaling_policy_str()}
|
311
|
+
TLS Certificates: {self.tls_str()}
|
272
312
|
Spot Policy: {self.spot_policy_str()}
|
313
|
+
Load Balancing Policy: {self.load_balancing_policy}
|
273
314
|
""")
|
274
315
|
|
275
316
|
@property
|
@@ -280,6 +321,10 @@ class SkyServiceSpec:
|
|
280
321
|
def initial_delay_seconds(self) -> int:
|
281
322
|
return self._initial_delay_seconds
|
282
323
|
|
324
|
+
@property
|
325
|
+
def readiness_timeout_seconds(self) -> int:
|
326
|
+
return self._readiness_timeout_seconds
|
327
|
+
|
283
328
|
@property
|
284
329
|
def min_replicas(self) -> int:
|
285
330
|
return self._min_replicas
|
@@ -289,6 +334,10 @@ class SkyServiceSpec:
|
|
289
334
|
# If None, treated as having the same value of min_replicas.
|
290
335
|
return self._max_replicas
|
291
336
|
|
337
|
+
@property
|
338
|
+
def ports(self) -> Optional[str]:
|
339
|
+
return self._ports
|
340
|
+
|
292
341
|
@property
|
293
342
|
def target_qps_per_replica(self) -> Optional[float]:
|
294
343
|
return self._target_qps_per_replica
|
@@ -297,6 +346,15 @@ class SkyServiceSpec:
|
|
297
346
|
def post_data(self) -> Optional[Dict[str, Any]]:
|
298
347
|
return self._post_data
|
299
348
|
|
349
|
+
@property
|
350
|
+
def tls_credential(self) -> Optional[serve_utils.TLSCredential]:
|
351
|
+
return self._tls_credential
|
352
|
+
|
353
|
+
@tls_credential.setter
|
354
|
+
def tls_credential(self,
|
355
|
+
value: Optional[serve_utils.TLSCredential]) -> None:
|
356
|
+
self._tls_credential = value
|
357
|
+
|
300
358
|
@property
|
301
359
|
def readiness_headers(self) -> Optional[Dict[str, str]]:
|
302
360
|
return self._readiness_headers
|
@@ -320,3 +378,8 @@ class SkyServiceSpec:
|
|
320
378
|
@property
|
321
379
|
def use_ondemand_fallback(self) -> bool:
|
322
380
|
return self._use_ondemand_fallback
|
381
|
+
|
382
|
+
@property
|
383
|
+
def load_balancing_policy(self) -> str:
|
384
|
+
return lb_policies.LoadBalancingPolicy.make_policy_name(
|
385
|
+
self._load_balancing_policy)
|
sky/server/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
"""SkyPilot API Server."""
|