skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
File without changes
|
@@ -1,6 +1,8 @@
|
|
1
1
|
"""SkyServe core APIs."""
|
2
2
|
import re
|
3
|
+
import signal
|
3
4
|
import tempfile
|
5
|
+
import threading
|
4
6
|
from typing import Any, Dict, List, Optional, Tuple, Union
|
5
7
|
|
6
8
|
import colorama
|
@@ -8,6 +10,7 @@ import colorama
|
|
8
10
|
import sky
|
9
11
|
from sky import backends
|
10
12
|
from sky import exceptions
|
13
|
+
from sky import execution
|
11
14
|
from sky import sky_logging
|
12
15
|
from sky import task as task_lib
|
13
16
|
from sky.backends import backend_utils
|
@@ -17,6 +20,9 @@ from sky.serve import serve_state
|
|
17
20
|
from sky.serve import serve_utils
|
18
21
|
from sky.skylet import constants
|
19
22
|
from sky.usage import usage_lib
|
23
|
+
from sky.utils import admin_policy_utils
|
24
|
+
from sky.utils import command_runner
|
25
|
+
from sky.utils import common
|
20
26
|
from sky.utils import common_utils
|
21
27
|
from sky.utils import controller_utils
|
22
28
|
from sky.utils import resources_utils
|
@@ -63,7 +69,8 @@ def _validate_service_task(task: 'sky.Task') -> None:
|
|
63
69
|
'SkyServe will replenish preempted spot '
|
64
70
|
f'with {policy_description} instances.')
|
65
71
|
|
66
|
-
replica_ingress_port: Optional[int] =
|
72
|
+
replica_ingress_port: Optional[int] = int(
|
73
|
+
task.service.ports) if (task.service.ports is not None) else None
|
67
74
|
for requested_resources in task.resources:
|
68
75
|
if (task.service.use_ondemand_fallback and
|
69
76
|
not requested_resources.use_spot):
|
@@ -72,22 +79,58 @@ def _validate_service_task(task: 'sky.Task') -> None:
|
|
72
79
|
'`use_ondemand_fallback` is only supported '
|
73
80
|
'for spot resources. Please explicitly specify '
|
74
81
|
'`use_spot: true` in resources for on-demand fallback.')
|
75
|
-
|
76
|
-
|
77
|
-
|
78
|
-
|
79
|
-
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
|
84
|
-
|
85
|
-
|
86
|
-
|
87
|
-
|
88
|
-
|
89
|
-
|
90
|
-
|
82
|
+
if task.service.ports is None:
|
83
|
+
requested_ports = list(
|
84
|
+
resources_utils.port_ranges_to_set(requested_resources.ports))
|
85
|
+
if len(requested_ports) != 1:
|
86
|
+
with ux_utils.print_exception_no_traceback():
|
87
|
+
raise ValueError(
|
88
|
+
'To open multiple ports on the replica, please set the '
|
89
|
+
'`service.ports` field to specify a main service port. '
|
90
|
+
'Must only specify one port in resources otherwise. '
|
91
|
+
'Each replica will use the port specified as '
|
92
|
+
'application ingress port.')
|
93
|
+
service_port = requested_ports[0]
|
94
|
+
if replica_ingress_port is None:
|
95
|
+
replica_ingress_port = service_port
|
96
|
+
elif service_port != replica_ingress_port:
|
97
|
+
with ux_utils.print_exception_no_traceback():
|
98
|
+
raise ValueError(
|
99
|
+
f'Got multiple ports: {service_port} and '
|
100
|
+
f'{replica_ingress_port} in different resources. '
|
101
|
+
'Please specify the same port instead.')
|
102
|
+
|
103
|
+
|
104
|
+
def _rewrite_tls_credential_paths_and_get_tls_env_vars(
|
105
|
+
service_name: str, task: 'sky.Task') -> Dict[str, Any]:
|
106
|
+
"""Rewrite the paths of TLS credentials in the task.
|
107
|
+
|
108
|
+
Args:
|
109
|
+
service_name: Name of the service.
|
110
|
+
task: sky.Task to rewrite.
|
111
|
+
|
112
|
+
Returns:
|
113
|
+
The generated template variables for TLS.
|
114
|
+
"""
|
115
|
+
service_spec = task.service
|
116
|
+
# Already checked by _validate_service_task
|
117
|
+
assert service_spec is not None
|
118
|
+
if service_spec.tls_credential is None:
|
119
|
+
return {'use_tls': False}
|
120
|
+
remote_tls_keyfile = (
|
121
|
+
serve_utils.generate_remote_tls_keyfile_name(service_name))
|
122
|
+
remote_tls_certfile = (
|
123
|
+
serve_utils.generate_remote_tls_certfile_name(service_name))
|
124
|
+
tls_template_vars = {
|
125
|
+
'use_tls': True,
|
126
|
+
'remote_tls_keyfile': remote_tls_keyfile,
|
127
|
+
'remote_tls_certfile': remote_tls_certfile,
|
128
|
+
'local_tls_keyfile': service_spec.tls_credential.keyfile,
|
129
|
+
'local_tls_certfile': service_spec.tls_credential.certfile,
|
130
|
+
}
|
131
|
+
service_spec.tls_credential = serve_utils.TLSCredential(
|
132
|
+
remote_tls_keyfile, remote_tls_certfile)
|
133
|
+
return tls_template_vars
|
91
134
|
|
92
135
|
|
93
136
|
@usage_lib.entrypoint
|
@@ -95,7 +138,7 @@ def up(
|
|
95
138
|
task: 'sky.Task',
|
96
139
|
service_name: Optional[str] = None,
|
97
140
|
) -> Tuple[str, str]:
|
98
|
-
"""
|
141
|
+
"""Spins up a service.
|
99
142
|
|
100
143
|
Please refer to the sky.cli.serve_up for the document.
|
101
144
|
|
@@ -108,6 +151,7 @@ def up(
|
|
108
151
|
argument.
|
109
152
|
endpoint: str; The service endpoint.
|
110
153
|
"""
|
154
|
+
task.validate()
|
111
155
|
if service_name is None:
|
112
156
|
service_name = serve_utils.generate_service_name()
|
113
157
|
|
@@ -123,9 +167,20 @@ def up(
|
|
123
167
|
f'{constants.CLUSTER_NAME_VALID_REGEX}')
|
124
168
|
|
125
169
|
_validate_service_task(task)
|
170
|
+
# Always apply the policy again here, even though it might have been applied
|
171
|
+
# in the CLI. This is to ensure that we apply the policy to the final DAG
|
172
|
+
# and get the mutated config.
|
173
|
+
dag, mutated_user_config = admin_policy_utils.apply(
|
174
|
+
task, use_mutated_config_in_current_request=False)
|
175
|
+
task = dag.tasks[0]
|
176
|
+
|
177
|
+
with rich_utils.safe_status(
|
178
|
+
ux_utils.spinner_message('Initializing service')):
|
179
|
+
controller_utils.maybe_translate_local_file_mounts_and_sync_up(
|
180
|
+
task, task_type='serve')
|
126
181
|
|
127
|
-
|
128
|
-
|
182
|
+
tls_template_vars = _rewrite_tls_credential_paths_and_get_tls_env_vars(
|
183
|
+
service_name, task)
|
129
184
|
|
130
185
|
with tempfile.NamedTemporaryFile(
|
131
186
|
prefix=f'service-task-{service_name}-',
|
@@ -134,7 +189,7 @@ def up(
|
|
134
189
|
prefix=f'controller-task-{service_name}-',
|
135
190
|
mode='w',
|
136
191
|
) as controller_file:
|
137
|
-
controller_name =
|
192
|
+
controller_name = common.SKY_SERVE_CONTROLLER_NAME
|
138
193
|
task_config = task.to_yaml_config()
|
139
194
|
common_utils.dump_yaml(service_file.name, task_config)
|
140
195
|
remote_tmp_task_yaml_path = (
|
@@ -155,9 +210,11 @@ def up(
|
|
155
210
|
'remote_user_config_path': remote_config_yaml_path,
|
156
211
|
'modified_catalogs':
|
157
212
|
service_catalog_common.get_modified_catalog_file_mounts(),
|
213
|
+
**tls_template_vars,
|
158
214
|
**controller_utils.shared_controller_vars_to_fill(
|
159
215
|
controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
|
160
216
|
remote_user_config_path=remote_config_yaml_path,
|
217
|
+
local_user_config=mutated_user_config,
|
161
218
|
),
|
162
219
|
}
|
163
220
|
common_utils.fill_template(serve_constants.CONTROLLER_TEMPLATE,
|
@@ -192,15 +249,16 @@ def up(
|
|
192
249
|
# with the current job id, we know the service is up and running
|
193
250
|
# for the first time; otherwise it is a name conflict.
|
194
251
|
idle_minutes_to_autostop = constants.CONTROLLER_IDLE_MINUTES_TO_AUTOSTOP
|
195
|
-
|
196
|
-
|
197
|
-
|
198
|
-
|
199
|
-
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
252
|
+
# Since the controller may be shared among multiple users, launch the
|
253
|
+
# controller with the API server's user hash.
|
254
|
+
with common.with_server_user_hash():
|
255
|
+
controller_job_id, controller_handle = execution.launch(
|
256
|
+
task=controller_task,
|
257
|
+
cluster_name=controller_name,
|
258
|
+
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
259
|
+
retry_until_up=True,
|
260
|
+
_disable_controller_check=True,
|
261
|
+
)
|
204
262
|
|
205
263
|
style = colorama.Style
|
206
264
|
fore = colorama.Fore
|
@@ -209,7 +267,8 @@ def up(
|
|
209
267
|
# TODO(tian): Cache endpoint locally to speedup. Endpoint won't
|
210
268
|
# change after the first time, so there is no consistency issue.
|
211
269
|
with rich_utils.safe_status(
|
212
|
-
|
270
|
+
ux_utils.spinner_message(
|
271
|
+
'Waiting for the service to register')):
|
213
272
|
# This function will check the controller job id in the database
|
214
273
|
# and return the endpoint if the job id matches. Otherwise it will
|
215
274
|
# return None.
|
@@ -258,44 +317,47 @@ def up(
|
|
258
317
|
else:
|
259
318
|
lb_port = serve_utils.load_service_initialization_result(
|
260
319
|
lb_port_payload)
|
261
|
-
|
320
|
+
socket_endpoint = backend_utils.get_endpoints(
|
262
321
|
controller_handle.cluster_name, lb_port,
|
263
322
|
skip_status_check=True).get(lb_port)
|
264
|
-
assert
|
265
|
-
|
266
|
-
|
323
|
+
assert socket_endpoint is not None, (
|
324
|
+
'Did not get endpoint for controller.')
|
325
|
+
# Already checked by _validate_service_task
|
326
|
+
assert task.service is not None
|
327
|
+
protocol = ('http'
|
328
|
+
if task.service.tls_credential is None else 'https')
|
329
|
+
endpoint = f'{protocol}://{socket_endpoint}'
|
330
|
+
|
331
|
+
logger.info(
|
267
332
|
f'{fore.CYAN}Service name: '
|
268
333
|
f'{style.BRIGHT}{service_name}{style.RESET_ALL}'
|
269
334
|
f'\n{fore.CYAN}Endpoint URL: '
|
270
335
|
f'{style.BRIGHT}{endpoint}{style.RESET_ALL}'
|
271
|
-
'\
|
272
|
-
f'{
|
273
|
-
f'
|
274
|
-
'
|
275
|
-
f'{
|
276
|
-
f'{
|
277
|
-
'
|
278
|
-
'\
|
279
|
-
f'{
|
280
|
-
f'{
|
281
|
-
'\
|
282
|
-
f'{
|
283
|
-
f'{
|
284
|
-
'\
|
285
|
-
f'{
|
286
|
-
f'{
|
287
|
-
'\n'
|
288
|
-
'
|
289
|
-
f'{
|
290
|
-
f'{
|
291
|
-
'
|
292
|
-
f'{
|
293
|
-
|
294
|
-
'
|
295
|
-
|
296
|
-
f'{style.RESET_ALL}'
|
297
|
-
f'\n{fore.GREEN}The replicas should be ready within a '
|
298
|
-
f'short time.{style.RESET_ALL}')
|
336
|
+
f'\n📋 Useful Commands'
|
337
|
+
f'\n{ux_utils.INDENT_SYMBOL}To check service status:\t'
|
338
|
+
f'{ux_utils.BOLD}sky serve status {service_name} '
|
339
|
+
f'[--endpoint]{ux_utils.RESET_BOLD}'
|
340
|
+
f'\n{ux_utils.INDENT_SYMBOL}To teardown the service:\t'
|
341
|
+
f'{ux_utils.BOLD}sky serve down {service_name}'
|
342
|
+
f'{ux_utils.RESET_BOLD}'
|
343
|
+
f'\n{ux_utils.INDENT_SYMBOL}To see replica logs:\t'
|
344
|
+
f'{ux_utils.BOLD}sky serve logs {service_name} [REPLICA_ID]'
|
345
|
+
f'{ux_utils.RESET_BOLD}'
|
346
|
+
f'\n{ux_utils.INDENT_SYMBOL}To see load balancer logs:\t'
|
347
|
+
f'{ux_utils.BOLD}sky serve logs --load-balancer {service_name}'
|
348
|
+
f'{ux_utils.RESET_BOLD}'
|
349
|
+
f'\n{ux_utils.INDENT_SYMBOL}To see controller logs:\t'
|
350
|
+
f'{ux_utils.BOLD}sky serve logs --controller {service_name}'
|
351
|
+
f'{ux_utils.RESET_BOLD}'
|
352
|
+
f'\n{ux_utils.INDENT_SYMBOL}To monitor the status:\t'
|
353
|
+
f'{ux_utils.BOLD}watch -n10 sky serve status {service_name}'
|
354
|
+
f'{ux_utils.RESET_BOLD}'
|
355
|
+
f'\n{ux_utils.INDENT_LAST_SYMBOL}To send a test request:\t'
|
356
|
+
f'{ux_utils.BOLD}curl {endpoint}'
|
357
|
+
f'{ux_utils.RESET_BOLD}'
|
358
|
+
'\n\n' +
|
359
|
+
ux_utils.finishing_message('Service is spinning up and replicas '
|
360
|
+
'will be ready shortly.'))
|
299
361
|
return service_name, endpoint
|
300
362
|
|
301
363
|
|
@@ -304,24 +366,43 @@ def update(
|
|
304
366
|
task: 'sky.Task',
|
305
367
|
service_name: str,
|
306
368
|
mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE) -> None:
|
307
|
-
"""
|
369
|
+
"""Updates an existing service.
|
308
370
|
|
309
371
|
Please refer to the sky.cli.serve_update for the document.
|
310
372
|
|
311
373
|
Args:
|
312
374
|
task: sky.Task to update.
|
313
375
|
service_name: Name of the service.
|
376
|
+
mode: Update mode.
|
314
377
|
"""
|
378
|
+
task.validate()
|
315
379
|
_validate_service_task(task)
|
380
|
+
|
381
|
+
# Always apply the policy again here, even though it might have been applied
|
382
|
+
# in the CLI. This is to ensure that we apply the policy to the final DAG
|
383
|
+
# and get the mutated config.
|
384
|
+
# TODO(cblmemo,zhwu): If a user sets a new skypilot_config, the update
|
385
|
+
# will not apply the config.
|
386
|
+
dag, _ = admin_policy_utils.apply(
|
387
|
+
task, use_mutated_config_in_current_request=False)
|
388
|
+
task = dag.tasks[0]
|
389
|
+
|
390
|
+
assert task.service is not None
|
391
|
+
if task.service.tls_credential is not None:
|
392
|
+
logger.warning('Updating TLS keyfile and certfile is not supported. '
|
393
|
+
'Any updates to the keyfile and certfile will not take '
|
394
|
+
'effect. To update TLS keyfile and certfile, please '
|
395
|
+
'tear down the service and spin up a new one.')
|
396
|
+
|
316
397
|
handle = backend_utils.is_controller_accessible(
|
317
398
|
controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
|
318
399
|
stopped_message=
|
319
400
|
'Service controller is stopped. There is no service to update. '
|
320
|
-
f'To spin up a new service, use {
|
321
|
-
f'sky serve up{
|
401
|
+
f'To spin up a new service, use {ux_utils.BOLD}'
|
402
|
+
f'sky serve up{ux_utils.RESET_BOLD}',
|
322
403
|
non_existent_message='Service does not exist. '
|
323
404
|
'To spin up a new service, '
|
324
|
-
f'use {
|
405
|
+
f'use {ux_utils.BOLD}sky serve up{ux_utils.RESET_BOLD}',
|
325
406
|
)
|
326
407
|
|
327
408
|
backend = backend_utils.get_backend_from_handle(handle)
|
@@ -344,11 +425,11 @@ def update(
|
|
344
425
|
raise RuntimeError(e.error_msg) from e
|
345
426
|
|
346
427
|
service_statuses = serve_utils.load_service_status(serve_status_payload)
|
347
|
-
if
|
428
|
+
if not service_statuses:
|
348
429
|
with ux_utils.print_exception_no_traceback():
|
349
430
|
raise RuntimeError(f'Cannot find service {service_name!r}.'
|
350
|
-
f'To spin up a service, use {
|
351
|
-
f'sky serve up{
|
431
|
+
f'To spin up a service, use {ux_utils.BOLD}'
|
432
|
+
f'sky serve up{ux_utils.RESET_BOLD}')
|
352
433
|
|
353
434
|
if len(service_statuses) > 1:
|
354
435
|
with ux_utils.print_exception_no_traceback():
|
@@ -368,8 +449,21 @@ def update(
|
|
368
449
|
with ux_utils.print_exception_no_traceback():
|
369
450
|
raise RuntimeError(prompt)
|
370
451
|
|
371
|
-
|
372
|
-
|
452
|
+
original_lb_policy = service_record['load_balancing_policy']
|
453
|
+
assert task.service is not None, 'Service section not found.'
|
454
|
+
if original_lb_policy != task.service.load_balancing_policy:
|
455
|
+
logger.warning(
|
456
|
+
f'{colorama.Fore.YELLOW}Current load balancing policy '
|
457
|
+
f'{original_lb_policy!r} is different from the new policy '
|
458
|
+
f'{task.service.load_balancing_policy!r}. Updating the load '
|
459
|
+
'balancing policy is not supported yet and it will be ignored. '
|
460
|
+
'The service will continue to use the current load balancing '
|
461
|
+
f'policy.{colorama.Style.RESET_ALL}')
|
462
|
+
|
463
|
+
with rich_utils.safe_status(
|
464
|
+
ux_utils.spinner_message('Initializing service')):
|
465
|
+
controller_utils.maybe_translate_local_file_mounts_and_sync_up(
|
466
|
+
task, task_type='serve')
|
373
467
|
|
374
468
|
code = serve_utils.ServeCodeGen.add_version(service_name)
|
375
469
|
returncode, version_string_payload, stderr = backend.run_on_head(
|
@@ -427,8 +521,8 @@ def update(
|
|
427
521
|
|
428
522
|
print(f'{colorama.Fore.GREEN}Service {service_name!r} update scheduled.'
|
429
523
|
f'{colorama.Style.RESET_ALL}\n'
|
430
|
-
f'Please use {
|
431
|
-
f'{
|
524
|
+
f'Please use {ux_utils.BOLD}sky serve status {service_name} '
|
525
|
+
f'{ux_utils.RESET_BOLD}to check the latest status.')
|
432
526
|
|
433
527
|
|
434
528
|
@usage_lib.entrypoint
|
@@ -438,7 +532,7 @@ def down(
|
|
438
532
|
all: bool = False,
|
439
533
|
purge: bool = False,
|
440
534
|
) -> None:
|
441
|
-
"""
|
535
|
+
"""Tears down a service.
|
442
536
|
|
443
537
|
Please refer to the sky.cli.serve_down for the docs.
|
444
538
|
|
@@ -462,9 +556,9 @@ def down(
|
|
462
556
|
stopped_message='All services should have terminated.')
|
463
557
|
|
464
558
|
service_names_str = ','.join(service_names)
|
465
|
-
if sum([
|
466
|
-
argument_str = f'service_names={service_names_str}'
|
467
|
-
|
559
|
+
if sum([bool(service_names), all]) != 1:
|
560
|
+
argument_str = (f'service_names={service_names_str}'
|
561
|
+
if service_names else '')
|
468
562
|
argument_str += ' all' if all else ''
|
469
563
|
raise ValueError('Can only specify one of service_names or all. '
|
470
564
|
f'Provided {argument_str!r}.')
|
@@ -482,7 +576,7 @@ def down(
|
|
482
576
|
except exceptions.FetchClusterInfoError as e:
|
483
577
|
raise RuntimeError(
|
484
578
|
'Failed to fetch controller IP. Please refresh controller status '
|
485
|
-
f'by `sky status -r {
|
579
|
+
f'by `sky status -r {common.SKY_SERVE_CONTROLLER_NAME}` '
|
486
580
|
'and try again.') from e
|
487
581
|
|
488
582
|
try:
|
@@ -492,6 +586,53 @@ def down(
|
|
492
586
|
except exceptions.CommandError as e:
|
493
587
|
raise RuntimeError(e.error_msg) from e
|
494
588
|
|
589
|
+
logger.info(stdout)
|
590
|
+
|
591
|
+
|
592
|
+
@usage_lib.entrypoint
|
593
|
+
def terminate_replica(service_name: str, replica_id: int, purge: bool) -> None:
|
594
|
+
"""Tears down a specific replica for the given service.
|
595
|
+
|
596
|
+
Args:
|
597
|
+
service_name: Name of the service.
|
598
|
+
replica_id: ID of replica to terminate.
|
599
|
+
purge: Whether to terminate replicas in a failed status. These replicas
|
600
|
+
may lead to resource leaks, so we require the user to explicitly
|
601
|
+
specify this flag to make sure they are aware of this potential
|
602
|
+
resource leak.
|
603
|
+
|
604
|
+
Raises:
|
605
|
+
sky.exceptions.ClusterNotUpError: if the sky serve controller is not up.
|
606
|
+
RuntimeError: if failed to terminate the replica.
|
607
|
+
"""
|
608
|
+
handle = backend_utils.is_controller_accessible(
|
609
|
+
controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
|
610
|
+
stopped_message=
|
611
|
+
'No service is running now. Please spin up a service first.',
|
612
|
+
non_existent_message='No service is running now. '
|
613
|
+
'Please spin up a service first.',
|
614
|
+
)
|
615
|
+
|
616
|
+
backend = backend_utils.get_backend_from_handle(handle)
|
617
|
+
assert isinstance(backend, backends.CloudVmRayBackend)
|
618
|
+
|
619
|
+
code = serve_utils.ServeCodeGen.terminate_replica(service_name, replica_id,
|
620
|
+
purge)
|
621
|
+
returncode, stdout, stderr = backend.run_on_head(handle,
|
622
|
+
code,
|
623
|
+
require_outputs=True,
|
624
|
+
stream_logs=False,
|
625
|
+
separate_stderr=True)
|
626
|
+
|
627
|
+
try:
|
628
|
+
subprocess_utils.handle_returncode(returncode,
|
629
|
+
code,
|
630
|
+
'Failed to terminate the replica',
|
631
|
+
stderr,
|
632
|
+
stream_logs=True)
|
633
|
+
except exceptions.CommandError as e:
|
634
|
+
raise RuntimeError(e.error_msg) from e
|
635
|
+
|
495
636
|
sky_logging.print(stdout)
|
496
637
|
|
497
638
|
|
@@ -499,7 +640,7 @@ def down(
|
|
499
640
|
def status(
|
500
641
|
service_names: Optional[Union[str,
|
501
642
|
List[str]]] = None) -> List[Dict[str, Any]]:
|
502
|
-
"""
|
643
|
+
"""Gets service statuses.
|
503
644
|
|
504
645
|
If service_names is given, return those services. Otherwise, return all
|
505
646
|
services.
|
@@ -516,11 +657,12 @@ def status(
|
|
516
657
|
'status': (sky.ServiceStatus) service status,
|
517
658
|
'controller_port': (Optional[int]) controller port,
|
518
659
|
'load_balancer_port': (Optional[int]) load balancer port,
|
519
|
-
'
|
520
|
-
'
|
521
|
-
for replica (deprecated),
|
660
|
+
'endpoint': (Optional[str]) load balancer endpoint,
|
661
|
+
'policy': (Optional[str]) autoscaling policy description,
|
522
662
|
'requested_resources_str': (str) str representation of
|
523
663
|
requested resources,
|
664
|
+
'load_balancing_policy': (str) load balancing policy name,
|
665
|
+
'tls_encrypted': (bool) whether the service is TLS encrypted,
|
524
666
|
'replica_info': (List[Dict[str, Any]]) replica information,
|
525
667
|
}
|
526
668
|
|
@@ -535,6 +677,7 @@ def status(
|
|
535
677
|
'version': (int) replica version,
|
536
678
|
'launched_at': (int) timestamp of launch,
|
537
679
|
'handle': (ResourceHandle) handle of the replica cluster,
|
680
|
+
'endpoint': (str) endpoint of the replica,
|
538
681
|
}
|
539
682
|
|
540
683
|
For possible service statuses and replica statuses, please refer to
|
@@ -588,7 +731,24 @@ def status(
|
|
588
731
|
except exceptions.CommandError as e:
|
589
732
|
raise RuntimeError(e.error_msg) from e
|
590
733
|
|
591
|
-
|
734
|
+
service_records = serve_utils.load_service_status(serve_status_payload)
|
735
|
+
# Get the endpoint for each service
|
736
|
+
for service_record in service_records:
|
737
|
+
service_record['endpoint'] = None
|
738
|
+
if service_record['load_balancer_port'] is not None:
|
739
|
+
try:
|
740
|
+
endpoint = backend_utils.get_endpoints(
|
741
|
+
cluster=common.SKY_SERVE_CONTROLLER_NAME,
|
742
|
+
port=service_record['load_balancer_port']).get(
|
743
|
+
service_record['load_balancer_port'], None)
|
744
|
+
except exceptions.ClusterNotUpError:
|
745
|
+
pass
|
746
|
+
else:
|
747
|
+
protocol = ('https'
|
748
|
+
if service_record['tls_encrypted'] else 'http')
|
749
|
+
service_record['endpoint'] = f'{protocol}://{endpoint}'
|
750
|
+
|
751
|
+
return service_records
|
592
752
|
|
593
753
|
|
594
754
|
@usage_lib.entrypoint
|
@@ -599,7 +759,7 @@ def tail_logs(
|
|
599
759
|
replica_id: Optional[int] = None,
|
600
760
|
follow: bool = True,
|
601
761
|
) -> None:
|
602
|
-
"""
|
762
|
+
"""Tails logs for a service.
|
603
763
|
|
604
764
|
Usage:
|
605
765
|
sky.serve.tail_logs(
|
@@ -638,6 +798,7 @@ def tail_logs(
|
|
638
798
|
with ux_utils.print_exception_no_traceback():
|
639
799
|
raise ValueError(f'`target` must be a string or '
|
640
800
|
f'sky.serve.ServiceComponent, got {type(target)}.')
|
801
|
+
|
641
802
|
if target == serve_utils.ServiceComponent.REPLICA:
|
642
803
|
if replica_id is None:
|
643
804
|
with ux_utils.print_exception_no_traceback():
|
@@ -655,8 +816,28 @@ def tail_logs(
|
|
655
816
|
|
656
817
|
backend = backend_utils.get_backend_from_handle(handle)
|
657
818
|
assert isinstance(backend, backends.CloudVmRayBackend), backend
|
658
|
-
|
659
|
-
|
660
|
-
|
661
|
-
|
662
|
-
|
819
|
+
|
820
|
+
if target != serve_utils.ServiceComponent.REPLICA:
|
821
|
+
code = serve_utils.ServeCodeGen.stream_serve_process_logs(
|
822
|
+
service_name,
|
823
|
+
stream_controller=(
|
824
|
+
target == serve_utils.ServiceComponent.CONTROLLER),
|
825
|
+
follow=follow)
|
826
|
+
else:
|
827
|
+
assert replica_id is not None, service_name
|
828
|
+
code = serve_utils.ServeCodeGen.stream_replica_logs(
|
829
|
+
service_name, replica_id, follow)
|
830
|
+
|
831
|
+
# With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
|
832
|
+
# kill the process, so we need to handle it manually here.
|
833
|
+
if threading.current_thread() is threading.main_thread():
|
834
|
+
signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
|
835
|
+
signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
|
836
|
+
|
837
|
+
# Refer to the notes in
|
838
|
+
# sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
|
839
|
+
backend.run_on_head(handle,
|
840
|
+
code,
|
841
|
+
stream_logs=True,
|
842
|
+
process_stream=False,
|
843
|
+
ssh_mode=command_runner.SshMode.INTERACTIVE)
|