skypilot-nightly 1.0.0.dev2024053101__py3-none-any.whl → 1.0.0.dev2025022801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +64 -32
- sky/adaptors/aws.py +23 -6
- sky/adaptors/azure.py +432 -15
- sky/adaptors/cloudflare.py +5 -5
- sky/adaptors/common.py +19 -9
- sky/adaptors/do.py +20 -0
- sky/adaptors/gcp.py +3 -2
- sky/adaptors/kubernetes.py +122 -88
- sky/adaptors/nebius.py +100 -0
- sky/adaptors/oci.py +39 -1
- sky/adaptors/vast.py +29 -0
- sky/admin_policy.py +101 -0
- sky/authentication.py +117 -98
- sky/backends/backend.py +52 -20
- sky/backends/backend_utils.py +669 -557
- sky/backends/cloud_vm_ray_backend.py +1099 -808
- sky/backends/local_docker_backend.py +14 -8
- sky/backends/wheel_utils.py +38 -20
- sky/benchmark/benchmark_utils.py +22 -23
- sky/check.py +76 -27
- sky/cli.py +1586 -1139
- sky/client/__init__.py +1 -0
- sky/client/cli.py +5683 -0
- sky/client/common.py +345 -0
- sky/client/sdk.py +1765 -0
- sky/cloud_stores.py +283 -19
- sky/clouds/__init__.py +7 -2
- sky/clouds/aws.py +303 -112
- sky/clouds/azure.py +185 -179
- sky/clouds/cloud.py +115 -37
- sky/clouds/cudo.py +29 -22
- sky/clouds/do.py +313 -0
- sky/clouds/fluidstack.py +44 -54
- sky/clouds/gcp.py +206 -65
- sky/clouds/ibm.py +26 -21
- sky/clouds/kubernetes.py +345 -91
- sky/clouds/lambda_cloud.py +40 -29
- sky/clouds/nebius.py +297 -0
- sky/clouds/oci.py +129 -90
- sky/clouds/paperspace.py +22 -18
- sky/clouds/runpod.py +53 -34
- sky/clouds/scp.py +28 -24
- sky/clouds/service_catalog/__init__.py +19 -13
- sky/clouds/service_catalog/aws_catalog.py +29 -12
- sky/clouds/service_catalog/azure_catalog.py +33 -6
- sky/clouds/service_catalog/common.py +95 -75
- sky/clouds/service_catalog/constants.py +3 -3
- sky/clouds/service_catalog/cudo_catalog.py +13 -3
- sky/clouds/service_catalog/data_fetchers/fetch_aws.py +36 -21
- sky/clouds/service_catalog/data_fetchers/fetch_azure.py +31 -4
- sky/clouds/service_catalog/data_fetchers/fetch_cudo.py +8 -117
- sky/clouds/service_catalog/data_fetchers/fetch_fluidstack.py +197 -44
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +224 -36
- sky/clouds/service_catalog/data_fetchers/fetch_lambda_cloud.py +44 -24
- sky/clouds/service_catalog/data_fetchers/fetch_vast.py +147 -0
- sky/clouds/service_catalog/data_fetchers/fetch_vsphere.py +1 -1
- sky/clouds/service_catalog/do_catalog.py +111 -0
- sky/clouds/service_catalog/fluidstack_catalog.py +2 -2
- sky/clouds/service_catalog/gcp_catalog.py +16 -2
- sky/clouds/service_catalog/ibm_catalog.py +2 -2
- sky/clouds/service_catalog/kubernetes_catalog.py +192 -70
- sky/clouds/service_catalog/lambda_catalog.py +8 -3
- sky/clouds/service_catalog/nebius_catalog.py +116 -0
- sky/clouds/service_catalog/oci_catalog.py +31 -4
- sky/clouds/service_catalog/paperspace_catalog.py +2 -2
- sky/clouds/service_catalog/runpod_catalog.py +2 -2
- sky/clouds/service_catalog/scp_catalog.py +2 -2
- sky/clouds/service_catalog/vast_catalog.py +104 -0
- sky/clouds/service_catalog/vsphere_catalog.py +2 -2
- sky/clouds/utils/aws_utils.py +65 -0
- sky/clouds/utils/azure_utils.py +91 -0
- sky/clouds/utils/gcp_utils.py +5 -9
- sky/clouds/utils/oci_utils.py +47 -5
- sky/clouds/utils/scp_utils.py +4 -3
- sky/clouds/vast.py +280 -0
- sky/clouds/vsphere.py +22 -18
- sky/core.py +361 -107
- sky/dag.py +41 -28
- sky/data/data_transfer.py +37 -0
- sky/data/data_utils.py +211 -32
- sky/data/mounting_utils.py +182 -30
- sky/data/storage.py +2118 -270
- sky/data/storage_utils.py +126 -5
- sky/exceptions.py +179 -8
- sky/execution.py +158 -85
- sky/global_user_state.py +150 -34
- sky/jobs/__init__.py +12 -10
- sky/jobs/client/__init__.py +0 -0
- sky/jobs/client/sdk.py +302 -0
- sky/jobs/constants.py +49 -11
- sky/jobs/controller.py +161 -99
- sky/jobs/dashboard/dashboard.py +171 -25
- sky/jobs/dashboard/templates/index.html +572 -60
- sky/jobs/recovery_strategy.py +157 -156
- sky/jobs/scheduler.py +307 -0
- sky/jobs/server/__init__.py +1 -0
- sky/jobs/server/core.py +598 -0
- sky/jobs/server/dashboard_utils.py +69 -0
- sky/jobs/server/server.py +190 -0
- sky/jobs/state.py +627 -122
- sky/jobs/utils.py +615 -206
- sky/models.py +27 -0
- sky/optimizer.py +142 -83
- sky/provision/__init__.py +20 -5
- sky/provision/aws/config.py +124 -42
- sky/provision/aws/instance.py +130 -53
- sky/provision/azure/__init__.py +7 -0
- sky/{skylet/providers → provision}/azure/azure-config-template.json +19 -7
- sky/provision/azure/config.py +220 -0
- sky/provision/azure/instance.py +1012 -37
- sky/provision/common.py +31 -3
- sky/provision/constants.py +25 -0
- sky/provision/cudo/__init__.py +2 -1
- sky/provision/cudo/cudo_utils.py +112 -0
- sky/provision/cudo/cudo_wrapper.py +37 -16
- sky/provision/cudo/instance.py +28 -12
- sky/provision/do/__init__.py +11 -0
- sky/provision/do/config.py +14 -0
- sky/provision/do/constants.py +10 -0
- sky/provision/do/instance.py +287 -0
- sky/provision/do/utils.py +301 -0
- sky/provision/docker_utils.py +82 -46
- sky/provision/fluidstack/fluidstack_utils.py +57 -125
- sky/provision/fluidstack/instance.py +15 -43
- sky/provision/gcp/config.py +19 -9
- sky/provision/gcp/constants.py +7 -1
- sky/provision/gcp/instance.py +55 -34
- sky/provision/gcp/instance_utils.py +339 -80
- sky/provision/gcp/mig_utils.py +210 -0
- sky/provision/instance_setup.py +172 -133
- sky/provision/kubernetes/__init__.py +1 -0
- sky/provision/kubernetes/config.py +104 -90
- sky/provision/kubernetes/constants.py +8 -0
- sky/provision/kubernetes/instance.py +680 -325
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +3 -0
- sky/provision/kubernetes/network.py +54 -20
- sky/provision/kubernetes/network_utils.py +70 -21
- sky/provision/kubernetes/utils.py +1370 -251
- sky/provision/lambda_cloud/__init__.py +11 -0
- sky/provision/lambda_cloud/config.py +10 -0
- sky/provision/lambda_cloud/instance.py +265 -0
- sky/{clouds/utils → provision/lambda_cloud}/lambda_utils.py +24 -23
- sky/provision/logging.py +1 -1
- sky/provision/nebius/__init__.py +11 -0
- sky/provision/nebius/config.py +11 -0
- sky/provision/nebius/instance.py +285 -0
- sky/provision/nebius/utils.py +318 -0
- sky/provision/oci/__init__.py +15 -0
- sky/provision/oci/config.py +51 -0
- sky/provision/oci/instance.py +436 -0
- sky/provision/oci/query_utils.py +681 -0
- sky/provision/paperspace/constants.py +6 -0
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/paperspace/utils.py +2 -0
- sky/provision/provisioner.py +207 -130
- sky/provision/runpod/__init__.py +1 -0
- sky/provision/runpod/api/__init__.py +3 -0
- sky/provision/runpod/api/commands.py +119 -0
- sky/provision/runpod/api/pods.py +142 -0
- sky/provision/runpod/instance.py +64 -8
- sky/provision/runpod/utils.py +239 -23
- sky/provision/vast/__init__.py +10 -0
- sky/provision/vast/config.py +11 -0
- sky/provision/vast/instance.py +247 -0
- sky/provision/vast/utils.py +162 -0
- sky/provision/vsphere/common/vim_utils.py +1 -1
- sky/provision/vsphere/instance.py +8 -18
- sky/provision/vsphere/vsphere_utils.py +1 -1
- sky/resources.py +247 -102
- sky/serve/__init__.py +9 -9
- sky/serve/autoscalers.py +361 -299
- sky/serve/client/__init__.py +0 -0
- sky/serve/client/sdk.py +366 -0
- sky/serve/constants.py +12 -3
- sky/serve/controller.py +106 -36
- sky/serve/load_balancer.py +63 -12
- sky/serve/load_balancing_policies.py +84 -2
- sky/serve/replica_managers.py +42 -34
- sky/serve/serve_state.py +62 -32
- sky/serve/serve_utils.py +271 -160
- sky/serve/server/__init__.py +0 -0
- sky/serve/{core.py → server/core.py} +271 -90
- sky/serve/server/server.py +112 -0
- sky/serve/service.py +52 -16
- sky/serve/service_spec.py +95 -32
- sky/server/__init__.py +1 -0
- sky/server/common.py +430 -0
- sky/server/constants.py +21 -0
- sky/server/html/log.html +174 -0
- sky/server/requests/__init__.py +0 -0
- sky/server/requests/executor.py +472 -0
- sky/server/requests/payloads.py +487 -0
- sky/server/requests/queues/__init__.py +0 -0
- sky/server/requests/queues/mp_queue.py +76 -0
- sky/server/requests/requests.py +567 -0
- sky/server/requests/serializers/__init__.py +0 -0
- sky/server/requests/serializers/decoders.py +192 -0
- sky/server/requests/serializers/encoders.py +166 -0
- sky/server/server.py +1106 -0
- sky/server/stream_utils.py +141 -0
- sky/setup_files/MANIFEST.in +2 -5
- sky/setup_files/dependencies.py +159 -0
- sky/setup_files/setup.py +14 -125
- sky/sky_logging.py +59 -14
- sky/skylet/autostop_lib.py +2 -2
- sky/skylet/constants.py +183 -50
- sky/skylet/events.py +22 -10
- sky/skylet/job_lib.py +403 -258
- sky/skylet/log_lib.py +111 -71
- sky/skylet/log_lib.pyi +6 -0
- sky/skylet/providers/command_runner.py +6 -8
- sky/skylet/providers/ibm/node_provider.py +2 -2
- sky/skylet/providers/scp/config.py +11 -3
- sky/skylet/providers/scp/node_provider.py +8 -8
- sky/skylet/skylet.py +3 -1
- sky/skylet/subprocess_daemon.py +69 -17
- sky/skypilot_config.py +119 -57
- sky/task.py +205 -64
- sky/templates/aws-ray.yml.j2 +37 -7
- sky/templates/azure-ray.yml.j2 +27 -82
- sky/templates/cudo-ray.yml.j2 +7 -3
- sky/templates/do-ray.yml.j2 +98 -0
- sky/templates/fluidstack-ray.yml.j2 +7 -4
- sky/templates/gcp-ray.yml.j2 +26 -6
- sky/templates/ibm-ray.yml.j2 +3 -2
- sky/templates/jobs-controller.yaml.j2 +46 -11
- sky/templates/kubernetes-ingress.yml.j2 +7 -0
- sky/templates/kubernetes-loadbalancer.yml.j2 +7 -0
- sky/templates/{kubernetes-port-forward-proxy-command.sh.j2 → kubernetes-port-forward-proxy-command.sh} +51 -7
- sky/templates/kubernetes-ray.yml.j2 +292 -25
- sky/templates/lambda-ray.yml.j2 +30 -40
- sky/templates/nebius-ray.yml.j2 +79 -0
- sky/templates/oci-ray.yml.j2 +18 -57
- sky/templates/paperspace-ray.yml.j2 +10 -6
- sky/templates/runpod-ray.yml.j2 +26 -4
- sky/templates/scp-ray.yml.j2 +3 -2
- sky/templates/sky-serve-controller.yaml.j2 +12 -1
- sky/templates/skypilot-server-kubernetes-proxy.sh +36 -0
- sky/templates/vast-ray.yml.j2 +70 -0
- sky/templates/vsphere-ray.yml.j2 +8 -3
- sky/templates/websocket_proxy.py +64 -0
- sky/usage/constants.py +10 -1
- sky/usage/usage_lib.py +130 -37
- sky/utils/accelerator_registry.py +35 -51
- sky/utils/admin_policy_utils.py +147 -0
- sky/utils/annotations.py +51 -0
- sky/utils/cli_utils/status_utils.py +81 -23
- sky/utils/cluster_utils.py +356 -0
- sky/utils/command_runner.py +452 -89
- sky/utils/command_runner.pyi +77 -3
- sky/utils/common.py +54 -0
- sky/utils/common_utils.py +319 -108
- sky/utils/config_utils.py +204 -0
- sky/utils/control_master_utils.py +48 -0
- sky/utils/controller_utils.py +548 -266
- sky/utils/dag_utils.py +93 -32
- sky/utils/db_utils.py +18 -4
- sky/utils/env_options.py +29 -7
- sky/utils/kubernetes/create_cluster.sh +8 -60
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +73 -0
- sky/utils/kubernetes/generate_kubeconfig.sh +336 -0
- sky/utils/kubernetes/gpu_labeler.py +4 -4
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +4 -3
- sky/utils/kubernetes/kubernetes_deploy_utils.py +228 -0
- sky/utils/kubernetes/rsync_helper.sh +24 -0
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +1 -1
- sky/utils/log_utils.py +240 -33
- sky/utils/message_utils.py +81 -0
- sky/utils/registry.py +127 -0
- sky/utils/resources_utils.py +94 -22
- sky/utils/rich_utils.py +247 -18
- sky/utils/schemas.py +284 -64
- sky/{status_lib.py → utils/status_lib.py} +12 -7
- sky/utils/subprocess_utils.py +212 -46
- sky/utils/timeline.py +12 -7
- sky/utils/ux_utils.py +168 -15
- skypilot_nightly-1.0.0.dev2025022801.dist-info/METADATA +363 -0
- skypilot_nightly-1.0.0.dev2025022801.dist-info/RECORD +352 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/WHEEL +1 -1
- sky/clouds/cloud_registry.py +0 -31
- sky/jobs/core.py +0 -330
- sky/skylet/providers/azure/__init__.py +0 -2
- sky/skylet/providers/azure/azure-vm-template.json +0 -301
- sky/skylet/providers/azure/config.py +0 -170
- sky/skylet/providers/azure/node_provider.py +0 -466
- sky/skylet/providers/lambda_cloud/__init__.py +0 -2
- sky/skylet/providers/lambda_cloud/node_provider.py +0 -320
- sky/skylet/providers/oci/__init__.py +0 -2
- sky/skylet/providers/oci/node_provider.py +0 -488
- sky/skylet/providers/oci/query_helper.py +0 -383
- sky/skylet/providers/oci/utils.py +0 -21
- sky/utils/cluster_yaml_utils.py +0 -24
- sky/utils/kubernetes/generate_static_kubeconfig.sh +0 -137
- skypilot_nightly-1.0.0.dev2024053101.dist-info/METADATA +0 -315
- skypilot_nightly-1.0.0.dev2024053101.dist-info/RECORD +0 -275
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev2024053101.dist-info → skypilot_nightly-1.0.0.dev2025022801.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py
CHANGED
@@ -1,6 +1,7 @@
|
|
1
1
|
"""User interface with the SkyServe."""
|
2
2
|
import base64
|
3
3
|
import collections
|
4
|
+
import dataclasses
|
4
5
|
import enum
|
5
6
|
import os
|
6
7
|
import pathlib
|
@@ -23,15 +24,15 @@ import requests
|
|
23
24
|
from sky import backends
|
24
25
|
from sky import exceptions
|
25
26
|
from sky import global_user_state
|
26
|
-
from sky import status_lib
|
27
|
-
from sky.backends import backend_utils
|
28
27
|
from sky.serve import constants
|
29
28
|
from sky.serve import serve_state
|
30
29
|
from sky.skylet import constants as skylet_constants
|
31
30
|
from sky.skylet import job_lib
|
32
31
|
from sky.utils import common_utils
|
33
32
|
from sky.utils import log_utils
|
33
|
+
from sky.utils import message_utils
|
34
34
|
from sky.utils import resources_utils
|
35
|
+
from sky.utils import status_lib
|
35
36
|
from sky.utils import ux_utils
|
36
37
|
|
37
38
|
if typing.TYPE_CHECKING:
|
@@ -39,15 +40,19 @@ if typing.TYPE_CHECKING:
|
|
39
40
|
|
40
41
|
from sky.serve import replica_managers
|
41
42
|
|
42
|
-
SKY_SERVE_CONTROLLER_NAME: str = (
|
43
|
-
f'sky-serve-controller-{common_utils.get_user_hash()}')
|
44
43
|
_SYSTEM_MEMORY_GB = psutil.virtual_memory().total // (1024**3)
|
45
44
|
NUM_SERVICE_THRESHOLD = (_SYSTEM_MEMORY_GB //
|
46
45
|
constants.CONTROLLER_MEMORY_USAGE_GB)
|
47
46
|
_CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
|
48
47
|
|
49
|
-
|
50
|
-
|
48
|
+
# NOTE(dev): We assume log paths are either in ~/sky_logs/... or ~/.sky/...
|
49
|
+
# and always appear after a space. Be careful when changing UX as this
|
50
|
+
# assumption is used to expand some log files while ignoring others.
|
51
|
+
_SKYPILOT_LOG_DIRS = r'~/(sky_logs|\.sky)'
|
52
|
+
_SKYPILOT_PROVISION_LOG_PATTERN = (
|
53
|
+
fr'.* ({_SKYPILOT_LOG_DIRS}/.*provision\.log)')
|
54
|
+
_SKYPILOT_LOG_PATTERN = fr'.* ({_SKYPILOT_LOG_DIRS}/.*\.log)'
|
55
|
+
|
51
56
|
# TODO(tian): Find all existing replica id and print here.
|
52
57
|
_FAILED_TO_FIND_REPLICA_MSG = (
|
53
58
|
f'{colorama.Fore.RED}Failed to find replica '
|
@@ -86,6 +91,19 @@ class UpdateMode(enum.Enum):
|
|
86
91
|
BLUE_GREEN = 'blue_green'
|
87
92
|
|
88
93
|
|
94
|
+
@dataclasses.dataclass
|
95
|
+
class TLSCredential:
|
96
|
+
"""TLS credential for the service."""
|
97
|
+
keyfile: str
|
98
|
+
certfile: str
|
99
|
+
|
100
|
+
def dump_uvicorn_kwargs(self) -> Dict[str, str]:
|
101
|
+
return {
|
102
|
+
'ssl_keyfile': os.path.expanduser(self.keyfile),
|
103
|
+
'ssl_certfile': os.path.expanduser(self.certfile),
|
104
|
+
}
|
105
|
+
|
106
|
+
|
89
107
|
DEFAULT_UPDATE_MODE = UpdateMode.ROLLING
|
90
108
|
|
91
109
|
_SIGNAL_TO_ERROR = {
|
@@ -104,7 +122,7 @@ ValueType = TypeVar('ValueType')
|
|
104
122
|
class ThreadSafeDict(Generic[KeyType, ValueType]):
|
105
123
|
"""A thread-safe dict."""
|
106
124
|
|
107
|
-
def __init__(self, *args, **kwargs) -> None:
|
125
|
+
def __init__(self, *args: Any, **kwargs: Any) -> None:
|
108
126
|
self._dict: Dict[KeyType, ValueType] = dict(*args, **kwargs)
|
109
127
|
self._lock = threading.Lock()
|
110
128
|
|
@@ -237,6 +255,18 @@ def generate_replica_log_file_name(service_name: str, replica_id: int) -> str:
|
|
237
255
|
return os.path.join(dir_name, f'replica_{replica_id}.log')
|
238
256
|
|
239
257
|
|
258
|
+
def generate_remote_tls_keyfile_name(service_name: str) -> str:
|
259
|
+
dir_name = generate_remote_service_dir_name(service_name)
|
260
|
+
# Don't expand here since it is used for remote machine.
|
261
|
+
return os.path.join(dir_name, 'tls_keyfile')
|
262
|
+
|
263
|
+
|
264
|
+
def generate_remote_tls_certfile_name(service_name: str) -> str:
|
265
|
+
dir_name = generate_remote_service_dir_name(service_name)
|
266
|
+
# Don't expand here since it is used for remote machine.
|
267
|
+
return os.path.join(dir_name, 'tls_certfile')
|
268
|
+
|
269
|
+
|
240
270
|
def generate_replica_cluster_name(service_name: str, replica_id: int) -> str:
|
241
271
|
return f'{service_name}-{replica_id}'
|
242
272
|
|
@@ -246,9 +276,11 @@ def set_service_status_and_active_versions_from_replica(
|
|
246
276
|
update_mode: UpdateMode) -> None:
|
247
277
|
record = serve_state.get_service_from_name(service_name)
|
248
278
|
if record is None:
|
249
|
-
|
250
|
-
|
251
|
-
|
279
|
+
with ux_utils.print_exception_no_traceback():
|
280
|
+
raise ValueError(
|
281
|
+
'The service is up-ed in an old version and does not '
|
282
|
+
'support update. Please `sky serve down` '
|
283
|
+
'it first and relaunch the service.')
|
252
284
|
if record['status'] == serve_state.ServiceStatus.SHUTTING_DOWN:
|
253
285
|
# When the service is shutting down, there is a period of time which the
|
254
286
|
# controller still responds to the request, and the replica is not
|
@@ -289,7 +321,8 @@ def update_service_status() -> None:
|
|
289
321
|
def update_service_encoded(service_name: str, version: int, mode: str) -> str:
|
290
322
|
service_status = _get_service_status(service_name)
|
291
323
|
if service_status is None:
|
292
|
-
|
324
|
+
with ux_utils.print_exception_no_traceback():
|
325
|
+
raise ValueError(f'Service {service_name!r} does not exist.')
|
293
326
|
controller_port = service_status['controller_port']
|
294
327
|
resp = requests.post(
|
295
328
|
_CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
|
@@ -299,14 +332,54 @@ def update_service_encoded(service_name: str, version: int, mode: str) -> str:
|
|
299
332
|
'mode': mode,
|
300
333
|
})
|
301
334
|
if resp.status_code == 404:
|
302
|
-
|
303
|
-
|
304
|
-
|
335
|
+
with ux_utils.print_exception_no_traceback():
|
336
|
+
raise ValueError(
|
337
|
+
'The service is up-ed in an old version and does not '
|
338
|
+
'support update. Please `sky serve down` '
|
339
|
+
'it first and relaunch the service. ')
|
340
|
+
elif resp.status_code == 400:
|
341
|
+
with ux_utils.print_exception_no_traceback():
|
342
|
+
raise ValueError(f'Client error during service update: {resp.text}')
|
343
|
+
elif resp.status_code == 500:
|
344
|
+
with ux_utils.print_exception_no_traceback():
|
345
|
+
raise RuntimeError(
|
346
|
+
f'Server error during service update: {resp.text}')
|
305
347
|
elif resp.status_code != 200:
|
306
|
-
|
348
|
+
with ux_utils.print_exception_no_traceback():
|
349
|
+
raise ValueError(f'Failed to update service: {resp.text}')
|
307
350
|
|
308
351
|
service_msg = resp.json()['message']
|
309
|
-
return
|
352
|
+
return message_utils.encode_payload(service_msg)
|
353
|
+
|
354
|
+
|
355
|
+
def terminate_replica(service_name: str, replica_id: int, purge: bool) -> str:
|
356
|
+
service_status = _get_service_status(service_name)
|
357
|
+
if service_status is None:
|
358
|
+
with ux_utils.print_exception_no_traceback():
|
359
|
+
raise ValueError(f'Service {service_name!r} does not exist.')
|
360
|
+
replica_info = serve_state.get_replica_info_from_id(service_name,
|
361
|
+
replica_id)
|
362
|
+
if replica_info is None:
|
363
|
+
with ux_utils.print_exception_no_traceback():
|
364
|
+
raise ValueError(
|
365
|
+
f'Replica {replica_id} for service {service_name} does not '
|
366
|
+
'exist.')
|
367
|
+
|
368
|
+
controller_port = service_status['controller_port']
|
369
|
+
resp = requests.post(
|
370
|
+
_CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
|
371
|
+
'/controller/terminate_replica',
|
372
|
+
json={
|
373
|
+
'replica_id': replica_id,
|
374
|
+
'purge': purge,
|
375
|
+
})
|
376
|
+
|
377
|
+
message: str = resp.json()['message']
|
378
|
+
if resp.status_code != 200:
|
379
|
+
with ux_utils.print_exception_no_traceback():
|
380
|
+
raise ValueError(f'Failed to terminate replica {replica_id} '
|
381
|
+
f'in {service_name}. Reason:\n{message}')
|
382
|
+
return message
|
310
383
|
|
311
384
|
|
312
385
|
def _get_service_status(
|
@@ -334,7 +407,7 @@ def _get_service_status(
|
|
334
407
|
|
335
408
|
|
336
409
|
def get_service_status_encoded(service_names: Optional[List[str]]) -> str:
|
337
|
-
service_statuses = []
|
410
|
+
service_statuses: List[Dict[str, str]] = []
|
338
411
|
if service_names is None:
|
339
412
|
# Get all service names
|
340
413
|
service_names = serve_state.get_glob_service_names(None)
|
@@ -346,13 +419,28 @@ def get_service_status_encoded(service_names: Optional[List[str]]) -> str:
|
|
346
419
|
k: base64.b64encode(pickle.dumps(v)).decode('utf-8')
|
347
420
|
for k, v in service_status.items()
|
348
421
|
})
|
349
|
-
|
422
|
+
# We have to use payload_type here to avoid the issue of
|
423
|
+
# message_utils.decode_payload() not being able to correctly decode the
|
424
|
+
# message with <sky-payload> tags.
|
425
|
+
return message_utils.encode_payload(service_statuses,
|
426
|
+
payload_type='service_status')
|
350
427
|
|
351
428
|
|
352
429
|
def load_service_status(payload: str) -> List[Dict[str, Any]]:
|
353
|
-
|
354
|
-
|
430
|
+
try:
|
431
|
+
service_statuses_encoded = message_utils.decode_payload(
|
432
|
+
payload, payload_type='service_status')
|
433
|
+
except ValueError as e:
|
434
|
+
if 'Invalid payload string' in str(e):
|
435
|
+
# Backward compatibility for serve controller started before #4660
|
436
|
+
# where the payload type is not added.
|
437
|
+
service_statuses_encoded = message_utils.decode_payload(payload)
|
438
|
+
else:
|
439
|
+
raise
|
440
|
+
service_statuses: List[Dict[str, Any]] = []
|
355
441
|
for service_status in service_statuses_encoded:
|
442
|
+
if not isinstance(service_status, dict):
|
443
|
+
raise ValueError(f'Invalid service status: {service_status}')
|
356
444
|
service_statuses.append({
|
357
445
|
k: pickle.loads(base64.b64decode(v))
|
358
446
|
for k, v in service_status.items()
|
@@ -362,16 +450,16 @@ def load_service_status(payload: str) -> List[Dict[str, Any]]:
|
|
362
450
|
|
363
451
|
def add_version_encoded(service_name: str) -> str:
|
364
452
|
new_version = serve_state.add_version(service_name)
|
365
|
-
return
|
453
|
+
return message_utils.encode_payload(new_version)
|
366
454
|
|
367
455
|
|
368
456
|
def load_version_string(payload: str) -> str:
|
369
|
-
return
|
457
|
+
return message_utils.decode_payload(payload)
|
370
458
|
|
371
459
|
|
372
460
|
def _terminate_failed_services(
|
373
461
|
service_name: str,
|
374
|
-
service_status: serve_state.ServiceStatus) -> Optional[str]:
|
462
|
+
service_status: Optional[serve_state.ServiceStatus]) -> Optional[str]:
|
375
463
|
"""Terminate service in failed status.
|
376
464
|
|
377
465
|
Services included in ServiceStatus.failed_statuses() do not have an
|
@@ -383,7 +471,7 @@ def _terminate_failed_services(
|
|
383
471
|
A message indicating potential resource leak (if any). If no
|
384
472
|
resource leak is detected, return None.
|
385
473
|
"""
|
386
|
-
remaining_replica_clusters = []
|
474
|
+
remaining_replica_clusters: List[str] = []
|
387
475
|
# The controller should have already attempted to terminate those
|
388
476
|
# replicas, so we don't need to try again here.
|
389
477
|
for replica_info in serve_state.get_replica_infos(service_name):
|
@@ -397,6 +485,7 @@ def _terminate_failed_services(
|
|
397
485
|
generate_remote_service_dir_name(service_name))
|
398
486
|
shutil.rmtree(service_dir)
|
399
487
|
serve_state.remove_service(service_name)
|
488
|
+
serve_state.delete_all_versions(service_name)
|
400
489
|
|
401
490
|
if not remaining_replica_clusters:
|
402
491
|
return None
|
@@ -409,26 +498,35 @@ def _terminate_failed_services(
|
|
409
498
|
|
410
499
|
def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
|
411
500
|
service_names = serve_state.get_glob_service_names(service_names)
|
412
|
-
terminated_service_names = []
|
413
|
-
messages = []
|
501
|
+
terminated_service_names: List[str] = []
|
502
|
+
messages: List[str] = []
|
414
503
|
for service_name in service_names:
|
415
504
|
service_status = _get_service_status(service_name,
|
416
505
|
with_replica_info=False)
|
417
|
-
|
418
|
-
|
506
|
+
if (service_status is not None and service_status['status']
|
507
|
+
== serve_state.ServiceStatus.SHUTTING_DOWN):
|
419
508
|
# Already scheduled to be terminated.
|
420
509
|
continue
|
421
|
-
|
510
|
+
# If the `services` and `version_specs` table are not aligned, it might
|
511
|
+
# result in a None service status. In this case, the controller process
|
512
|
+
# is not functioning as well and we should also use the
|
513
|
+
# `_terminate_failed_services` function to clean up the service.
|
514
|
+
# This is a safeguard for a rare case, that is accidentally abort
|
515
|
+
# between `serve_state.add_service` and
|
516
|
+
# `serve_state.add_or_update_version` in service.py.
|
517
|
+
if (service_status is None or service_status['status']
|
422
518
|
in serve_state.ServiceStatus.failed_statuses()):
|
519
|
+
failed_status = (service_status['status']
|
520
|
+
if service_status is not None else None)
|
423
521
|
if purge:
|
424
522
|
message = _terminate_failed_services(service_name,
|
425
|
-
|
523
|
+
failed_status)
|
426
524
|
if message is not None:
|
427
525
|
messages.append(message)
|
428
526
|
else:
|
429
527
|
messages.append(
|
430
528
|
f'{colorama.Fore.YELLOW}Service {service_name!r} is in '
|
431
|
-
f'failed status ({
|
529
|
+
f'failed status ({failed_status}). Skipping '
|
432
530
|
'its termination as it could lead to a resource leak. '
|
433
531
|
f'(Use `sky serve down {service_name} --purge` to '
|
434
532
|
'forcefully terminate the service.)'
|
@@ -447,7 +545,7 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
|
|
447
545
|
f.write(UserSignal.TERMINATE.value)
|
448
546
|
f.flush()
|
449
547
|
terminated_service_names.append(f'{service_name!r}')
|
450
|
-
if
|
548
|
+
if not terminated_service_names:
|
451
549
|
messages.append('No service to terminate.')
|
452
550
|
else:
|
453
551
|
identity_str = f'Service {terminated_service_names[0]} is'
|
@@ -472,7 +570,31 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
|
|
472
570
|
Encoded load balancer port assigned to the service.
|
473
571
|
"""
|
474
572
|
start_time = time.time()
|
573
|
+
setup_completed = False
|
475
574
|
while True:
|
575
|
+
job_status = job_lib.get_status(job_id)
|
576
|
+
if job_status is None or job_status < job_lib.JobStatus.RUNNING:
|
577
|
+
# Wait for the controller process to finish setting up. It can be
|
578
|
+
# slow if a lot cloud dependencies are being installed.
|
579
|
+
if (time.time() - start_time >
|
580
|
+
constants.CONTROLLER_SETUP_TIMEOUT_SECONDS):
|
581
|
+
with ux_utils.print_exception_no_traceback():
|
582
|
+
raise RuntimeError(
|
583
|
+
f'Failed to start the controller '
|
584
|
+
f'process for the service {service_name!r} '
|
585
|
+
f'within '
|
586
|
+
f'{constants.CONTROLLER_SETUP_TIMEOUT_SECONDS} seconds.'
|
587
|
+
)
|
588
|
+
# No need to check the service status as the controller process
|
589
|
+
# is still setting up.
|
590
|
+
time.sleep(1)
|
591
|
+
continue
|
592
|
+
|
593
|
+
if not setup_completed:
|
594
|
+
setup_completed = True
|
595
|
+
# Reset the start time to wait for the service to be registered.
|
596
|
+
start_time = time.time()
|
597
|
+
|
476
598
|
record = serve_state.get_service_from_name(service_name)
|
477
599
|
if record is not None:
|
478
600
|
if job_id != record['controller_job_id']:
|
@@ -480,12 +602,11 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
|
|
480
602
|
raise ValueError(
|
481
603
|
f'The service {service_name!r} is already running. '
|
482
604
|
'Please specify a different name for your service. '
|
483
|
-
'To update an existing service, run:
|
484
|
-
'
|
485
|
-
'be supported in the future).')
|
605
|
+
'To update an existing service, run: sky serve update '
|
606
|
+
f'{service_name} <new-service-yaml>')
|
486
607
|
lb_port = record['load_balancer_port']
|
487
608
|
if lb_port is not None:
|
488
|
-
return
|
609
|
+
return message_utils.encode_payload(lb_port)
|
489
610
|
elif len(serve_state.get_services()) >= NUM_SERVICE_THRESHOLD:
|
490
611
|
with ux_utils.print_exception_no_traceback():
|
491
612
|
raise RuntimeError('Max number of services reached. '
|
@@ -508,7 +629,7 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
|
|
508
629
|
|
509
630
|
|
510
631
|
def load_service_initialization_result(payload: str) -> int:
|
511
|
-
return
|
632
|
+
return message_utils.decode_payload(payload)
|
512
633
|
|
513
634
|
|
514
635
|
def check_service_status_healthy(service_name: str) -> Optional[str]:
|
@@ -539,16 +660,27 @@ def get_latest_version_with_min_replicas(
|
|
539
660
|
return active_versions[-1] if active_versions else None
|
540
661
|
|
541
662
|
|
542
|
-
def
|
543
|
-
|
544
|
-
|
545
|
-
|
546
|
-
|
547
|
-
|
548
|
-
|
549
|
-
|
550
|
-
|
551
|
-
|
663
|
+
def _follow_logs_with_provision_expanding(
|
664
|
+
file: TextIO,
|
665
|
+
cluster_name: str,
|
666
|
+
*,
|
667
|
+
should_stop: Callable[[], bool],
|
668
|
+
stop_on_eof: bool = False,
|
669
|
+
idle_timeout_seconds: Optional[int] = None,
|
670
|
+
) -> Iterator[str]:
|
671
|
+
"""Follows logs and expands any provision.log references found.
|
672
|
+
|
673
|
+
Args:
|
674
|
+
file: Log file to read from.
|
675
|
+
cluster_name: Name of the cluster being launched.
|
676
|
+
should_stop: Callback that returns True when streaming should stop.
|
677
|
+
stop_on_eof: If True, stop when reaching end of file.
|
678
|
+
idle_timeout_seconds: If set, stop after these many seconds without
|
679
|
+
new content.
|
680
|
+
|
681
|
+
Yields:
|
682
|
+
Log lines, including expanded content from referenced provision logs.
|
683
|
+
"""
|
552
684
|
|
553
685
|
def cluster_is_up() -> bool:
|
554
686
|
cluster_record = global_user_state.get_cluster_from_name(cluster_name)
|
@@ -556,51 +688,51 @@ def _follow_replica_logs(
|
|
556
688
|
return False
|
557
689
|
return cluster_record['status'] == status_lib.ClusterStatus.UP
|
558
690
|
|
559
|
-
|
560
|
-
|
561
|
-
|
562
|
-
|
563
|
-
|
564
|
-
|
565
|
-
|
566
|
-
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
571
|
-
|
572
|
-
#
|
573
|
-
#
|
574
|
-
#
|
575
|
-
|
576
|
-
|
577
|
-
|
578
|
-
|
579
|
-
|
580
|
-
|
581
|
-
|
582
|
-
|
583
|
-
|
584
|
-
|
585
|
-
|
586
|
-
|
587
|
-
|
588
|
-
|
589
|
-
|
590
|
-
|
591
|
-
|
592
|
-
|
593
|
-
|
594
|
-
|
595
|
-
|
596
|
-
|
597
|
-
|
598
|
-
|
599
|
-
|
600
|
-
|
601
|
-
|
602
|
-
|
603
|
-
|
691
|
+
def process_line(line: str) -> Iterator[str]:
|
692
|
+
# The line might be directing users to view logs, like
|
693
|
+
# `✓ Cluster launched: new-http. View logs at: *.log`
|
694
|
+
# We should tail the detailed logs for user.
|
695
|
+
provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
|
696
|
+
log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
|
697
|
+
|
698
|
+
if provision_log_prompt is not None:
|
699
|
+
nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
|
700
|
+
|
701
|
+
try:
|
702
|
+
with open(nested_log_path, 'r', newline='',
|
703
|
+
encoding='utf-8') as f:
|
704
|
+
# We still exit if more than 10 seconds without new content
|
705
|
+
# to avoid any internal bug that causes the launch to fail
|
706
|
+
# while cluster status remains INIT.
|
707
|
+
yield from log_utils.follow_logs(f,
|
708
|
+
should_stop=cluster_is_up,
|
709
|
+
stop_on_eof=stop_on_eof,
|
710
|
+
idle_timeout_seconds=10)
|
711
|
+
except FileNotFoundError:
|
712
|
+
yield line
|
713
|
+
|
714
|
+
yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
|
715
|
+
f'Try to expand log file {nested_log_path} but not '
|
716
|
+
f'found. Skipping...{colorama.Style.RESET_ALL}')
|
717
|
+
pass
|
718
|
+
return
|
719
|
+
|
720
|
+
if log_prompt is not None:
|
721
|
+
# Now we skip other logs (file sync logs) since we lack
|
722
|
+
# utility to determine when these log files are finished
|
723
|
+
# writing.
|
724
|
+
# TODO(tian): We should not skip these logs since there are
|
725
|
+
# small chance that error will happen in file sync. Need to
|
726
|
+
# find a better way to do this.
|
727
|
+
return
|
728
|
+
|
729
|
+
yield line
|
730
|
+
|
731
|
+
return log_utils.follow_logs(file,
|
732
|
+
should_stop=should_stop,
|
733
|
+
stop_on_eof=stop_on_eof,
|
734
|
+
process_line=process_line,
|
735
|
+
idle_timeout_seconds=idle_timeout_seconds)
|
604
736
|
|
605
737
|
|
606
738
|
def stream_replica_logs(service_name: str, replica_id: int,
|
@@ -631,17 +763,21 @@ def stream_replica_logs(service_name: str, replica_id: int,
|
|
631
763
|
for info in replica_info:
|
632
764
|
if info.replica_id == replica_id:
|
633
765
|
return info.status
|
634
|
-
|
635
|
-
|
766
|
+
with ux_utils.print_exception_no_traceback():
|
767
|
+
raise ValueError(
|
768
|
+
_FAILED_TO_FIND_REPLICA_MSG.format(replica_id=replica_id))
|
636
769
|
|
637
|
-
|
770
|
+
replica_provisioned = (
|
638
771
|
lambda: _get_replica_status() != serve_state.ReplicaStatus.PROVISIONING)
|
639
772
|
with open(launch_log_file_name, 'r', newline='', encoding='utf-8') as f:
|
640
|
-
for line in
|
641
|
-
|
642
|
-
|
643
|
-
|
773
|
+
for line in _follow_logs_with_provision_expanding(
|
774
|
+
f,
|
775
|
+
replica_cluster_name,
|
776
|
+
should_stop=replica_provisioned,
|
777
|
+
stop_on_eof=not follow,
|
778
|
+
):
|
644
779
|
print(line, end='', flush=True)
|
780
|
+
|
645
781
|
if (not follow and
|
646
782
|
_get_replica_status() == serve_state.ReplicaStatus.PROVISIONING):
|
647
783
|
# Early exit if not following the logs.
|
@@ -666,22 +802,6 @@ def stream_replica_logs(service_name: str, replica_id: int,
|
|
666
802
|
return ''
|
667
803
|
|
668
804
|
|
669
|
-
def _follow_logs(file: TextIO, *, finish_stream: Callable[[], bool],
|
670
|
-
exit_if_stream_end: bool) -> Iterator[str]:
|
671
|
-
line = ''
|
672
|
-
while True:
|
673
|
-
tmp = file.readline()
|
674
|
-
if tmp is not None and tmp != '':
|
675
|
-
line += tmp
|
676
|
-
if '\n' in line or '\r' in line:
|
677
|
-
yield line
|
678
|
-
line = ''
|
679
|
-
else:
|
680
|
-
if exit_if_stream_end or finish_stream():
|
681
|
-
break
|
682
|
-
time.sleep(1)
|
683
|
-
|
684
|
-
|
685
805
|
def stream_serve_process_logs(service_name: str, stream_controller: bool,
|
686
806
|
follow: bool) -> str:
|
687
807
|
msg = check_service_status_healthy(service_name)
|
@@ -700,9 +820,11 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool,
|
|
700
820
|
|
701
821
|
with open(os.path.expanduser(log_file), 'r', newline='',
|
702
822
|
encoding='utf-8') as f:
|
703
|
-
for line in
|
704
|
-
|
705
|
-
|
823
|
+
for line in log_utils.follow_logs(
|
824
|
+
f,
|
825
|
+
should_stop=_service_is_terminal,
|
826
|
+
stop_on_eof=not follow,
|
827
|
+
):
|
706
828
|
print(line, end='', flush=True)
|
707
829
|
return ''
|
708
830
|
|
@@ -721,28 +843,6 @@ def _get_replicas(service_record: Dict[str, Any]) -> str:
|
|
721
843
|
return f'{ready_replica_num}/{total_replica_num}'
|
722
844
|
|
723
845
|
|
724
|
-
def get_endpoint(service_record: Dict[str, Any]) -> str:
|
725
|
-
# Don't use backend_utils.is_controller_up since it is too slow.
|
726
|
-
handle = global_user_state.get_handle_from_cluster_name(
|
727
|
-
SKY_SERVE_CONTROLLER_NAME)
|
728
|
-
assert isinstance(handle, backends.CloudVmRayResourceHandle)
|
729
|
-
if handle is None:
|
730
|
-
return '-'
|
731
|
-
load_balancer_port = service_record['load_balancer_port']
|
732
|
-
if load_balancer_port is None:
|
733
|
-
return '-'
|
734
|
-
try:
|
735
|
-
endpoint = backend_utils.get_endpoints(handle.cluster_name,
|
736
|
-
load_balancer_port).get(
|
737
|
-
load_balancer_port, None)
|
738
|
-
except exceptions.ClusterNotUpError:
|
739
|
-
return '-'
|
740
|
-
if endpoint is None:
|
741
|
-
return '-'
|
742
|
-
assert isinstance(endpoint, str), endpoint
|
743
|
-
return endpoint
|
744
|
-
|
745
|
-
|
746
846
|
def format_service_table(service_records: List[Dict[str, Any]],
|
747
847
|
show_all: bool) -> str:
|
748
848
|
if not service_records:
|
@@ -752,10 +852,12 @@ def format_service_table(service_records: List[Dict[str, Any]],
|
|
752
852
|
'NAME', 'VERSION', 'UPTIME', 'STATUS', 'REPLICAS', 'ENDPOINT'
|
753
853
|
]
|
754
854
|
if show_all:
|
755
|
-
service_columns.extend([
|
855
|
+
service_columns.extend([
|
856
|
+
'AUTOSCALING_POLICY', 'LOAD_BALANCING_POLICY', 'REQUESTED_RESOURCES'
|
857
|
+
])
|
756
858
|
service_table = log_utils.create_table(service_columns)
|
757
859
|
|
758
|
-
replica_infos = []
|
860
|
+
replica_infos: List[Dict[str, Any]] = []
|
759
861
|
for record in service_records:
|
760
862
|
for replica in record['replica_info']:
|
761
863
|
replica['service_name'] = record['name']
|
@@ -770,14 +872,12 @@ def format_service_table(service_records: List[Dict[str, Any]],
|
|
770
872
|
service_status = record['status']
|
771
873
|
status_str = service_status.colored_str()
|
772
874
|
replicas = _get_replicas(record)
|
773
|
-
endpoint =
|
875
|
+
endpoint = record['endpoint']
|
876
|
+
if endpoint is None:
|
877
|
+
endpoint = '-'
|
774
878
|
policy = record['policy']
|
775
|
-
|
776
|
-
|
777
|
-
if record.get('requested_resources_str') is None:
|
778
|
-
requested_resources_str = str(record['requested_resources'])
|
779
|
-
else:
|
780
|
-
requested_resources_str = record['requested_resources_str']
|
879
|
+
requested_resources_str = record['requested_resources_str']
|
880
|
+
load_balancing_policy = record['load_balancing_policy']
|
781
881
|
|
782
882
|
service_values = [
|
783
883
|
service_name,
|
@@ -788,7 +888,8 @@ def format_service_table(service_records: List[Dict[str, Any]],
|
|
788
888
|
endpoint,
|
789
889
|
]
|
790
890
|
if show_all:
|
791
|
-
service_values.extend(
|
891
|
+
service_values.extend(
|
892
|
+
[policy, load_balancing_policy, requested_resources_str])
|
792
893
|
service_table.add_row(service_values)
|
793
894
|
|
794
895
|
replica_table = _format_replica_table(replica_infos, show_all)
|
@@ -830,7 +931,8 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
|
|
830
931
|
region = '-'
|
831
932
|
zone = '-'
|
832
933
|
|
833
|
-
replica_handle: 'backends.CloudVmRayResourceHandle' = record[
|
934
|
+
replica_handle: Optional['backends.CloudVmRayResourceHandle'] = record[
|
935
|
+
'handle']
|
834
936
|
if replica_handle is not None:
|
835
937
|
resources_str = resources_utils.get_readable_resources_repr(
|
836
938
|
replica_handle, simplify=not show_all)
|
@@ -902,6 +1004,18 @@ class ServeCodeGen:
|
|
902
1004
|
]
|
903
1005
|
return cls._build(code)
|
904
1006
|
|
1007
|
+
@classmethod
|
1008
|
+
def terminate_replica(cls, service_name: str, replica_id: int,
|
1009
|
+
purge: bool) -> str:
|
1010
|
+
code = [
|
1011
|
+
f'(lambda: print(serve_utils.terminate_replica({service_name!r}, '
|
1012
|
+
f'{replica_id}, {purge}), end="", flush=True) '
|
1013
|
+
'if getattr(constants, "SERVE_VERSION", 0) >= 2 else '
|
1014
|
+
f'exec("raise RuntimeError('
|
1015
|
+
f'{constants.TERMINATE_REPLICA_VERSION_MISMATCH_ERROR!r})"))()'
|
1016
|
+
]
|
1017
|
+
return cls._build(code)
|
1018
|
+
|
905
1019
|
@classmethod
|
906
1020
|
def wait_service_registration(cls, service_name: str, job_id: int) -> str:
|
907
1021
|
code = [
|
@@ -933,21 +1047,18 @@ class ServeCodeGen:
|
|
933
1047
|
def _build(cls, code: List[str]) -> str:
|
934
1048
|
code = cls._PREFIX + code
|
935
1049
|
generated_code = '; '.join(code)
|
936
|
-
|
1050
|
+
# Use the local user id to make sure the operation goes to the correct
|
1051
|
+
# user.
|
1052
|
+
return (f'export {skylet_constants.USER_ID_ENV_VAR}='
|
1053
|
+
f'"{common_utils.get_user_hash()}"; '
|
1054
|
+
f'{skylet_constants.SKY_PYTHON_CMD} '
|
937
1055
|
f'-u -c {shlex.quote(generated_code)}')
|
938
1056
|
|
939
1057
|
@classmethod
|
940
1058
|
def update_service(cls, service_name: str, version: int, mode: str) -> str:
|
941
1059
|
code = [
|
942
|
-
# Backward compatibility for old serve version on the remote
|
943
|
-
# machine. The `mode` argument was added in #3249, and if the remote
|
944
|
-
# machine has an old SkyPilot version before that, we need to avoid
|
945
|
-
# passing the `mode` argument to the job_lib functions.
|
946
|
-
# TODO(zhwu): Remove this in 0.7.0 release.
|
947
|
-
f'mode_kwargs = {{"mode": {mode!r}}} '
|
948
|
-
'if getattr(constants, "SERVE_VERSION", 0) >= 1 else {}',
|
949
1060
|
f'msg = serve_utils.update_service_encoded({service_name!r}, '
|
950
|
-
f'{version},
|
1061
|
+
f'{version}, mode={mode!r})',
|
951
1062
|
'print(msg, end="", flush=True)',
|
952
1063
|
]
|
953
1064
|
return cls._build(code)
|