skypilot-nightly 1.0.0.dev20250417__py3-none-any.whl → 1.0.0.dev20250422__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/aws.py +2 -13
- sky/backends/backend_utils.py +28 -0
- sky/backends/wheel_utils.py +9 -0
- sky/cli.py +93 -24
- sky/client/cli.py +93 -24
- sky/client/common.py +10 -3
- sky/client/sdk.py +6 -3
- sky/clouds/aws.py +5 -5
- sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +9 -9
- sky/dashboard/out/404.html +1 -0
- sky/dashboard/out/_next/static/2GsKhI8XKYj9B2969iIDf/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/2GsKhI8XKYj9B2969iIDf/_ssgManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/236-d437cf66e68a6f64.js +6 -0
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +15 -0
- sky/dashboard/out/_next/static/chunks/37-72fdc8f71d6e4784.js +6 -0
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +59 -0
- sky/dashboard/out/_next/static/chunks/845-2ea1cc63ba1f4067.js +1 -0
- sky/dashboard/out/_next/static/chunks/979-7cd0778078b9cfad.js +1 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-3001e84c61acddfb.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-b09f7fbf6d5d74f6.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b57ec043f09c5813.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ef2e0e91a9222cac.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +1 -0
- sky/dashboard/out/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +1 -0
- sky/dashboard/out/_next/static/css/f3538cd90cfca88c.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -0
- sky/dashboard/out/clusters/[cluster].html +1 -0
- sky/dashboard/out/clusters.html +1 -0
- sky/dashboard/out/favicon.ico +0 -0
- sky/dashboard/out/index.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -0
- sky/dashboard/out/jobs.html +1 -0
- sky/dashboard/out/skypilot.svg +15 -0
- sky/dashboard/out/videos/cursor-small.mp4 +0 -0
- sky/data/data_transfer.py +2 -1
- sky/data/storage.py +24 -14
- sky/optimizer.py +7 -9
- sky/provision/provisioner.py +2 -1
- sky/provision/runpod/utils.py +32 -6
- sky/resources.py +11 -2
- sky/serve/__init__.py +2 -0
- sky/serve/autoscalers.py +6 -2
- sky/serve/client/sdk.py +61 -0
- sky/serve/replica_managers.py +6 -8
- sky/serve/serve_utils.py +33 -1
- sky/serve/server/core.py +187 -5
- sky/serve/server/server.py +28 -0
- sky/server/common.py +19 -1
- sky/server/constants.py +6 -0
- sky/server/requests/executor.py +4 -0
- sky/server/requests/payloads.py +27 -15
- sky/server/server.py +43 -0
- sky/setup_files/MANIFEST.in +1 -0
- sky/sky_logging.py +10 -0
- sky/skypilot_config.py +58 -37
- sky/templates/kubernetes-ray.yml.j2 +6 -2
- sky/utils/config_utils.py +0 -1
- sky/utils/controller_utils.py +0 -1
- {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/RECORD +73 -40
- {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/top_level.txt +0 -0
sky/provision/provisioner.py
CHANGED
@@ -670,6 +670,7 @@ def post_provision_runtime_setup(
                 ux_utils.error_message(
                     'Failed to set up SkyPilot runtime on cluster.',
                     provision_logging.config.log_path))
-
+        if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+            logger.debug(f'Stacktrace:\n{traceback.format_exc()}')
         with ux_utils.print_exception_no_traceback():
             raise
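The change above formats the stacktrace only when debug logging is actually enabled, instead of always paying for `traceback.format_exc()`. A minimal standalone sketch of the same pattern, using the standard `logging` module in place of SkyPilot's `sky_logging` helpers:

    import logging
    import traceback

    logger = logging.getLogger(__name__)

    try:
        raise RuntimeError('runtime setup failed')
    except RuntimeError:
        logger.error('Failed to set up runtime on cluster.')
        # Build the stacktrace string only if DEBUG records will be emitted.
        if logger.isEnabledFor(logging.DEBUG):
            logger.debug('Stacktrace:\n%s', traceback.format_exc())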
sky/provision/runpod/utils.py
CHANGED
@@ -14,11 +14,19 @@ from sky.utils import common_utils
 logger = sky_logging.init_logger(__name__)

 GPU_NAME_MAP = {
+    # AMD
+    'MI300X': 'AMD Instinct MI300X OAM',
+
+    # NVIDIA A-series
     'A100-80GB': 'NVIDIA A100 80GB PCIe',
-    'A100-40GB': 'NVIDIA A100-PCIE-40GB',
     'A100-80GB-SXM': 'NVIDIA A100-SXM4-80GB',
     'A30': 'NVIDIA A30',
     'A40': 'NVIDIA A40',
+
+    # NVIDIA B-series
+    'B200': 'NVIDIA B200',
+
+    # GeForce
     'RTX3070': 'NVIDIA GeForce RTX 3070',
     'RTX3080': 'NVIDIA GeForce RTX 3080',
     'RTX3080Ti': 'NVIDIA GeForce RTX 3080 Ti',
@@ -26,25 +34,43 @@ GPU_NAME_MAP = {
     'RTX3090Ti': 'NVIDIA GeForce RTX 3090 Ti',
     'RTX4070Ti': 'NVIDIA GeForce RTX 4070 Ti',
     'RTX4080': 'NVIDIA GeForce RTX 4080',
+    'RTX4080SUPER': 'NVIDIA GeForce RTX 4080 SUPER',
     'RTX4090': 'NVIDIA GeForce RTX 4090',
+    'RTX5080': 'NVIDIA GeForce RTX 5080',
+    'RTX5090': 'NVIDIA GeForce RTX 5090',
+
+    # NVIDIA H100/H200
     # Following instance is displayed as SXM at the console
     # but the ID from the API appears as HBM
     'H100-SXM': 'NVIDIA H100 80GB HBM3',
+    'H100-NVL': 'NVIDIA H100 NVL',
     'H100': 'NVIDIA H100 PCIe',
+    'H200-SXM': 'NVIDIA H200',
+
+    # NVIDIA L-series
     'L4': 'NVIDIA L4',
     'L40': 'NVIDIA L40',
-    '
+    'L40S': 'NVIDIA L40S',
+
+    # Ada generation (GeForce & RTX A)
+    'RTX2000-Ada': 'NVIDIA RTX 2000 Ada Generation',
     'RTX4000-Ada': 'NVIDIA RTX 4000 Ada Generation',
+    'RTX4000-Ada-SFF': 'NVIDIA RTX 4000 SFF Ada Generation',
+    'RTX5000-Ada': 'NVIDIA RTX 5000 Ada Generation',
     'RTX6000-Ada': 'NVIDIA RTX 6000 Ada Generation',
+
+    # NVIDIA RTX A-series
+    'RTXA2000': 'NVIDIA RTX A2000',
     'RTXA4000': 'NVIDIA RTX A4000',
     'RTXA4500': 'NVIDIA RTX A4500',
     'RTXA5000': 'NVIDIA RTX A5000',
     'RTXA6000': 'NVIDIA RTX A6000',
-
+
+    # Tesla V100 variants
     'V100-16GB-FHHL': 'Tesla V100-FHHL-16GB',
-    'V100-16GB-SXM2': 'V100-SXM2-16GB',
-    '
-    'V100-16GB-PCIe': 'Tesla V100-PCIE-16GB'
+    'V100-16GB-SXM2': 'Tesla V100-SXM2-16GB',
+    'V100-32GB-SXM2': 'Tesla V100-SXM2-32GB',
+    'V100-16GB-PCIe': 'Tesla V100-PCIE-16GB',
 }

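The map above translates SkyPilot's short accelerator names (the dict keys) into the display names RunPod's API reports. A hedged sketch of the lookup direction, with a hypothetical `to_runpod_display_name` helper (not part of this diff) and a two-entry excerpt of the map:

    GPU_NAME_MAP = {
        'H200-SXM': 'NVIDIA H200',
        'L40S': 'NVIDIA L40S',
    }

    def to_runpod_display_name(accelerator: str) -> str:
        # Hypothetical helper for illustration only.
        try:
            return GPU_NAME_MAP[accelerator]
        except KeyError:
            raise ValueError(f'Unsupported accelerator: {accelerator}') from None

    assert to_runpod_display_name('H200-SXM') == 'NVIDIA H200'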
sky/resources.py
CHANGED
@@ -1370,7 +1370,8 @@ class Resources:
         return features

     @staticmethod
-    def
+    def _apply_resource_config_aliases(
+            config: Optional[Dict[str, Any]]) -> None:
         """Mutatively applies overriding aliases to the passed in config.

         Note: Nested aliases are not supported.
@@ -1399,7 +1400,15 @@
         if config is None:
             return {Resources()}

-        Resources.
+        Resources._apply_resource_config_aliases(config)
+        anyof = config.get('any_of')
+        if anyof is not None and isinstance(anyof, list):
+            for anyof_config in anyof:
+                Resources._apply_resource_config_aliases(anyof_config)
+        ordered = config.get('ordered')
+        if ordered is not None and isinstance(ordered, list):
+            for ordered_config in ordered:
+                Resources._apply_resource_config_aliases(ordered_config)
         common_utils.validate_schema(config, schemas.get_resources_schema(),
                                      'Invalid resources YAML: ')

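`_apply_resource_config_aliases` is now applied to every candidate inside `any_of` and `ordered` lists as well as to the top-level config; consistent with the "Nested aliases are not supported" note, only that single level of candidates is walked. A self-contained sketch of the idea, using an illustrative alias table (an assumption, not SkyPilot's real one):

    from typing import Any, Dict, Optional

    _ALIASES = {'gpus': 'accelerators'}  # illustrative alias table

    def apply_aliases(config: Optional[Dict[str, Any]]) -> None:
        """Mutatively rewrite alias keys to their canonical names."""
        if not config:
            return
        for alias, canonical in _ALIASES.items():
            if alias in config:
                config[canonical] = config.pop(alias)

    def apply_aliases_with_candidates(config: Optional[Dict[str, Any]]) -> None:
        if not config:
            return
        apply_aliases(config)
        # One level deep: each any_of/ordered candidate, but nothing nested.
        for key in ('any_of', 'ordered'):
            candidates = config.get(key)
            if isinstance(candidates, list):
                for candidate in candidates:
                    apply_aliases(candidate)

    cfg = {'any_of': [{'gpus': 'A100:1'}, {'gpus': 'L4:1'}]}
    apply_aliases_with_candidates(cfg)
    assert cfg == {'any_of': [{'accelerators': 'A100:1'},
                              {'accelerators': 'L4:1'}]}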
sky/serve/__init__.py
CHANGED
@@ -3,6 +3,7 @@ import os

 from sky.serve.client.sdk import down
 from sky.serve.client.sdk import status
+from sky.serve.client.sdk import sync_down_logs
 from sky.serve.client.sdk import tail_logs
 from sky.serve.client.sdk import terminate_replica
 from sky.serve.client.sdk import up
@@ -37,6 +38,7 @@ __all__ = [
     'LB_POLICIES',
     'ReplicaStatus',
     'ServiceComponent',
+    'sync_down_logs',
     'ServiceStatus',
     'ServeCodeGen',
     'SkyServiceSpec',
sky/serve/autoscalers.py
CHANGED
@@ -676,8 +676,12 @@ class FallbackRequestRateAutoscaler(RequestRateAutoscaler):
         # because the provisioning spot can fail to UP due to the capacity
         # issue, and on-demand should fill the gap between the required
         # number of spot and ready spot.
-
-
+        # When scaling down spot instances, it is possible that the number
+        # of ready spot is more than the number of spot to provision, thus
+        # generate a negative number. In this case, we don't need to
+        # provision on-demand instances.
+        num_ondemand_to_provision += max(
+            0, num_spot_to_provision - num_ready_spot)

         # Make sure we don't launch on-demand fallback for
         # overprovisioned replicas.
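The `max(0, ...)` clamp matters during spot scale-down, when more spot replicas can be ready than are now requested. A small worked check of the corrected arithmetic:

    def ondemand_fallback(num_spot_to_provision: int, num_ready_spot: int) -> int:
        # Fill only a positive gap between requested and ready spot replicas.
        return max(0, num_spot_to_provision - num_ready_spot)

    assert ondemand_fallback(5, 3) == 2  # two spot replicas missing
    assert ondemand_fallback(2, 3) == 0  # scale-down: no negative fallback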
sky/serve/client/sdk.py
CHANGED
@@ -374,3 +374,64 @@ def tail_logs(service_name: str,
     )
     request_id = server_common.get_request_id(response)
     sdk.stream_response(request_id, response, output_stream)
+
+
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+def sync_down_logs(service_name: str,
+                   local_dir: str,
+                   *,
+                   targets: Optional[Union[
+                       str, 'serve_utils.ServiceComponent',
+                       List[Union[str,
+                                  'serve_utils.ServiceComponent']]]] = None,
+                   replica_ids: Optional[List[int]] = None) -> None:
+    """Sync down logs from the service components to a local directory.
+
+    This function syncs logs from the specified service components (controller,
+    load balancer, replicas) via the API server to a specified local directory.
+
+    Args:
+        service_name: The name of the service to download logs from.
+        targets: Which component(s) to download logs for. If None or empty,
+            means download all logs (controller, load-balancer, all replicas).
+            Can be a string (e.g. "controller"), or a `ServiceComponent` object,
+            or a list of them for multiple components. Currently accepted
+            values:
+            - "controller"/ServiceComponent.CONTROLLER
+            - "load_balancer"/ServiceComponent.LOAD_BALANCER
+            - "replica"/ServiceComponent.REPLICA
+        replica_ids: The list of replica IDs to download logs from, specified
+            when target includes `ServiceComponent.REPLICA`. If target includes
+            `ServiceComponent.REPLICA` but this is None/empty, logs for all
+            replicas will be downloaded.
+        local_dir: Local directory to sync down logs to. Defaults to
+            `~/sky_logs`.
+
+    Raises:
+        RuntimeError: If fails to gather logs or fails to rsync from the
+            controller.
+        sky.exceptions.ClusterNotUpError: If the controller is not up.
+        ValueError: Arguments not valid.
+    """
+    # Avoid circular import.
+    from sky.client import sdk  # pylint: disable=import-outside-toplevel
+
+    body = payloads.ServeDownloadLogsBody(
+        service_name=service_name,
+        # No need to set here, since the server will override it
+        # to a directory on the API server.
+        local_dir=local_dir,
+        targets=targets,
+        replica_ids=replica_ids,
+    )
+    response = requests.post(
+        f'{server_common.get_server_url()}/serve/sync-down-logs',
+        json=json.loads(body.model_dump_json()),
+        timeout=(5, None),
+    )
+    remote_dir = sdk.stream_and_get(server_common.get_request_id(response))
+
+    # Download from API server paths to the client's local_dir
+    client_common.download_logs_from_api_server([remote_dir], remote_dir,
+                                                local_dir)
sky/serve/replica_managers.py
CHANGED
@@ -257,14 +257,6 @@ class ReplicaStatusProperty:
     # is set to True and it can fail immediately due to spot availability.
     failed_spot_availability: bool = False

-    def remove_terminated_replica(self) -> bool:
-        """Whether to remove the replica record from the replica table.
-
-        If not, the replica will stay in the replica table permanently to
-        notify the user that something is wrong with the user code / setup.
-        """
-        return self.is_scale_down
-
     def unrecoverable_failure(self) -> bool:
         """Whether the replica fails and cannot be recovered.

@@ -730,6 +722,12 @@ class SkyPilotReplicaManager(ReplicaManager):
                          replica_drain_delay_seconds: int,
                          is_scale_down: bool = False,
                          purge: bool = False) -> None:
+        left_in_record = not (is_scale_down or purge)
+        if left_in_record:
+            assert sync_down_logs, (
+                'For the replica left in the record, '
+                'the logs should always be synced down. '
+                'So that the user can see the logs to debug.')

         if replica_id in self._launch_process_pool:
             info = serve_state.get_replica_info_from_id(self._service_name,
sky/serve/serve_utils.py
CHANGED
@@ -13,7 +13,7 @@ import threading
 import time
 import typing
 from typing import (Any, Callable, DefaultDict, Dict, Generic, Iterator, List,
-                    Optional, TextIO, Type, TypeVar)
+                    Optional, TextIO, Type, TypeVar, Union)
 import uuid

 import colorama
@@ -81,6 +81,38 @@ class ServiceComponent(enum.Enum):
     REPLICA = 'replica'


+@dataclasses.dataclass
+class ServiceComponentTarget:
+    """Represents a target service component with an optional replica ID.
+    """
+    component: ServiceComponent
+    replica_id: Optional[int] = None
+
+    def __init__(self,
+                 component: Union[str, ServiceComponent],
+                 replica_id: Optional[int] = None):
+        if isinstance(component, str):
+            component = ServiceComponent(component)
+        self.component = component
+        self.replica_id = replica_id
+
+    def __post_init__(self):
+        """Validate that replica_id is only provided for REPLICA component."""
+        if (self.component
+                == ServiceComponent.REPLICA) != (self.replica_id is None):
+            raise ValueError(
+                'replica_id must be specified if and only if component is '
+                'REPLICA.')
+
+    def __hash__(self) -> int:
+        return hash((self.component, self.replica_id))
+
+    def __str__(self) -> str:
+        if self.component == ServiceComponent.REPLICA:
+            return f'{self.component.value}-{self.replica_id}'
+        return self.component.value
+
+
 class UserSignal(enum.Enum):
     """User signal to send to controller.

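`ServiceComponentTarget` gives each log source a hashable identity plus a filename-friendly string form (`controller`, `load_balancer`, `replica-3`), which the sync-down code below relies on for set deduplication and per-target log names. A standalone sketch of that behavior with a minimal stand-in class:

    import enum
    from typing import Optional, Union

    class ServiceComponent(enum.Enum):
        CONTROLLER = 'controller'
        REPLICA = 'replica'

    class Target:
        """Minimal stand-in for serve_utils.ServiceComponentTarget."""

        def __init__(self, component: Union[str, ServiceComponent],
                     replica_id: Optional[int] = None):
            if isinstance(component, str):
                component = ServiceComponent(component)
            self.component = component
            self.replica_id = replica_id

        def __eq__(self, other: object) -> bool:
            return (isinstance(other, Target) and
                    (self.component, self.replica_id) ==
                    (other.component, other.replica_id))

        def __hash__(self) -> int:
            return hash((self.component, self.replica_id))

        def __str__(self) -> str:
            if self.component == ServiceComponent.REPLICA:
                return f'{self.component.value}-{self.replica_id}'
            return self.component.value

    targets = {Target('replica', 1), Target('replica', 1), Target('controller')}
    assert len(targets) == 2                         # duplicates collapse
    assert str(Target('replica', 3)) == 'replica-3'  # -> replica-3.log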
sky/serve/server/core.py
CHANGED
@@ -1,9 +1,11 @@
 """SkyServe core APIs."""
+import pathlib
 import re
 import signal
 import tempfile
 import threading
-
+import typing
+from typing import Any, Dict, List, Optional, Set, Tuple, Union

 import colorama

@@ -29,6 +31,9 @@ from sky.utils import rich_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils

+if typing.TYPE_CHECKING:
+    from sky.backends import cloud_vm_ray_backend
+
 logger = sky_logging.init_logger(__name__)

@@ -64,6 +69,41 @@ def _rewrite_tls_credential_paths_and_get_tls_env_vars(
     return tls_template_vars


+def _get_all_replica_targets(
+        service_name: str, backend: backends.CloudVmRayBackend,
+        handle: backends.CloudVmRayResourceHandle
+) -> Set[serve_utils.ServiceComponentTarget]:
+    """Helper function to get targets for all live replicas."""
+    code = serve_utils.ServeCodeGen.get_service_status([service_name])
+    returncode, serve_status_payload, stderr = backend.run_on_head(
+        handle,
+        code,
+        require_outputs=True,
+        stream_logs=False,
+        separate_stderr=True)
+
+    try:
+        subprocess_utils.handle_returncode(returncode,
+                                           code,
+                                           'Failed to fetch services',
+                                           stderr,
+                                           stream_logs=True)
+    except exceptions.CommandError as e:
+        raise RuntimeError(e.error_msg) from e
+
+    service_records = serve_utils.load_service_status(serve_status_payload)
+    if not service_records:
+        raise ValueError(f'Service {service_name!r} not found.')
+    assert len(service_records) == 1
+    service_record = service_records[0]
+
+    return {
+        serve_utils.ServiceComponentTarget(serve_utils.ServiceComponent.REPLICA,
+                                           replica_info['replica_id'])
+        for replica_info in service_record['replica_info']
+    }
+
+
 @usage_lib.entrypoint
 def up(
     task: 'sky.Task',
@@ -685,11 +725,14 @@ def status(
     return service_records


+ServiceComponentOrStr = Union[str, serve_utils.ServiceComponent]
+
+
 @usage_lib.entrypoint
 def tail_logs(
     service_name: str,
     *,
-    target:
+    target: ServiceComponentOrStr,
     replica_id: Optional[int] = None,
     follow: bool = True,
 ) -> None:
@@ -743,10 +786,11 @@ def tail_logs(
         with ux_utils.print_exception_no_traceback():
             raise ValueError('`replica_id` must be None when using '
                              'target=CONTROLLER/LOAD_BALANCER.')
+
+    controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
     handle = backend_utils.is_controller_accessible(
-        controller=
-        stopped_message=
-        value.default_hint_if_non_existent))
+        controller=controller_type,
+        stopped_message=controller_type.value.default_hint_if_non_existent)

     backend = backend_utils.get_backend_from_handle(handle)
     assert isinstance(backend, backends.CloudVmRayBackend), backend
@@ -775,3 +819,141 @@ def tail_logs(
         stream_logs=True,
         process_stream=False,
         ssh_mode=command_runner.SshMode.INTERACTIVE)
+
+
+@usage_lib.entrypoint
+def sync_down_logs(
+    service_name: str,
+    *,
+    local_dir: str,
+    targets: Union[ServiceComponentOrStr, List[ServiceComponentOrStr],
+                   None] = None,
+    replica_ids: Optional[List[int]] = None,
+) -> str:
+    """Sync down logs from the controller for the given service.
+
+    This function is called by the server endpoint. It gathers logs from the
+    controller, load balancer, and/or replicas and places them in a directory
+    under the user's log space on the API server filesystem.
+
+    Args:
+        service_name: The name of the service to download logs from.
+        local_dir: The local directory to save the logs to.
+        targets: Which component(s) to download logs for. If None or empty,
+            means download all logs (controller, load-balancer, all replicas).
+            Can be a string (e.g. "controller"), or a `ServiceComponent` object,
+            or a list of them for multiple components. Currently accepted
+            values:
+            - "controller"/ServiceComponent.CONTROLLER
+            - "load_balancer"/ServiceComponent.LOAD_BALANCER
+            - "replica"/ServiceComponent.REPLICA
+        replica_ids: The list of replica IDs to download logs from, specified
+            when target includes `ServiceComponent.REPLICA`. If target includes
+            `ServiceComponent.REPLICA` but this is None/empty, logs for all
+            replicas will be downloaded.
+
+    Returns:
+        A dict mapping component names to local paths where the logs were synced
+        down to.
+
+    Raises:
+        RuntimeError: If fails to gather logs or fails to rsync from the
+            controller.
+        sky.exceptions.ClusterNotUpError: If the controller is not up.
+        ValueError: Arguments not valid.
+    """
+    # Step 0) get the controller handle
+    with rich_utils.safe_status(
+            ux_utils.spinner_message('Checking service status...')):
+        controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
+        handle = backend_utils.is_controller_accessible(
+            controller=controller_type,
+            stopped_message=controller_type.value.default_hint_if_non_existent)
+    backend: backends.CloudVmRayBackend = (
+        backend_utils.get_backend_from_handle(handle))
+
+    requested_components: Set[serve_utils.ServiceComponent] = set()
+    if not targets:
+        # No targets specified -> request all components
+        requested_components = {
+            serve_utils.ServiceComponent.CONTROLLER,
+            serve_utils.ServiceComponent.LOAD_BALANCER,
+            serve_utils.ServiceComponent.REPLICA
+        }
+    else:
+        # Parse provided targets
+        if isinstance(targets, (str, serve_utils.ServiceComponent)):
+            requested_components = {serve_utils.ServiceComponent(targets)}
+        else:  # list
+            requested_components = {
+                serve_utils.ServiceComponent(t) for t in targets
+            }
+
+    normalized_targets: Set[serve_utils.ServiceComponentTarget] = set()
+    if serve_utils.ServiceComponent.CONTROLLER in requested_components:
+        normalized_targets.add(
+            serve_utils.ServiceComponentTarget(
+                serve_utils.ServiceComponent.CONTROLLER))
+    if serve_utils.ServiceComponent.LOAD_BALANCER in requested_components:
+        normalized_targets.add(
+            serve_utils.ServiceComponentTarget(
+                serve_utils.ServiceComponent.LOAD_BALANCER))
+    if serve_utils.ServiceComponent.REPLICA in requested_components:
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Getting live replica infos...')):
+            replica_targets = _get_all_replica_targets(service_name, backend,
+                                                       handle)
+        if not replica_ids:
+            # Replica target requested but no specific IDs
+            # -> Get all replica logs
+            normalized_targets.update(replica_targets)
+        else:
+            # Replica target requested with specific IDs
+            requested_replica_targets = [
+                serve_utils.ServiceComponentTarget(
+                    serve_utils.ServiceComponent.REPLICA, rid)
+                for rid in replica_ids
+            ]
+            for target in requested_replica_targets:
+                if target not in replica_targets:
+                    logger.warning(f'Replica ID {target.replica_id} not found '
+                                   f'for {service_name}. Skipping...')
+                else:
+                    normalized_targets.add(target)
+
+    def sync_down_logs_by_target(target: serve_utils.ServiceComponentTarget):
+        component = target.component
+        # We need to set one side of the pipe to a logs stream, and the other
+        # side to a file.
+        log_path = str(pathlib.Path(local_dir) / f'{target}.log')
+        stream_logs_code: str
+
+        if component == serve_utils.ServiceComponent.CONTROLLER:
+            stream_logs_code = (
+                serve_utils.ServeCodeGen.stream_serve_process_logs(
+                    service_name, stream_controller=True, follow=False))
+        elif component == serve_utils.ServiceComponent.LOAD_BALANCER:
+            stream_logs_code = (
+                serve_utils.ServeCodeGen.stream_serve_process_logs(
+                    service_name, stream_controller=False, follow=False))
+        elif component == serve_utils.ServiceComponent.REPLICA:
+            replica_id = target.replica_id
+            assert replica_id is not None, service_name
+            stream_logs_code = serve_utils.ServeCodeGen.stream_replica_logs(
+                service_name, replica_id, follow=False)
+        else:
+            assert False, component
+
+        # Refer to the notes in
+        # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
+        backend.run_on_head(handle,
+                            stream_logs_code,
+                            stream_logs=False,
+                            process_stream=False,
+                            ssh_mode=command_runner.SshMode.INTERACTIVE,
+                            log_path=log_path)

+    subprocess_utils.run_in_parallel(sync_down_logs_by_target,
+                                     list(normalized_targets))
+
+    return local_dir
sky/serve/server/server.py
CHANGED
@@ -1,13 +1,17 @@
 """Rest APIs for SkyServe."""

+import pathlib
+
 import fastapi

 from sky import sky_logging
 from sky.serve.server import core
+from sky.server import common as server_common
 from sky.server import stream_utils
 from sky.server.requests import executor
 from sky.server.requests import payloads
 from sky.server.requests import requests as api_requests
+from sky.skylet import constants
 from sky.utils import common

 logger = sky_logging.init_logger(__name__)
@@ -110,3 +114,27 @@ async def tail_logs(
         logs_path=request_task.log_path,
         background_tasks=background_tasks,
     )
+
+
+@router.post('/sync-down-logs')
+async def download_logs(
+    request: fastapi.Request,
+    download_logs_body: payloads.ServeDownloadLogsBody,
+) -> None:
+    user_hash = download_logs_body.env_vars[constants.USER_ID_ENV_VAR]
+    timestamp = sky_logging.get_run_timestamp()
+    logs_dir_on_api_server = (
+        pathlib.Path(server_common.api_server_user_logs_dir_prefix(user_hash)) /
+        'service' / f'{download_logs_body.service_name}_{timestamp}')
+    logs_dir_on_api_server.mkdir(parents=True, exist_ok=True)
+    # We should reuse the original request body, so that the env vars, such as
+    # user hash, are kept the same.
+    download_logs_body.local_dir = str(logs_dir_on_api_server)
+    executor.schedule_request(
+        request_id=request.state.request_id,
+        request_name='serve.sync_down_logs',
+        request_body=download_logs_body,
+        func=core.sync_down_logs,
+        schedule_type=api_requests.ScheduleType.SHORT,
+        request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
+    )
sky/server/common.py
CHANGED
@@ -293,7 +293,25 @@ def _start_api_server(deploy: bool = False,
                 time.sleep(0.5)
             else:
                 break
-
+
+        dashboard_msg = (f'Dashboard: {get_server_url(host)}/dashboard')
+        api_server_info = get_api_server_status(get_server_url(host))
+        if api_server_info.version == _DEV_VERSION:
+            dashboard_msg += (
+                f'\n{colorama.Style.RESET_ALL}{ux_utils.INDENT_SYMBOL}'
+                f'{colorama.Fore.YELLOW}')
+            if not os.path.isdir(server_constants.DASHBOARD_DIR):
+                dashboard_msg += (
+                    'Dashboard is not built, '
+                    'to build: npm --prefix sky/dashboard run build')
+            else:
+                dashboard_msg += (
+                    'Dashboard may be stale when installed from source, '
+                    'to rebuild: npm --prefix sky/dashboard run build')
+        dashboard_msg += f'{colorama.Style.RESET_ALL}'
+        logger.info(
+            ux_utils.finishing_message(
+                f'SkyPilot API server started. {dashboard_msg}'))


 def check_server_healthy(endpoint: Optional[str] = None,) -> None:
sky/server/constants.py
CHANGED
@@ -1,5 +1,7 @@
 """Constants for the API servers."""

+import os
+
 from sky.skylet import constants

 # API server version, whenever there is a change in API server that requires a
@@ -24,3 +26,7 @@ CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60

 # Environment variable for a file path to the API cookie file.
 API_COOKIE_FILE_ENV_VAR = f'{constants.SKYPILOT_ENV_VAR_PREFIX}API_COOKIE_FILE'
+
+# The path to the dashboard build output
+DASHBOARD_DIR = os.path.join(os.path.dirname(__file__), '..', 'dashboard',
+                             'out')
sky/server/requests/executor.py
CHANGED
@@ -358,6 +358,10 @@ def _request_execution_wrapper(request_id: str,
     # captured in the log file.
     try:
         with override_request_env_and_config(request_body):
+            if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+                config = skypilot_config.to_dict()
+                logger.debug(f'request config: \n'
+                             f'{common_utils.dump_yaml_str(dict(config))}')
             return_value = func(**request_body.to_kwargs())
             f.flush()
     except KeyboardInterrupt: