skypilot-nightly 1.0.0.dev20250417__py3-none-any.whl → 1.0.0.dev20250422__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/aws.py +2 -13
  3. sky/backends/backend_utils.py +28 -0
  4. sky/backends/wheel_utils.py +9 -0
  5. sky/cli.py +93 -24
  6. sky/client/cli.py +93 -24
  7. sky/client/common.py +10 -3
  8. sky/client/sdk.py +6 -3
  9. sky/clouds/aws.py +5 -5
  10. sky/clouds/service_catalog/data_fetchers/fetch_gcp.py +9 -9
  11. sky/dashboard/out/404.html +1 -0
  12. sky/dashboard/out/_next/static/2GsKhI8XKYj9B2969iIDf/_buildManifest.js +1 -0
  13. sky/dashboard/out/_next/static/2GsKhI8XKYj9B2969iIDf/_ssgManifest.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/236-d437cf66e68a6f64.js +6 -0
  15. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +15 -0
  16. sky/dashboard/out/_next/static/chunks/37-72fdc8f71d6e4784.js +6 -0
  17. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +59 -0
  18. sky/dashboard/out/_next/static/chunks/845-2ea1cc63ba1f4067.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/979-7cd0778078b9cfad.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +33 -0
  22. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/pages/_app-3001e84c61acddfb.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-b09f7fbf6d5d74f6.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b57ec043f09c5813.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ef2e0e91a9222cac.js +1 -0
  31. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +1 -0
  34. sky/dashboard/out/_next/static/css/f3538cd90cfca88c.css +3 -0
  35. sky/dashboard/out/clusters/[cluster]/[job].html +1 -0
  36. sky/dashboard/out/clusters/[cluster].html +1 -0
  37. sky/dashboard/out/clusters.html +1 -0
  38. sky/dashboard/out/favicon.ico +0 -0
  39. sky/dashboard/out/index.html +1 -0
  40. sky/dashboard/out/jobs/[job].html +1 -0
  41. sky/dashboard/out/jobs.html +1 -0
  42. sky/dashboard/out/skypilot.svg +15 -0
  43. sky/dashboard/out/videos/cursor-small.mp4 +0 -0
  44. sky/data/data_transfer.py +2 -1
  45. sky/data/storage.py +24 -14
  46. sky/optimizer.py +7 -9
  47. sky/provision/provisioner.py +2 -1
  48. sky/provision/runpod/utils.py +32 -6
  49. sky/resources.py +11 -2
  50. sky/serve/__init__.py +2 -0
  51. sky/serve/autoscalers.py +6 -2
  52. sky/serve/client/sdk.py +61 -0
  53. sky/serve/replica_managers.py +6 -8
  54. sky/serve/serve_utils.py +33 -1
  55. sky/serve/server/core.py +187 -5
  56. sky/serve/server/server.py +28 -0
  57. sky/server/common.py +19 -1
  58. sky/server/constants.py +6 -0
  59. sky/server/requests/executor.py +4 -0
  60. sky/server/requests/payloads.py +27 -15
  61. sky/server/server.py +43 -0
  62. sky/setup_files/MANIFEST.in +1 -0
  63. sky/sky_logging.py +10 -0
  64. sky/skypilot_config.py +58 -37
  65. sky/templates/kubernetes-ray.yml.j2 +6 -2
  66. sky/utils/config_utils.py +0 -1
  67. sky/utils/controller_utils.py +0 -1
  68. {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/METADATA +1 -1
  69. {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/RECORD +73 -40
  70. {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/WHEEL +1 -1
  71. {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/entry_points.txt +0 -0
  72. {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/licenses/LICENSE +0 -0
  73. {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250422.dist-info}/top_level.txt +0 -0
sky/provision/provisioner.py CHANGED
@@ -670,6 +670,7 @@ def post_provision_runtime_setup(
670
670
  ux_utils.error_message(
671
671
  'Failed to set up SkyPilot runtime on cluster.',
672
672
  provision_logging.config.log_path))
673
- logger.debug(f'Stacktrace:\n{traceback.format_exc()}')
673
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
674
+ logger.debug(f'Stacktrace:\n{traceback.format_exc()}')
674
675
  with ux_utils.print_exception_no_traceback():
675
676
  raise
sky/provision/runpod/utils.py CHANGED
@@ -14,11 +14,19 @@ from sky.utils import common_utils
14
14
  logger = sky_logging.init_logger(__name__)
15
15
 
16
16
  GPU_NAME_MAP = {
17
+ # AMD
18
+ 'MI300X': 'AMD Instinct MI300X OAM',
19
+
20
+ # NVIDIA A-series
17
21
  'A100-80GB': 'NVIDIA A100 80GB PCIe',
18
- 'A100-40GB': 'NVIDIA A100-PCIE-40GB',
19
22
  'A100-80GB-SXM': 'NVIDIA A100-SXM4-80GB',
20
23
  'A30': 'NVIDIA A30',
21
24
  'A40': 'NVIDIA A40',
25
+
26
+ # NVIDIA B-series
27
+ 'B200': 'NVIDIA B200',
28
+
29
+ # GeForce
22
30
  'RTX3070': 'NVIDIA GeForce RTX 3070',
23
31
  'RTX3080': 'NVIDIA GeForce RTX 3080',
24
32
  'RTX3080Ti': 'NVIDIA GeForce RTX 3080 Ti',
@@ -26,25 +34,43 @@ GPU_NAME_MAP = {
26
34
  'RTX3090Ti': 'NVIDIA GeForce RTX 3090 Ti',
27
35
  'RTX4070Ti': 'NVIDIA GeForce RTX 4070 Ti',
28
36
  'RTX4080': 'NVIDIA GeForce RTX 4080',
37
+ 'RTX4080SUPER': 'NVIDIA GeForce RTX 4080 SUPER',
29
38
  'RTX4090': 'NVIDIA GeForce RTX 4090',
39
+ 'RTX5080': 'NVIDIA GeForce RTX 5080',
40
+ 'RTX5090': 'NVIDIA GeForce RTX 5090',
41
+
42
+ # NVIDIA H100/H200
30
43
  # Following instance is displayed as SXM at the console
31
44
  # but the ID from the API appears as HBM
32
45
  'H100-SXM': 'NVIDIA H100 80GB HBM3',
46
+ 'H100-NVL': 'NVIDIA H100 NVL',
33
47
  'H100': 'NVIDIA H100 PCIe',
48
+ 'H200-SXM': 'NVIDIA H200',
49
+
50
+ # NVIDIA L-series
34
51
  'L4': 'NVIDIA L4',
35
52
  'L40': 'NVIDIA L40',
36
- 'RTX4000-Ada-SFF': 'NVIDIA RTX 4000 SFF Ada Generation',
53
+ 'L40S': 'NVIDIA L40S',
54
+
55
+ # Ada generation (GeForce & RTX A)
56
+ 'RTX2000-Ada': 'NVIDIA RTX 2000 Ada Generation',
37
57
  'RTX4000-Ada': 'NVIDIA RTX 4000 Ada Generation',
58
+ 'RTX4000-Ada-SFF': 'NVIDIA RTX 4000 SFF Ada Generation',
59
+ 'RTX5000-Ada': 'NVIDIA RTX 5000 Ada Generation',
38
60
  'RTX6000-Ada': 'NVIDIA RTX 6000 Ada Generation',
61
+
62
+ # NVIDIA RTX A-series
63
+ 'RTXA2000': 'NVIDIA RTX A2000',
39
64
  'RTXA4000': 'NVIDIA RTX A4000',
40
65
  'RTXA4500': 'NVIDIA RTX A4500',
41
66
  'RTXA5000': 'NVIDIA RTX A5000',
42
67
  'RTXA6000': 'NVIDIA RTX A6000',
43
- 'RTX5000': 'Quadro RTX 5000',
68
+
69
+ # Tesla V100 variants
44
70
  'V100-16GB-FHHL': 'Tesla V100-FHHL-16GB',
45
- 'V100-16GB-SXM2': 'V100-SXM2-16GB',
46
- 'RTXA2000': 'NVIDIA RTX A2000',
47
- 'V100-16GB-PCIe': 'Tesla V100-PCIE-16GB'
71
+ 'V100-16GB-SXM2': 'Tesla V100-SXM2-16GB',
72
+ 'V100-32GB-SXM2': 'Tesla V100-SXM2-32GB',
73
+ 'V100-16GB-PCIe': 'Tesla V100-PCIE-16GB',
48
74
  }
49
75
 
50
76
 
sky/resources.py CHANGED
@@ -1370,7 +1370,8 @@ class Resources:
1370
1370
  return features
1371
1371
 
1372
1372
  @staticmethod
1373
- def apply_resource_config_aliases(config: Optional[Dict[str, Any]]) -> None:
1373
+ def _apply_resource_config_aliases(
1374
+ config: Optional[Dict[str, Any]]) -> None:
1374
1375
  """Mutatively applies overriding aliases to the passed in config.
1375
1376
 
1376
1377
  Note: Nested aliases are not supported.
@@ -1399,7 +1400,15 @@ class Resources:
1399
1400
  if config is None:
1400
1401
  return {Resources()}
1401
1402
 
1402
- Resources.apply_resource_config_aliases(config)
1403
+ Resources._apply_resource_config_aliases(config)
1404
+ anyof = config.get('any_of')
1405
+ if anyof is not None and isinstance(anyof, list):
1406
+ for anyof_config in anyof:
1407
+ Resources._apply_resource_config_aliases(anyof_config)
1408
+ ordered = config.get('ordered')
1409
+ if ordered is not None and isinstance(ordered, list):
1410
+ for ordered_config in ordered:
1411
+ Resources._apply_resource_config_aliases(ordered_config)
1403
1412
  common_utils.validate_schema(config, schemas.get_resources_schema(),
1404
1413
  'Invalid resources YAML: ')
1405
1414
 
sky/serve/__init__.py CHANGED
@@ -3,6 +3,7 @@ import os
3
3
 
4
4
  from sky.serve.client.sdk import down
5
5
  from sky.serve.client.sdk import status
6
+ from sky.serve.client.sdk import sync_down_logs
6
7
  from sky.serve.client.sdk import tail_logs
7
8
  from sky.serve.client.sdk import terminate_replica
8
9
  from sky.serve.client.sdk import up
@@ -37,6 +38,7 @@ __all__ = [
37
38
  'LB_POLICIES',
38
39
  'ReplicaStatus',
39
40
  'ServiceComponent',
41
+ 'sync_down_logs',
40
42
  'ServiceStatus',
41
43
  'ServeCodeGen',
42
44
  'SkyServiceSpec',
sky/serve/autoscalers.py CHANGED
@@ -676,8 +676,12 @@ class FallbackRequestRateAutoscaler(RequestRateAutoscaler):
676
676
  # because the provisioning spot can fail to UP due to the capacity
677
677
  # issue, and on-demand should fill the gap between the required
678
678
  # number of spot and ready spot.
679
- num_ondemand_to_provision += (num_spot_to_provision -
680
- num_ready_spot)
679
+ # When scaling down spot instances, it is possible that the number
680
+ # of ready spot is more than the number of spot to provision, thus
681
+ # generate a negative number. In this case, we don't need to
682
+ # provision on-demand instances.
683
+ num_ondemand_to_provision += max(
684
+ 0, num_spot_to_provision - num_ready_spot)
681
685
 
682
686
  # Make sure we don't launch on-demand fallback for
683
687
  # overprovisioned replicas.
sky/serve/client/sdk.py CHANGED
@@ -374,3 +374,64 @@ def tail_logs(service_name: str,
374
374
  )
375
375
  request_id = server_common.get_request_id(response)
376
376
  sdk.stream_response(request_id, response, output_stream)
377
+
378
+
379
+ @usage_lib.entrypoint
380
+ @server_common.check_server_healthy_or_start
381
+ def sync_down_logs(service_name: str,
382
+ local_dir: str,
383
+ *,
384
+ targets: Optional[Union[
385
+ str, 'serve_utils.ServiceComponent',
386
+ List[Union[str,
387
+ 'serve_utils.ServiceComponent']]]] = None,
388
+ replica_ids: Optional[List[int]] = None) -> None:
389
+ """Sync down logs from the service components to a local directory.
390
+
391
+ This function syncs logs from the specified service components (controller,
392
+ load balancer, replicas) via the API server to a specified local directory.
393
+
394
+ Args:
395
+ service_name: The name of the service to download logs from.
396
+ targets: Which component(s) to download logs for. If None or empty,
397
+ means download all logs (controller, load-balancer, all replicas).
398
+ Can be a string (e.g. "controller"), or a `ServiceComponent` object,
399
+ or a list of them for multiple components. Currently accepted
400
+ values:
401
+ - "controller"/ServiceComponent.CONTROLLER
402
+ - "load_balancer"/ServiceComponent.LOAD_BALANCER
403
+ - "replica"/ServiceComponent.REPLICA
404
+ replica_ids: The list of replica IDs to download logs from, specified
405
+ when target includes `ServiceComponent.REPLICA`. If target includes
406
+ `ServiceComponent.REPLICA` but this is None/empty, logs for all
407
+ replicas will be downloaded.
408
+ local_dir: Local directory to sync down logs to. Defaults to
409
+ `~/sky_logs`.
410
+
411
+ Raises:
412
+ RuntimeError: If fails to gather logs or fails to rsync from the
413
+ controller.
414
+ sky.exceptions.ClusterNotUpError: If the controller is not up.
415
+ ValueError: Arguments not valid.
416
+ """
417
+ # Avoid circular import.
418
+ from sky.client import sdk # pylint: disable=import-outside-toplevel
419
+
420
+ body = payloads.ServeDownloadLogsBody(
421
+ service_name=service_name,
422
+ # No need to set here, since the server will override it
423
+ # to a directory on the API server.
424
+ local_dir=local_dir,
425
+ targets=targets,
426
+ replica_ids=replica_ids,
427
+ )
428
+ response = requests.post(
429
+ f'{server_common.get_server_url()}/serve/sync-down-logs',
430
+ json=json.loads(body.model_dump_json()),
431
+ timeout=(5, None),
432
+ )
433
+ remote_dir = sdk.stream_and_get(server_common.get_request_id(response))
434
+
435
+ # Download from API server paths to the client's local_dir
436
+ client_common.download_logs_from_api_server([remote_dir], remote_dir,
437
+ local_dir)
sky/serve/replica_managers.py CHANGED
@@ -257,14 +257,6 @@ class ReplicaStatusProperty:
257
257
  # is set to True and it can fail immediately due to spot availability.
258
258
  failed_spot_availability: bool = False
259
259
 
260
- def remove_terminated_replica(self) -> bool:
261
- """Whether to remove the replica record from the replica table.
262
-
263
- If not, the replica will stay in the replica table permanently to
264
- notify the user that something is wrong with the user code / setup.
265
- """
266
- return self.is_scale_down
267
-
268
260
  def unrecoverable_failure(self) -> bool:
269
261
  """Whether the replica fails and cannot be recovered.
270
262
 
@@ -730,6 +722,12 @@ class SkyPilotReplicaManager(ReplicaManager):
730
722
  replica_drain_delay_seconds: int,
731
723
  is_scale_down: bool = False,
732
724
  purge: bool = False) -> None:
725
+ left_in_record = not (is_scale_down or purge)
726
+ if left_in_record:
727
+ assert sync_down_logs, (
728
+ 'For the replica left in the record, '
729
+ 'the logs should always be synced down. '
730
+ 'So that the user can see the logs to debug.')
733
731
 
734
732
  if replica_id in self._launch_process_pool:
735
733
  info = serve_state.get_replica_info_from_id(self._service_name,
sky/serve/serve_utils.py CHANGED
@@ -13,7 +13,7 @@ import threading
13
13
  import time
14
14
  import typing
15
15
  from typing import (Any, Callable, DefaultDict, Dict, Generic, Iterator, List,
16
- Optional, TextIO, Type, TypeVar)
16
+ Optional, TextIO, Type, TypeVar, Union)
17
17
  import uuid
18
18
 
19
19
  import colorama
@@ -81,6 +81,38 @@ class ServiceComponent(enum.Enum):
81
81
  REPLICA = 'replica'
82
82
 
83
83
 
84
+ @dataclasses.dataclass
85
+ class ServiceComponentTarget:
86
+ """Represents a target service component with an optional replica ID.
87
+ """
88
+ component: ServiceComponent
89
+ replica_id: Optional[int] = None
90
+
91
+ def __init__(self,
92
+ component: Union[str, ServiceComponent],
93
+ replica_id: Optional[int] = None):
94
+ if isinstance(component, str):
95
+ component = ServiceComponent(component)
96
+ self.component = component
97
+ self.replica_id = replica_id
98
+
99
+ def __post_init__(self):
100
+ """Validate that replica_id is only provided for REPLICA component."""
101
+ if (self.component
102
+ == ServiceComponent.REPLICA) != (self.replica_id is None):
103
+ raise ValueError(
104
+ 'replica_id must be specified if and only if component is '
105
+ 'REPLICA.')
106
+
107
+ def __hash__(self) -> int:
108
+ return hash((self.component, self.replica_id))
109
+
110
+ def __str__(self) -> str:
111
+ if self.component == ServiceComponent.REPLICA:
112
+ return f'{self.component.value}-{self.replica_id}'
113
+ return self.component.value
114
+
115
+
84
116
  class UserSignal(enum.Enum):
85
117
  """User signal to send to controller.
86
118
 
sky/serve/server/core.py CHANGED
@@ -1,9 +1,11 @@
1
1
  """SkyServe core APIs."""
2
+ import pathlib
2
3
  import re
3
4
  import signal
4
5
  import tempfile
5
6
  import threading
6
- from typing import Any, Dict, List, Optional, Tuple, Union
7
+ import typing
8
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
7
9
 
8
10
  import colorama
9
11
 
@@ -29,6 +31,9 @@ from sky.utils import rich_utils
29
31
  from sky.utils import subprocess_utils
30
32
  from sky.utils import ux_utils
31
33
 
34
+ if typing.TYPE_CHECKING:
35
+ from sky.backends import cloud_vm_ray_backend
36
+
32
37
  logger = sky_logging.init_logger(__name__)
33
38
 
34
39
 
@@ -64,6 +69,41 @@ def _rewrite_tls_credential_paths_and_get_tls_env_vars(
64
69
  return tls_template_vars
65
70
 
66
71
 
72
+ def _get_all_replica_targets(
73
+ service_name: str, backend: backends.CloudVmRayBackend,
74
+ handle: backends.CloudVmRayResourceHandle
75
+ ) -> Set[serve_utils.ServiceComponentTarget]:
76
+ """Helper function to get targets for all live replicas."""
77
+ code = serve_utils.ServeCodeGen.get_service_status([service_name])
78
+ returncode, serve_status_payload, stderr = backend.run_on_head(
79
+ handle,
80
+ code,
81
+ require_outputs=True,
82
+ stream_logs=False,
83
+ separate_stderr=True)
84
+
85
+ try:
86
+ subprocess_utils.handle_returncode(returncode,
87
+ code,
88
+ 'Failed to fetch services',
89
+ stderr,
90
+ stream_logs=True)
91
+ except exceptions.CommandError as e:
92
+ raise RuntimeError(e.error_msg) from e
93
+
94
+ service_records = serve_utils.load_service_status(serve_status_payload)
95
+ if not service_records:
96
+ raise ValueError(f'Service {service_name!r} not found.')
97
+ assert len(service_records) == 1
98
+ service_record = service_records[0]
99
+
100
+ return {
101
+ serve_utils.ServiceComponentTarget(serve_utils.ServiceComponent.REPLICA,
102
+ replica_info['replica_id'])
103
+ for replica_info in service_record['replica_info']
104
+ }
105
+
106
+
67
107
  @usage_lib.entrypoint
68
108
  def up(
69
109
  task: 'sky.Task',
@@ -685,11 +725,14 @@ def status(
685
725
  return service_records
686
726
 
687
727
 
728
+ ServiceComponentOrStr = Union[str, serve_utils.ServiceComponent]
729
+
730
+
688
731
  @usage_lib.entrypoint
689
732
  def tail_logs(
690
733
  service_name: str,
691
734
  *,
692
- target: Union[str, serve_utils.ServiceComponent],
735
+ target: ServiceComponentOrStr,
693
736
  replica_id: Optional[int] = None,
694
737
  follow: bool = True,
695
738
  ) -> None:
@@ -743,10 +786,11 @@ def tail_logs(
743
786
  with ux_utils.print_exception_no_traceback():
744
787
  raise ValueError('`replica_id` must be None when using '
745
788
  'target=CONTROLLER/LOAD_BALANCER.')
789
+
790
+ controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
746
791
  handle = backend_utils.is_controller_accessible(
747
- controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
748
- stopped_message=(controller_utils.Controllers.SKY_SERVE_CONTROLLER.
749
- value.default_hint_if_non_existent))
792
+ controller=controller_type,
793
+ stopped_message=controller_type.value.default_hint_if_non_existent)
750
794
 
751
795
  backend = backend_utils.get_backend_from_handle(handle)
752
796
  assert isinstance(backend, backends.CloudVmRayBackend), backend
@@ -775,3 +819,141 @@ def tail_logs(
775
819
  stream_logs=True,
776
820
  process_stream=False,
777
821
  ssh_mode=command_runner.SshMode.INTERACTIVE)
822
+
823
+
824
+ @usage_lib.entrypoint
825
+ def sync_down_logs(
826
+ service_name: str,
827
+ *,
828
+ local_dir: str,
829
+ targets: Union[ServiceComponentOrStr, List[ServiceComponentOrStr],
830
+ None] = None,
831
+ replica_ids: Optional[List[int]] = None,
832
+ ) -> str:
833
+ """Sync down logs from the controller for the given service.
834
+
835
+ This function is called by the server endpoint. It gathers logs from the
836
+ controller, load balancer, and/or replicas and places them in a directory
837
+ under the user's log space on the API server filesystem.
838
+
839
+ Args:
840
+ service_name: The name of the service to download logs from.
841
+ local_dir: The local directory to save the logs to.
842
+ targets: Which component(s) to download logs for. If None or empty,
843
+ means download all logs (controller, load-balancer, all replicas).
844
+ Can be a string (e.g. "controller"), or a `ServiceComponent` object,
845
+ or a list of them for multiple components. Currently accepted
846
+ values:
847
+ - "controller"/ServiceComponent.CONTROLLER
848
+ - "load_balancer"/ServiceComponent.LOAD_BALANCER
849
+ - "replica"/ServiceComponent.REPLICA
850
+ replica_ids: The list of replica IDs to download logs from, specified
851
+ when target includes `ServiceComponent.REPLICA`. If target includes
852
+ `ServiceComponent.REPLICA` but this is None/empty, logs for all
853
+ replicas will be downloaded.
854
+
855
+ Returns:
856
+ A dict mapping component names to local paths where the logs were synced
857
+ down to.
858
+
859
+ Raises:
860
+ RuntimeError: If fails to gather logs or fails to rsync from the
861
+ controller.
862
+ sky.exceptions.ClusterNotUpError: If the controller is not up.
863
+ ValueError: Arguments not valid.
864
+ """
865
+ # Step 0) get the controller handle
866
+ with rich_utils.safe_status(
867
+ ux_utils.spinner_message('Checking service status...')):
868
+ controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
869
+ handle = backend_utils.is_controller_accessible(
870
+ controller=controller_type,
871
+ stopped_message=controller_type.value.default_hint_if_non_existent)
872
+ backend: backends.CloudVmRayBackend = (
873
+ backend_utils.get_backend_from_handle(handle))
874
+
875
+ requested_components: Set[serve_utils.ServiceComponent] = set()
876
+ if not targets:
877
+ # No targets specified -> request all components
878
+ requested_components = {
879
+ serve_utils.ServiceComponent.CONTROLLER,
880
+ serve_utils.ServiceComponent.LOAD_BALANCER,
881
+ serve_utils.ServiceComponent.REPLICA
882
+ }
883
+ else:
884
+ # Parse provided targets
885
+ if isinstance(targets, (str, serve_utils.ServiceComponent)):
886
+ requested_components = {serve_utils.ServiceComponent(targets)}
887
+ else: # list
888
+ requested_components = {
889
+ serve_utils.ServiceComponent(t) for t in targets
890
+ }
891
+
892
+ normalized_targets: Set[serve_utils.ServiceComponentTarget] = set()
893
+ if serve_utils.ServiceComponent.CONTROLLER in requested_components:
894
+ normalized_targets.add(
895
+ serve_utils.ServiceComponentTarget(
896
+ serve_utils.ServiceComponent.CONTROLLER))
897
+ if serve_utils.ServiceComponent.LOAD_BALANCER in requested_components:
898
+ normalized_targets.add(
899
+ serve_utils.ServiceComponentTarget(
900
+ serve_utils.ServiceComponent.LOAD_BALANCER))
901
+ if serve_utils.ServiceComponent.REPLICA in requested_components:
902
+ with rich_utils.safe_status(
903
+ ux_utils.spinner_message('Getting live replica infos...')):
904
+ replica_targets = _get_all_replica_targets(service_name, backend,
905
+ handle)
906
+ if not replica_ids:
907
+ # Replica target requested but no specific IDs
908
+ # -> Get all replica logs
909
+ normalized_targets.update(replica_targets)
910
+ else:
911
+ # Replica target requested with specific IDs
912
+ requested_replica_targets = [
913
+ serve_utils.ServiceComponentTarget(
914
+ serve_utils.ServiceComponent.REPLICA, rid)
915
+ for rid in replica_ids
916
+ ]
917
+ for target in requested_replica_targets:
918
+ if target not in replica_targets:
919
+ logger.warning(f'Replica ID {target.replica_id} not found '
920
+ f'for {service_name}. Skipping...')
921
+ else:
922
+ normalized_targets.add(target)
923
+
924
+ def sync_down_logs_by_target(target: serve_utils.ServiceComponentTarget):
925
+ component = target.component
926
+ # We need to set one side of the pipe to a logs stream, and the other
927
+ # side to a file.
928
+ log_path = str(pathlib.Path(local_dir) / f'{target}.log')
929
+ stream_logs_code: str
930
+
931
+ if component == serve_utils.ServiceComponent.CONTROLLER:
932
+ stream_logs_code = (
933
+ serve_utils.ServeCodeGen.stream_serve_process_logs(
934
+ service_name, stream_controller=True, follow=False))
935
+ elif component == serve_utils.ServiceComponent.LOAD_BALANCER:
936
+ stream_logs_code = (
937
+ serve_utils.ServeCodeGen.stream_serve_process_logs(
938
+ service_name, stream_controller=False, follow=False))
939
+ elif component == serve_utils.ServiceComponent.REPLICA:
940
+ replica_id = target.replica_id
941
+ assert replica_id is not None, service_name
942
+ stream_logs_code = serve_utils.ServeCodeGen.stream_replica_logs(
943
+ service_name, replica_id, follow=False)
944
+ else:
945
+ assert False, component
946
+
947
+ # Refer to the notes in
948
+ # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
949
+ backend.run_on_head(handle,
950
+ stream_logs_code,
951
+ stream_logs=False,
952
+ process_stream=False,
953
+ ssh_mode=command_runner.SshMode.INTERACTIVE,
954
+ log_path=log_path)
955
+
956
+ subprocess_utils.run_in_parallel(sync_down_logs_by_target,
957
+ list(normalized_targets))
958
+
959
+ return local_dir
sky/serve/server/server.py CHANGED
@@ -1,13 +1,17 @@
1
1
  """Rest APIs for SkyServe."""
2
2
 
3
+ import pathlib
4
+
3
5
  import fastapi
4
6
 
5
7
  from sky import sky_logging
6
8
  from sky.serve.server import core
9
+ from sky.server import common as server_common
7
10
  from sky.server import stream_utils
8
11
  from sky.server.requests import executor
9
12
  from sky.server.requests import payloads
10
13
  from sky.server.requests import requests as api_requests
14
+ from sky.skylet import constants
11
15
  from sky.utils import common
12
16
 
13
17
  logger = sky_logging.init_logger(__name__)
@@ -110,3 +114,27 @@ async def tail_logs(
110
114
  logs_path=request_task.log_path,
111
115
  background_tasks=background_tasks,
112
116
  )
117
+
118
+
119
+ @router.post('/sync-down-logs')
120
+ async def download_logs(
121
+ request: fastapi.Request,
122
+ download_logs_body: payloads.ServeDownloadLogsBody,
123
+ ) -> None:
124
+ user_hash = download_logs_body.env_vars[constants.USER_ID_ENV_VAR]
125
+ timestamp = sky_logging.get_run_timestamp()
126
+ logs_dir_on_api_server = (
127
+ pathlib.Path(server_common.api_server_user_logs_dir_prefix(user_hash)) /
128
+ 'service' / f'{download_logs_body.service_name}_{timestamp}')
129
+ logs_dir_on_api_server.mkdir(parents=True, exist_ok=True)
130
+ # We should reuse the original request body, so that the env vars, such as
131
+ # user hash, are kept the same.
132
+ download_logs_body.local_dir = str(logs_dir_on_api_server)
133
+ executor.schedule_request(
134
+ request_id=request.state.request_id,
135
+ request_name='serve.sync_down_logs',
136
+ request_body=download_logs_body,
137
+ func=core.sync_down_logs,
138
+ schedule_type=api_requests.ScheduleType.SHORT,
139
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
140
+ )
sky/server/common.py CHANGED
@@ -293,7 +293,25 @@ def _start_api_server(deploy: bool = False,
293
293
  time.sleep(0.5)
294
294
  else:
295
295
  break
296
- logger.info(ux_utils.finishing_message('SkyPilot API server started.'))
296
+
297
+ dashboard_msg = (f'Dashboard: {get_server_url(host)}/dashboard')
298
+ api_server_info = get_api_server_status(get_server_url(host))
299
+ if api_server_info.version == _DEV_VERSION:
300
+ dashboard_msg += (
301
+ f'\n{colorama.Style.RESET_ALL}{ux_utils.INDENT_SYMBOL}'
302
+ f'{colorama.Fore.YELLOW}')
303
+ if not os.path.isdir(server_constants.DASHBOARD_DIR):
304
+ dashboard_msg += (
305
+ 'Dashboard is not built, '
306
+ 'to build: npm --prefix sky/dashboard run build')
307
+ else:
308
+ dashboard_msg += (
309
+ 'Dashboard may be stale when installed from source, '
310
+ 'to rebuild: npm --prefix sky/dashboard run build')
311
+ dashboard_msg += f'{colorama.Style.RESET_ALL}'
312
+ logger.info(
313
+ ux_utils.finishing_message(
314
+ f'SkyPilot API server started. {dashboard_msg}'))
297
315
 
298
316
 
299
317
  def check_server_healthy(endpoint: Optional[str] = None,) -> None:
sky/server/constants.py CHANGED
@@ -1,5 +1,7 @@
1
1
  """Constants for the API servers."""
2
2
 
3
+ import os
4
+
3
5
  from sky.skylet import constants
4
6
 
5
7
  # API server version, whenever there is a change in API server that requires a
@@ -24,3 +26,7 @@ CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60
24
26
 
25
27
  # Environment variable for a file path to the API cookie file.
26
28
  API_COOKIE_FILE_ENV_VAR = f'{constants.SKYPILOT_ENV_VAR_PREFIX}API_COOKIE_FILE'
29
+
30
+ # The path to the dashboard build output
31
+ DASHBOARD_DIR = os.path.join(os.path.dirname(__file__), '..', 'dashboard',
32
+ 'out')
sky/server/requests/executor.py CHANGED
@@ -358,6 +358,10 @@ def _request_execution_wrapper(request_id: str,
358
358
  # captured in the log file.
359
359
  try:
360
360
  with override_request_env_and_config(request_body):
361
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
362
+ config = skypilot_config.to_dict()
363
+ logger.debug(f'request config: \n'
364
+ f'{common_utils.dump_yaml_str(dict(config))}')
361
365
  return_value = func(**request_body.to_kwargs())
362
366
  f.flush()
363
367
  except KeyboardInterrupt: