skypilot-nightly 1.0.0.dev20250417__py3-none-any.whl → 1.0.0.dev20250421__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +28 -0
  3. sky/backends/wheel_utils.py +9 -0
  4. sky/cli.py +92 -19
  5. sky/client/cli.py +92 -19
  6. sky/client/common.py +10 -3
  7. sky/client/sdk.py +6 -3
  8. sky/dashboard/out/404.html +1 -0
  9. sky/dashboard/out/_next/static/chunks/236-d437cf66e68a6f64.js +6 -0
  10. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +15 -0
  11. sky/dashboard/out/_next/static/chunks/37-72fdc8f71d6e4784.js +6 -0
  12. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +59 -0
  13. sky/dashboard/out/_next/static/chunks/845-2ea1cc63ba1f4067.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/979-7cd0778078b9cfad.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +33 -0
  17. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/pages/_app-3001e84c61acddfb.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-b09f7fbf6d5d74f6.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-b57ec043f09c5813.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ef2e0e91a9222cac.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js +1 -0
  28. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +1 -0
  29. sky/dashboard/out/_next/static/css/f3538cd90cfca88c.css +3 -0
  30. sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_buildManifest.js +1 -0
  31. sky/dashboard/out/_next/static/mS9YfLA5hhsJMeBj9W8J7/_ssgManifest.js +1 -0
  32. sky/dashboard/out/clusters/[cluster]/[job].html +1 -0
  33. sky/dashboard/out/clusters/[cluster].html +1 -0
  34. sky/dashboard/out/clusters.html +1 -0
  35. sky/dashboard/out/favicon.ico +0 -0
  36. sky/dashboard/out/index.html +1 -0
  37. sky/dashboard/out/jobs/[job].html +1 -0
  38. sky/dashboard/out/jobs.html +1 -0
  39. sky/dashboard/out/skypilot.svg +15 -0
  40. sky/dashboard/out/videos/cursor-small.mp4 +0 -0
  41. sky/data/data_transfer.py +2 -1
  42. sky/data/storage.py +24 -14
  43. sky/optimizer.py +7 -9
  44. sky/provision/provisioner.py +2 -1
  45. sky/resources.py +11 -2
  46. sky/serve/__init__.py +2 -0
  47. sky/serve/autoscalers.py +6 -2
  48. sky/serve/client/sdk.py +61 -0
  49. sky/serve/replica_managers.py +6 -8
  50. sky/serve/serve_utils.py +33 -1
  51. sky/serve/server/core.py +187 -5
  52. sky/serve/server/server.py +28 -0
  53. sky/server/common.py +19 -1
  54. sky/server/constants.py +6 -0
  55. sky/server/requests/executor.py +4 -0
  56. sky/server/requests/payloads.py +12 -15
  57. sky/server/server.py +43 -0
  58. sky/setup_files/MANIFEST.in +1 -0
  59. sky/sky_logging.py +10 -0
  60. sky/skypilot_config.py +51 -31
  61. sky/templates/kubernetes-ray.yml.j2 +6 -2
  62. sky/utils/config_utils.py +0 -1
  63. sky/utils/controller_utils.py +0 -1
  64. {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/METADATA +1 -1
  65. {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/RECORD +69 -36
  66. {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/WHEEL +1 -1
  67. {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/entry_points.txt +0 -0
  68. {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/licenses/LICENSE +0 -0
  69. {skypilot_nightly-1.0.0.dev20250417.dist-info → skypilot_nightly-1.0.0.dev20250421.dist-info}/top_level.txt +0 -0
sky/optimizer.py CHANGED
@@ -335,9 +335,6 @@ class Optimizer:
335
335
  orig_resources)
336
336
 
337
337
  for resources in launchable_list:
338
- if do_print:
339
- logger.debug(f'resources: {resources}')
340
-
341
338
  if minimize_cost:
342
339
  cost_per_node = resources.get_cost(estimated_runtime)
343
340
  num_available_reserved_nodes = (
@@ -355,13 +352,14 @@ class Optimizer:
355
352
  # Minimize run time.
356
353
  estimated_cost_or_time = estimated_runtime
357
354
  if do_print:
358
- logger.debug(
359
- ' estimated_runtime: {:.0f} s ({:.1f} hr)'.format(
360
- estimated_runtime, estimated_runtime / 3600))
355
+ debug_msg = (
356
+ f'resources: {resources}, '
357
+ f'estimated_runtime: {estimated_runtime} s '
358
+ f'({estimated_runtime / 3600:.1f} hr)')
361
359
  if minimize_cost:
362
- logger.debug(
363
- ' estimated_cost (not incl. egress): ${:.1f}'.
364
- format(estimated_cost_or_time))
360
+ debug_msg += (', estimated_cost: '
361
+ f'${estimated_cost_or_time:.1f}')
362
+ logger.debug(debug_msg)
365
363
  node_to_cost_map[node][resources] = estimated_cost_or_time
366
364
  if not node_to_cost_map[node]:
367
365
  source_hint = 'catalog'
sky/provision/provisioner.py CHANGED
@@ -670,6 +670,7 @@ def post_provision_runtime_setup(
670
670
  ux_utils.error_message(
671
671
  'Failed to set up SkyPilot runtime on cluster.',
672
672
  provision_logging.config.log_path))
673
- logger.debug(f'Stacktrace:\n{traceback.format_exc()}')
673
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
674
+ logger.debug(f'Stacktrace:\n{traceback.format_exc()}')
674
675
  with ux_utils.print_exception_no_traceback():
675
676
  raise
sky/resources.py CHANGED
@@ -1370,7 +1370,8 @@ class Resources:
1370
1370
  return features
1371
1371
 
1372
1372
  @staticmethod
1373
- def apply_resource_config_aliases(config: Optional[Dict[str, Any]]) -> None:
1373
+ def _apply_resource_config_aliases(
1374
+ config: Optional[Dict[str, Any]]) -> None:
1374
1375
  """Mutatively applies overriding aliases to the passed in config.
1375
1376
 
1376
1377
  Note: Nested aliases are not supported.
@@ -1399,7 +1400,15 @@ class Resources:
1399
1400
  if config is None:
1400
1401
  return {Resources()}
1401
1402
 
1402
- Resources.apply_resource_config_aliases(config)
1403
+ Resources._apply_resource_config_aliases(config)
1404
+ anyof = config.get('any_of')
1405
+ if anyof is not None and isinstance(anyof, list):
1406
+ for anyof_config in anyof:
1407
+ Resources._apply_resource_config_aliases(anyof_config)
1408
+ ordered = config.get('ordered')
1409
+ if ordered is not None and isinstance(ordered, list):
1410
+ for ordered_config in ordered:
1411
+ Resources._apply_resource_config_aliases(ordered_config)
1403
1412
  common_utils.validate_schema(config, schemas.get_resources_schema(),
1404
1413
  'Invalid resources YAML: ')
1405
1414
 
sky/serve/__init__.py CHANGED
@@ -3,6 +3,7 @@ import os
3
3
 
4
4
  from sky.serve.client.sdk import down
5
5
  from sky.serve.client.sdk import status
6
+ from sky.serve.client.sdk import sync_down_logs
6
7
  from sky.serve.client.sdk import tail_logs
7
8
  from sky.serve.client.sdk import terminate_replica
8
9
  from sky.serve.client.sdk import up
@@ -37,6 +38,7 @@ __all__ = [
37
38
  'LB_POLICIES',
38
39
  'ReplicaStatus',
39
40
  'ServiceComponent',
41
+ 'sync_down_logs',
40
42
  'ServiceStatus',
41
43
  'ServeCodeGen',
42
44
  'SkyServiceSpec',
sky/serve/autoscalers.py CHANGED
@@ -676,8 +676,12 @@ class FallbackRequestRateAutoscaler(RequestRateAutoscaler):
676
676
  # because the provisioning spot can fail to UP due to the capacity
677
677
  # issue, and on-demand should fill the gap between the required
678
678
  # number of spot and ready spot.
679
- num_ondemand_to_provision += (num_spot_to_provision -
680
- num_ready_spot)
679
+ # When scaling down spot instances, it is possible that the number
680
+ # of ready spot is more than the number of spot to provision, thus
681
+ # generate a negative number. In this case, we don't need to
682
+ # provision on-demand instances.
683
+ num_ondemand_to_provision += max(
684
+ 0, num_spot_to_provision - num_ready_spot)
681
685
 
682
686
  # Make sure we don't launch on-demand fallback for
683
687
  # overprovisioned replicas.
sky/serve/client/sdk.py CHANGED
@@ -374,3 +374,64 @@ def tail_logs(service_name: str,
374
374
  )
375
375
  request_id = server_common.get_request_id(response)
376
376
  sdk.stream_response(request_id, response, output_stream)
377
+
378
+
379
+ @usage_lib.entrypoint
380
+ @server_common.check_server_healthy_or_start
381
+ def sync_down_logs(service_name: str,
382
+ local_dir: str,
383
+ *,
384
+ targets: Optional[Union[
385
+ str, 'serve_utils.ServiceComponent',
386
+ List[Union[str,
387
+ 'serve_utils.ServiceComponent']]]] = None,
388
+ replica_ids: Optional[List[int]] = None) -> None:
389
+ """Sync down logs from the service components to a local directory.
390
+
391
+ This function syncs logs from the specified service components (controller,
392
+ load balancer, replicas) via the API server to a specified local directory.
393
+
394
+ Args:
395
+ service_name: The name of the service to download logs from.
396
+ targets: Which component(s) to download logs for. If None or empty,
397
+ means download all logs (controller, load-balancer, all replicas).
398
+ Can be a string (e.g. "controller"), or a `ServiceComponent` object,
399
+ or a list of them for multiple components. Currently accepted
400
+ values:
401
+ - "controller"/ServiceComponent.CONTROLLER
402
+ - "load_balancer"/ServiceComponent.LOAD_BALANCER
403
+ - "replica"/ServiceComponent.REPLICA
404
+ replica_ids: The list of replica IDs to download logs from, specified
405
+ when target includes `ServiceComponent.REPLICA`. If target includes
406
+ `ServiceComponent.REPLICA` but this is None/empty, logs for all
407
+ replicas will be downloaded.
408
+ local_dir: Local directory to sync down logs to. Defaults to
409
+ `~/sky_logs`.
410
+
411
+ Raises:
412
+ RuntimeError: If fails to gather logs or fails to rsync from the
413
+ controller.
414
+ sky.exceptions.ClusterNotUpError: If the controller is not up.
415
+ ValueError: Arguments not valid.
416
+ """
417
+ # Avoid circular import.
418
+ from sky.client import sdk # pylint: disable=import-outside-toplevel
419
+
420
+ body = payloads.ServeDownloadLogsBody(
421
+ service_name=service_name,
422
+ # No need to set here, since the server will override it
423
+ # to a directory on the API server.
424
+ local_dir=local_dir,
425
+ targets=targets,
426
+ replica_ids=replica_ids,
427
+ )
428
+ response = requests.post(
429
+ f'{server_common.get_server_url()}/serve/sync-down-logs',
430
+ json=json.loads(body.model_dump_json()),
431
+ timeout=(5, None),
432
+ )
433
+ remote_dir = sdk.stream_and_get(server_common.get_request_id(response))
434
+
435
+ # Download from API server paths to the client's local_dir
436
+ client_common.download_logs_from_api_server([remote_dir], remote_dir,
437
+ local_dir)
sky/serve/replica_managers.py CHANGED
@@ -257,14 +257,6 @@ class ReplicaStatusProperty:
257
257
  # is set to True and it can fail immediately due to spot availability.
258
258
  failed_spot_availability: bool = False
259
259
 
260
- def remove_terminated_replica(self) -> bool:
261
- """Whether to remove the replica record from the replica table.
262
-
263
- If not, the replica will stay in the replica table permanently to
264
- notify the user that something is wrong with the user code / setup.
265
- """
266
- return self.is_scale_down
267
-
268
260
  def unrecoverable_failure(self) -> bool:
269
261
  """Whether the replica fails and cannot be recovered.
270
262
 
@@ -730,6 +722,12 @@ class SkyPilotReplicaManager(ReplicaManager):
730
722
  replica_drain_delay_seconds: int,
731
723
  is_scale_down: bool = False,
732
724
  purge: bool = False) -> None:
725
+ left_in_record = not (is_scale_down or purge)
726
+ if left_in_record:
727
+ assert sync_down_logs, (
728
+ 'For the replica left in the record, '
729
+ 'the logs should always be synced down. '
730
+ 'So that the user can see the logs to debug.')
733
731
 
734
732
  if replica_id in self._launch_process_pool:
735
733
  info = serve_state.get_replica_info_from_id(self._service_name,
sky/serve/serve_utils.py CHANGED
@@ -13,7 +13,7 @@ import threading
13
13
  import time
14
14
  import typing
15
15
  from typing import (Any, Callable, DefaultDict, Dict, Generic, Iterator, List,
16
- Optional, TextIO, Type, TypeVar)
16
+ Optional, TextIO, Type, TypeVar, Union)
17
17
  import uuid
18
18
 
19
19
  import colorama
@@ -81,6 +81,38 @@ class ServiceComponent(enum.Enum):
81
81
  REPLICA = 'replica'
82
82
 
83
83
 
84
+ @dataclasses.dataclass
85
+ class ServiceComponentTarget:
86
+ """Represents a target service component with an optional replica ID.
87
+ """
88
+ component: ServiceComponent
89
+ replica_id: Optional[int] = None
90
+
91
+ def __init__(self,
92
+ component: Union[str, ServiceComponent],
93
+ replica_id: Optional[int] = None):
94
+ if isinstance(component, str):
95
+ component = ServiceComponent(component)
96
+ self.component = component
97
+ self.replica_id = replica_id
98
+
99
+ def __post_init__(self):
100
+ """Validate that replica_id is only provided for REPLICA component."""
101
+ if (self.component
102
+ == ServiceComponent.REPLICA) != (self.replica_id is None):
103
+ raise ValueError(
104
+ 'replica_id must be specified if and only if component is '
105
+ 'REPLICA.')
106
+
107
+ def __hash__(self) -> int:
108
+ return hash((self.component, self.replica_id))
109
+
110
+ def __str__(self) -> str:
111
+ if self.component == ServiceComponent.REPLICA:
112
+ return f'{self.component.value}-{self.replica_id}'
113
+ return self.component.value
114
+
115
+
84
116
  class UserSignal(enum.Enum):
85
117
  """User signal to send to controller.
86
118
 
sky/serve/server/core.py CHANGED
@@ -1,9 +1,11 @@
1
1
  """SkyServe core APIs."""
2
+ import pathlib
2
3
  import re
3
4
  import signal
4
5
  import tempfile
5
6
  import threading
6
- from typing import Any, Dict, List, Optional, Tuple, Union
7
+ import typing
8
+ from typing import Any, Dict, List, Optional, Set, Tuple, Union
7
9
 
8
10
  import colorama
9
11
 
@@ -29,6 +31,9 @@ from sky.utils import rich_utils
29
31
  from sky.utils import subprocess_utils
30
32
  from sky.utils import ux_utils
31
33
 
34
+ if typing.TYPE_CHECKING:
35
+ from sky.backends import cloud_vm_ray_backend
36
+
32
37
  logger = sky_logging.init_logger(__name__)
33
38
 
34
39
 
@@ -64,6 +69,41 @@ def _rewrite_tls_credential_paths_and_get_tls_env_vars(
64
69
  return tls_template_vars
65
70
 
66
71
 
72
+ def _get_all_replica_targets(
73
+ service_name: str, backend: backends.CloudVmRayBackend,
74
+ handle: backends.CloudVmRayResourceHandle
75
+ ) -> Set[serve_utils.ServiceComponentTarget]:
76
+ """Helper function to get targets for all live replicas."""
77
+ code = serve_utils.ServeCodeGen.get_service_status([service_name])
78
+ returncode, serve_status_payload, stderr = backend.run_on_head(
79
+ handle,
80
+ code,
81
+ require_outputs=True,
82
+ stream_logs=False,
83
+ separate_stderr=True)
84
+
85
+ try:
86
+ subprocess_utils.handle_returncode(returncode,
87
+ code,
88
+ 'Failed to fetch services',
89
+ stderr,
90
+ stream_logs=True)
91
+ except exceptions.CommandError as e:
92
+ raise RuntimeError(e.error_msg) from e
93
+
94
+ service_records = serve_utils.load_service_status(serve_status_payload)
95
+ if not service_records:
96
+ raise ValueError(f'Service {service_name!r} not found.')
97
+ assert len(service_records) == 1
98
+ service_record = service_records[0]
99
+
100
+ return {
101
+ serve_utils.ServiceComponentTarget(serve_utils.ServiceComponent.REPLICA,
102
+ replica_info['replica_id'])
103
+ for replica_info in service_record['replica_info']
104
+ }
105
+
106
+
67
107
  @usage_lib.entrypoint
68
108
  def up(
69
109
  task: 'sky.Task',
@@ -685,11 +725,14 @@ def status(
685
725
  return service_records
686
726
 
687
727
 
728
+ ServiceComponentOrStr = Union[str, serve_utils.ServiceComponent]
729
+
730
+
688
731
  @usage_lib.entrypoint
689
732
  def tail_logs(
690
733
  service_name: str,
691
734
  *,
692
- target: Union[str, serve_utils.ServiceComponent],
735
+ target: ServiceComponentOrStr,
693
736
  replica_id: Optional[int] = None,
694
737
  follow: bool = True,
695
738
  ) -> None:
@@ -743,10 +786,11 @@ def tail_logs(
743
786
  with ux_utils.print_exception_no_traceback():
744
787
  raise ValueError('`replica_id` must be None when using '
745
788
  'target=CONTROLLER/LOAD_BALANCER.')
789
+
790
+ controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
746
791
  handle = backend_utils.is_controller_accessible(
747
- controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
748
- stopped_message=(controller_utils.Controllers.SKY_SERVE_CONTROLLER.
749
- value.default_hint_if_non_existent))
792
+ controller=controller_type,
793
+ stopped_message=controller_type.value.default_hint_if_non_existent)
750
794
 
751
795
  backend = backend_utils.get_backend_from_handle(handle)
752
796
  assert isinstance(backend, backends.CloudVmRayBackend), backend
@@ -775,3 +819,141 @@ def tail_logs(
775
819
  stream_logs=True,
776
820
  process_stream=False,
777
821
  ssh_mode=command_runner.SshMode.INTERACTIVE)
822
+
823
+
824
+ @usage_lib.entrypoint
825
+ def sync_down_logs(
826
+ service_name: str,
827
+ *,
828
+ local_dir: str,
829
+ targets: Union[ServiceComponentOrStr, List[ServiceComponentOrStr],
830
+ None] = None,
831
+ replica_ids: Optional[List[int]] = None,
832
+ ) -> str:
833
+ """Sync down logs from the controller for the given service.
834
+
835
+ This function is called by the server endpoint. It gathers logs from the
836
+ controller, load balancer, and/or replicas and places them in a directory
837
+ under the user's log space on the API server filesystem.
838
+
839
+ Args:
840
+ service_name: The name of the service to download logs from.
841
+ local_dir: The local directory to save the logs to.
842
+ targets: Which component(s) to download logs for. If None or empty,
843
+ means download all logs (controller, load-balancer, all replicas).
844
+ Can be a string (e.g. "controller"), or a `ServiceComponent` object,
845
+ or a list of them for multiple components. Currently accepted
846
+ values:
847
+ - "controller"/ServiceComponent.CONTROLLER
848
+ - "load_balancer"/ServiceComponent.LOAD_BALANCER
849
+ - "replica"/ServiceComponent.REPLICA
850
+ replica_ids: The list of replica IDs to download logs from, specified
851
+ when target includes `ServiceComponent.REPLICA`. If target includes
852
+ `ServiceComponent.REPLICA` but this is None/empty, logs for all
853
+ replicas will be downloaded.
854
+
855
+ Returns:
856
+ A dict mapping component names to local paths where the logs were synced
857
+ down to.
858
+
859
+ Raises:
860
+ RuntimeError: If fails to gather logs or fails to rsync from the
861
+ controller.
862
+ sky.exceptions.ClusterNotUpError: If the controller is not up.
863
+ ValueError: Arguments not valid.
864
+ """
865
+ # Step 0) get the controller handle
866
+ with rich_utils.safe_status(
867
+ ux_utils.spinner_message('Checking service status...')):
868
+ controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
869
+ handle = backend_utils.is_controller_accessible(
870
+ controller=controller_type,
871
+ stopped_message=controller_type.value.default_hint_if_non_existent)
872
+ backend: backends.CloudVmRayBackend = (
873
+ backend_utils.get_backend_from_handle(handle))
874
+
875
+ requested_components: Set[serve_utils.ServiceComponent] = set()
876
+ if not targets:
877
+ # No targets specified -> request all components
878
+ requested_components = {
879
+ serve_utils.ServiceComponent.CONTROLLER,
880
+ serve_utils.ServiceComponent.LOAD_BALANCER,
881
+ serve_utils.ServiceComponent.REPLICA
882
+ }
883
+ else:
884
+ # Parse provided targets
885
+ if isinstance(targets, (str, serve_utils.ServiceComponent)):
886
+ requested_components = {serve_utils.ServiceComponent(targets)}
887
+ else: # list
888
+ requested_components = {
889
+ serve_utils.ServiceComponent(t) for t in targets
890
+ }
891
+
892
+ normalized_targets: Set[serve_utils.ServiceComponentTarget] = set()
893
+ if serve_utils.ServiceComponent.CONTROLLER in requested_components:
894
+ normalized_targets.add(
895
+ serve_utils.ServiceComponentTarget(
896
+ serve_utils.ServiceComponent.CONTROLLER))
897
+ if serve_utils.ServiceComponent.LOAD_BALANCER in requested_components:
898
+ normalized_targets.add(
899
+ serve_utils.ServiceComponentTarget(
900
+ serve_utils.ServiceComponent.LOAD_BALANCER))
901
+ if serve_utils.ServiceComponent.REPLICA in requested_components:
902
+ with rich_utils.safe_status(
903
+ ux_utils.spinner_message('Getting live replica infos...')):
904
+ replica_targets = _get_all_replica_targets(service_name, backend,
905
+ handle)
906
+ if not replica_ids:
907
+ # Replica target requested but no specific IDs
908
+ # -> Get all replica logs
909
+ normalized_targets.update(replica_targets)
910
+ else:
911
+ # Replica target requested with specific IDs
912
+ requested_replica_targets = [
913
+ serve_utils.ServiceComponentTarget(
914
+ serve_utils.ServiceComponent.REPLICA, rid)
915
+ for rid in replica_ids
916
+ ]
917
+ for target in requested_replica_targets:
918
+ if target not in replica_targets:
919
+ logger.warning(f'Replica ID {target.replica_id} not found '
920
+ f'for {service_name}. Skipping...')
921
+ else:
922
+ normalized_targets.add(target)
923
+
924
+ def sync_down_logs_by_target(target: serve_utils.ServiceComponentTarget):
925
+ component = target.component
926
+ # We need to set one side of the pipe to a logs stream, and the other
927
+ # side to a file.
928
+ log_path = str(pathlib.Path(local_dir) / f'{target}.log')
929
+ stream_logs_code: str
930
+
931
+ if component == serve_utils.ServiceComponent.CONTROLLER:
932
+ stream_logs_code = (
933
+ serve_utils.ServeCodeGen.stream_serve_process_logs(
934
+ service_name, stream_controller=True, follow=False))
935
+ elif component == serve_utils.ServiceComponent.LOAD_BALANCER:
936
+ stream_logs_code = (
937
+ serve_utils.ServeCodeGen.stream_serve_process_logs(
938
+ service_name, stream_controller=False, follow=False))
939
+ elif component == serve_utils.ServiceComponent.REPLICA:
940
+ replica_id = target.replica_id
941
+ assert replica_id is not None, service_name
942
+ stream_logs_code = serve_utils.ServeCodeGen.stream_replica_logs(
943
+ service_name, replica_id, follow=False)
944
+ else:
945
+ assert False, component
946
+
947
+ # Refer to the notes in
948
+ # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
949
+ backend.run_on_head(handle,
950
+ stream_logs_code,
951
+ stream_logs=False,
952
+ process_stream=False,
953
+ ssh_mode=command_runner.SshMode.INTERACTIVE,
954
+ log_path=log_path)
955
+
956
+ subprocess_utils.run_in_parallel(sync_down_logs_by_target,
957
+ list(normalized_targets))
958
+
959
+ return local_dir
sky/serve/server/server.py CHANGED
@@ -1,13 +1,17 @@
1
1
  """Rest APIs for SkyServe."""
2
2
 
3
+ import pathlib
4
+
3
5
  import fastapi
4
6
 
5
7
  from sky import sky_logging
6
8
  from sky.serve.server import core
9
+ from sky.server import common as server_common
7
10
  from sky.server import stream_utils
8
11
  from sky.server.requests import executor
9
12
  from sky.server.requests import payloads
10
13
  from sky.server.requests import requests as api_requests
14
+ from sky.skylet import constants
11
15
  from sky.utils import common
12
16
 
13
17
  logger = sky_logging.init_logger(__name__)
@@ -110,3 +114,27 @@ async def tail_logs(
110
114
  logs_path=request_task.log_path,
111
115
  background_tasks=background_tasks,
112
116
  )
117
+
118
+
119
+ @router.post('/sync-down-logs')
120
+ async def download_logs(
121
+ request: fastapi.Request,
122
+ download_logs_body: payloads.ServeDownloadLogsBody,
123
+ ) -> None:
124
+ user_hash = download_logs_body.env_vars[constants.USER_ID_ENV_VAR]
125
+ timestamp = sky_logging.get_run_timestamp()
126
+ logs_dir_on_api_server = (
127
+ pathlib.Path(server_common.api_server_user_logs_dir_prefix(user_hash)) /
128
+ 'service' / f'{download_logs_body.service_name}_{timestamp}')
129
+ logs_dir_on_api_server.mkdir(parents=True, exist_ok=True)
130
+ # We should reuse the original request body, so that the env vars, such as
131
+ # user hash, are kept the same.
132
+ download_logs_body.local_dir = str(logs_dir_on_api_server)
133
+ executor.schedule_request(
134
+ request_id=request.state.request_id,
135
+ request_name='serve.sync_down_logs',
136
+ request_body=download_logs_body,
137
+ func=core.sync_down_logs,
138
+ schedule_type=api_requests.ScheduleType.SHORT,
139
+ request_cluster_name=common.SKY_SERVE_CONTROLLER_NAME,
140
+ )
sky/server/common.py CHANGED
@@ -293,7 +293,25 @@ def _start_api_server(deploy: bool = False,
293
293
  time.sleep(0.5)
294
294
  else:
295
295
  break
296
- logger.info(ux_utils.finishing_message('SkyPilot API server started.'))
296
+
297
+ dashboard_msg = (f'Dashboard: {get_server_url(host)}/dashboard')
298
+ api_server_info = get_api_server_status(get_server_url(host))
299
+ if api_server_info.version == _DEV_VERSION:
300
+ dashboard_msg += (
301
+ f'\n{colorama.Style.RESET_ALL}{ux_utils.INDENT_SYMBOL}'
302
+ f'{colorama.Fore.YELLOW}')
303
+ if not os.path.isdir(server_constants.DASHBOARD_DIR):
304
+ dashboard_msg += (
305
+ 'Dashboard is not built, '
306
+ 'to build: npm --prefix sky/dashboard run build')
307
+ else:
308
+ dashboard_msg += (
309
+ 'Dashboard may be stale when installed from source, '
310
+ 'to rebuild: npm --prefix sky/dashboard run build')
311
+ dashboard_msg += f'{colorama.Style.RESET_ALL}'
312
+ logger.info(
313
+ ux_utils.finishing_message(
314
+ f'SkyPilot API server started. {dashboard_msg}'))
297
315
 
298
316
 
299
317
  def check_server_healthy(endpoint: Optional[str] = None,) -> None:
sky/server/constants.py CHANGED
@@ -1,5 +1,7 @@
1
1
  """Constants for the API servers."""
2
2
 
3
+ import os
4
+
3
5
  from sky.skylet import constants
4
6
 
5
7
  # API server version, whenever there is a change in API server that requires a
@@ -24,3 +26,7 @@ CLUSTER_REFRESH_DAEMON_INTERVAL_SECONDS = 60
24
26
 
25
27
  # Environment variable for a file path to the API cookie file.
26
28
  API_COOKIE_FILE_ENV_VAR = f'{constants.SKYPILOT_ENV_VAR_PREFIX}API_COOKIE_FILE'
29
+
30
+ # The path to the dashboard build output
31
+ DASHBOARD_DIR = os.path.join(os.path.dirname(__file__), '..', 'dashboard',
32
+ 'out')
sky/server/requests/executor.py CHANGED
@@ -358,6 +358,10 @@ def _request_execution_wrapper(request_id: str,
358
358
  # captured in the log file.
359
359
  try:
360
360
  with override_request_env_and_config(request_body):
361
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
362
+ config = skypilot_config.to_dict()
363
+ logger.debug(f'request config: \n'
364
+ f'{common_utils.dump_yaml_str(dict(config))}')
361
365
  return_value = func(**request_body.to_kwargs())
362
366
  f.flush()
363
367
  except KeyboardInterrupt:
sky/server/requests/payloads.py CHANGED
@@ -6,7 +6,6 @@ with the backend functions. The benefit of having the default values in the
6
6
  payloads is that a user can find the default values in the Restful API docs.
7
7
  """
8
8
  import getpass
9
- import json
10
9
  import os
11
10
  import typing
12
11
  from typing import Any, Dict, List, Optional, Tuple, Union
@@ -47,7 +46,7 @@ def request_body_env_vars() -> dict:
47
46
  # Remove the path to config file, as the config content is included in the
48
47
  # request body and will be merged with the config on the server side.
49
48
  env_vars.pop(skypilot_config.ENV_VAR_SKYPILOT_CONFIG, None)
50
- env_vars.pop(skypilot_config.ENV_VAR_USER_CONFIG, None)
49
+ env_vars.pop(skypilot_config.ENV_VAR_GLOBAL_CONFIG, None)
51
50
  env_vars.pop(skypilot_config.ENV_VAR_PROJECT_CONFIG, None)
52
51
  return env_vars
53
52
 
@@ -56,20 +55,9 @@ def get_override_skypilot_config_from_client() -> Dict[str, Any]:
56
55
  """Returns the override configs from the client."""
57
56
  config = skypilot_config.to_dict()
58
57
  # Remove the API server config, as we should not specify the SkyPilot
59
- # server endpoint on the server side. This avoids the warning below.
58
+ # server endpoint on the server side. This avoids the warning at
59
+ # server-side.
60
60
  config.pop_nested(('api_server',), default_value=None)
61
- ignored_key_values = {}
62
- for nested_key in constants.SKIPPED_CLIENT_OVERRIDE_KEYS:
63
- value = config.pop_nested(nested_key, default_value=None)
64
- if value is not None:
65
- ignored_key_values['.'.join(nested_key)] = value
66
- if ignored_key_values:
67
- logger.debug(f'The following keys ({json.dumps(ignored_key_values)}) '
68
- 'are specified in the client SkyPilot config at '
69
- f'{skypilot_config.loaded_config_path()!r}. '
70
- 'This will be ignored. If you want to specify it, '
71
- 'please modify it on server side or contact your '
72
- 'administrator.')
73
61
  return config
74
62
 
75
63
 
@@ -420,6 +408,15 @@ class ServeLogsBody(RequestBody):
420
408
  follow: bool = True
421
409
 
422
410
 
411
+ class ServeDownloadLogsBody(RequestBody):
412
+ """The request body for the serve download logs endpoint."""
413
+ service_name: str
414
+ local_dir: str
415
+ targets: Optional[Union[str, serve.ServiceComponent,
416
+ List[Union[str, serve.ServiceComponent]]]]
417
+ replica_ids: Optional[List[int]] = None
418
+
419
+
423
420
  class ServeStatusBody(RequestBody):
424
421
  """The request body for the serve status endpoint."""
425
422
  service_names: Optional[Union[str, List[str]]]