skypilot-nightly 1.0.0.dev20250806-py3-none-any.whl → 1.0.0.dev20250808-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of skypilot-nightly has been flagged as potentially problematic.

Files changed (137)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +20 -1
  3. sky/backends/cloud_vm_ray_backend.py +42 -6
  4. sky/check.py +11 -1
  5. sky/client/cli/command.py +248 -119
  6. sky/client/sdk.py +146 -66
  7. sky/client/sdk_async.py +5 -1
  8. sky/core.py +5 -2
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/-DXZksWqf2waNHeU9YTQe/_buildManifest.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +11 -0
  12. sky/dashboard/out/_next/static/chunks/1871-980a395e92633a5c.js +6 -0
  13. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{4937.d6bf67771e353356.js → 4937.a2baa2df5572a276.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/{691.6d99cbfba347cebf.js → 691.5eeedf82cc243343.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/6989-6129c1cfbcf51063.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +16 -0
  24. sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/9025.a1bef12d672bb66d.js +6 -0
  26. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +31 -0
  28. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/{9847.4c46c5e229c78704.js → 9847.757720f3b40c0aa5.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/{_app-2a43ea3241bbdacd.js → _app-491a4d699d95e808.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +11 -0
  32. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/pages/{clusters-47f1ddae13a2f8e4.js → clusters-b30460f683e6ba96.js} +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-2a44e70b500b6b70.js → [context]-13d53fffc03ccb52.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/pages/{infra-22faac9325016d83.js → infra-fc9222e26c8e2f0d.js} +1 -1
  37. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +11 -0
  38. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-f5ccf5d39d87aebe.js +21 -0
  39. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/pages/{users-b90c865a690bfe84.js → users-7ed36e44e779d5c7.js} +1 -1
  41. sky/dashboard/out/_next/static/chunks/pages/{volumes-7af733f5d7b6ed1c.js → volumes-c9695d657f78b5dc.js} +1 -1
  42. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-f72f73bcef9541dc.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/webpack-339efec49c0cc7d0.js +1 -0
  46. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +3 -0
  47. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  48. sky/dashboard/out/clusters/[cluster].html +1 -1
  49. sky/dashboard/out/clusters.html +1 -1
  50. sky/dashboard/out/config.html +1 -1
  51. sky/dashboard/out/index.html +1 -1
  52. sky/dashboard/out/infra/[context].html +1 -1
  53. sky/dashboard/out/infra.html +1 -1
  54. sky/dashboard/out/jobs/[job].html +1 -1
  55. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  56. sky/dashboard/out/jobs.html +1 -1
  57. sky/dashboard/out/users.html +1 -1
  58. sky/dashboard/out/volumes.html +1 -1
  59. sky/dashboard/out/workspace/new.html +1 -1
  60. sky/dashboard/out/workspaces/[name].html +1 -1
  61. sky/dashboard/out/workspaces.html +1 -1
  62. sky/execution.py +6 -4
  63. sky/global_user_state.py +22 -3
  64. sky/jobs/__init__.py +2 -0
  65. sky/jobs/client/sdk.py +67 -19
  66. sky/jobs/controller.py +2 -1
  67. sky/jobs/server/core.py +48 -1
  68. sky/jobs/server/server.py +52 -3
  69. sky/jobs/state.py +5 -1
  70. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  71. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  72. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  73. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  74. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  75. sky/serve/client/impl.py +93 -6
  76. sky/serve/client/sdk.py +22 -53
  77. sky/serve/constants.py +2 -1
  78. sky/serve/controller.py +4 -2
  79. sky/serve/serve_state.py +444 -324
  80. sky/serve/serve_utils.py +77 -46
  81. sky/serve/server/core.py +13 -197
  82. sky/serve/server/impl.py +239 -2
  83. sky/serve/service.py +8 -3
  84. sky/server/common.py +18 -7
  85. sky/server/constants.py +1 -1
  86. sky/server/requests/executor.py +5 -3
  87. sky/server/requests/payloads.py +19 -0
  88. sky/setup_files/alembic.ini +4 -0
  89. sky/task.py +18 -11
  90. sky/templates/kubernetes-ray.yml.j2 +5 -0
  91. sky/templates/sky-serve-controller.yaml.j2 +1 -0
  92. sky/usage/usage_lib.py +8 -6
  93. sky/utils/annotations.py +8 -3
  94. sky/utils/cli_utils/status_utils.py +1 -1
  95. sky/utils/common_utils.py +11 -1
  96. sky/utils/db/db_utils.py +31 -0
  97. sky/utils/db/migration_utils.py +6 -2
  98. sky/utils/kubernetes/deploy_remote_cluster.py +3 -1
  99. sky/utils/resource_checker.py +162 -21
  100. sky/volumes/client/sdk.py +4 -4
  101. sky/workspaces/core.py +210 -6
  102. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/METADATA +19 -14
  103. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/RECORD +109 -103
  104. sky/client/sdk.pyi +0 -301
  105. sky/dashboard/out/_next/static/Gelsd19kVxXcX7aQQGsGu/_buildManifest.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/1043-75af48ca5d5aaf57.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/1141-8678a9102cc5f67e.js +0 -11
  108. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +0 -1
  109. sky/dashboard/out/_next/static/chunks/1871-ced1c14230cad6e1.js +0 -6
  110. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +0 -1
  111. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  112. sky/dashboard/out/_next/static/chunks/2622-951867535095b0eb.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/3785.0a173cd4393f0fef.js +0 -1
  114. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +0 -16
  116. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/6601-2109d22e7861861c.js +0 -1
  118. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  119. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/9025.99f29acb7617963e.js +0 -6
  121. sky/dashboard/out/_next/static/chunks/938-bda2685db5eae6cf.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-7cb24da04ca00956.js +0 -11
  123. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-1e95993124dbfc57.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/pages/config-d56e64f30db7b42e.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90693cb88b5599a7.js +0 -11
  126. sky/dashboard/out/_next/static/chunks/pages/jobs-ab318e52eb4424a7.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +0 -1
  128. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-35e0de5bca55e594.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/pages/workspaces-062525fb5462acb6.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/webpack-387626669badf82e.js +0 -1
  131. sky/dashboard/out/_next/static/css/b3227360726f12eb.css +0 -3
  132. /sky/dashboard/out/_next/static/{Gelsd19kVxXcX7aQQGsGu → -DXZksWqf2waNHeU9YTQe}/_ssgManifest.js +0 -0
  133. /sky/dashboard/out/_next/static/chunks/{6135-2d7ed3350659d073.js → 6135-85426374db04811e.js} +0 -0
  134. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/WHEEL +0 -0
  135. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/entry_points.txt +0 -0
  136. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/licenses/LICENSE +0 -0
  137. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/top_level.txt +0 -0
sky/serve/server/impl.py CHANGED
@@ -1,7 +1,12 @@
 """Implementation of the SkyServe core APIs."""
+import pathlib
 import re
+import shlex
+import signal
 import tempfile
-from typing import Any, Dict, List, Optional, Tuple, Union
+import threading
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
+import uuid
 
 import colorama
 import filelock
@@ -21,6 +26,7 @@ from sky.serve import serve_state
 from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.utils import admin_policy_utils
+from sky.utils import command_runner
 from sky.utils import common
 from sky.utils import common_utils
 from sky.utils import controller_utils
@@ -189,7 +195,12 @@ def up(
             task_resources=task.resources)
     controller_job_id = None
     if serve_utils.is_consolidation_mode(pool):
-        controller_job_id = 0
+        # We need a unique integer per sky.serve.up call to avoid name
+        # conflict. Originally in non-consolidation mode, this is the ray
+        # job id; now we use the request id hash instead. Here we also
+        # make sure it is a 32-bit integer to avoid overflow on sqlalchemy.
+        rid = common_utils.get_current_request_id()
+        controller_job_id = hash(uuid.UUID(rid).int) & 0x7FFFFFFF
 
     vars_to_fill = {
         'remote_task_yaml_path': remote_tmp_task_yaml_path,
@@ -201,6 +212,7 @@ def up(
         'modified_catalogs':
             service_catalog_common.get_modified_catalog_file_mounts(),
         'consolidation_mode_job_id': controller_job_id,
+        'entrypoint': shlex.quote(common_utils.get_current_command()),
         **tls_template_vars,
         **controller_utils.shared_controller_vars_to_fill(
             controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
@@ -708,3 +720,228 @@ def status(
             service_record['endpoint'] = f'{protocol}://{endpoint}'
 
     return service_records
+
+
+ServiceComponentOrStr = Union[str, serve_utils.ServiceComponent]
+
+
+def tail_logs(
+    service_name: str,
+    *,
+    target: ServiceComponentOrStr,
+    replica_id: Optional[int] = None,
+    follow: bool = True,
+    tail: Optional[int] = None,
+    pool: bool = False,
+) -> None:
+    """Tail logs of a service or pool."""
+    if isinstance(target, str):
+        target = serve_utils.ServiceComponent(target)
+
+    if pool and target == serve_utils.ServiceComponent.LOAD_BALANCER:
+        raise ValueError(f'Target {target} is not supported for pool.')
+
+    if target == serve_utils.ServiceComponent.REPLICA:
+        if replica_id is None:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(
+                    '`replica_id` must be specified when using target=REPLICA.')
+    else:
+        if replica_id is not None:
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('`replica_id` must be None when using '
+                                 'target=CONTROLLER/LOAD_BALANCER.')
+
+    controller_type = controller_utils.get_controller_for_pool(pool)
+    handle = backend_utils.is_controller_accessible(
+        controller=controller_type,
+        stopped_message=controller_type.value.default_hint_if_non_existent)
+
+    backend = backend_utils.get_backend_from_handle(handle)
+    assert isinstance(backend, backends.CloudVmRayBackend), backend
+
+    if target != serve_utils.ServiceComponent.REPLICA:
+        code = serve_utils.ServeCodeGen.stream_serve_process_logs(
+            service_name,
+            stream_controller=(
+                target == serve_utils.ServiceComponent.CONTROLLER),
+            follow=follow,
+            tail=tail,
+            pool=pool)
+    else:
+        assert replica_id is not None, service_name
+        code = serve_utils.ServeCodeGen.stream_replica_logs(service_name,
+                                                            replica_id,
+                                                            follow,
+                                                            tail=tail,
+                                                            pool=pool)
+
+    # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
+    # kill the process, so we need to handle it manually here.
+    if threading.current_thread() is threading.main_thread():
+        signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
+        signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
+
+    # Refer to the notes in
+    # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
+    backend.run_on_head(handle,
+                        code,
+                        stream_logs=True,
+                        process_stream=False,
+                        ssh_mode=command_runner.SshMode.INTERACTIVE)
+
+
+def _get_all_replica_targets(
+        service_name: str, backend: backends.CloudVmRayBackend,
+        handle: backends.CloudVmRayResourceHandle,
+        pool: bool) -> Set[serve_utils.ServiceComponentTarget]:
+    """Helper function to get targets for all live replicas."""
+    code = serve_utils.ServeCodeGen.get_service_status([service_name],
+                                                       pool=pool)
+    returncode, serve_status_payload, stderr = backend.run_on_head(
+        handle,
+        code,
+        require_outputs=True,
+        stream_logs=False,
+        separate_stderr=True)
+
+    try:
+        subprocess_utils.handle_returncode(returncode,
+                                           code,
+                                           'Failed to fetch services',
+                                           stderr,
+                                           stream_logs=True)
+    except exceptions.CommandError as e:
+        raise RuntimeError(e.error_msg) from e
+
+    service_records = serve_utils.load_service_status(serve_status_payload)
+    if not service_records:
+        raise ValueError(f'Service {service_name!r} not found.')
+    assert len(service_records) == 1
+    service_record = service_records[0]
+
+    return {
+        serve_utils.ServiceComponentTarget(serve_utils.ServiceComponent.REPLICA,
+                                           replica_info['replica_id'])
+        for replica_info in service_record['replica_info']
+    }
+
+
+def sync_down_logs(
+    service_name: str,
+    *,
+    local_dir: str,
+    targets: Union[ServiceComponentOrStr, List[ServiceComponentOrStr],
+                   None] = None,
+    replica_ids: Optional[List[int]] = None,
+    tail: Optional[int] = None,
+    pool: bool = False,
+) -> str:
+    """Sync down logs of a service or pool."""
+    noun = 'pool' if pool else 'service'
+    repnoun = 'worker' if pool else 'replica'
+    caprepnoun = repnoun.capitalize()
+
+    # Step 0) get the controller handle
+    with rich_utils.safe_status(
+            ux_utils.spinner_message(f'Checking {noun} status...')):
+        controller_type = controller_utils.get_controller_for_pool(pool)
+        handle = backend_utils.is_controller_accessible(
+            controller=controller_type,
+            stopped_message=controller_type.value.default_hint_if_non_existent)
+    backend: backends.CloudVmRayBackend = (
+        backend_utils.get_backend_from_handle(handle))
+
+    requested_components: Set[serve_utils.ServiceComponent] = set()
+    if not targets:
+        # No targets specified -> request all components
+        requested_components = {
+            serve_utils.ServiceComponent.CONTROLLER,
+            serve_utils.ServiceComponent.LOAD_BALANCER,
+            serve_utils.ServiceComponent.REPLICA
+        }
+    else:
+        # Parse provided targets
+        if isinstance(targets, (str, serve_utils.ServiceComponent)):
+            requested_components = {serve_utils.ServiceComponent(targets)}
+        else:  # list
+            requested_components = {
+                serve_utils.ServiceComponent(t) for t in targets
+            }
+
+    normalized_targets: Set[serve_utils.ServiceComponentTarget] = set()
+    if serve_utils.ServiceComponent.CONTROLLER in requested_components:
+        normalized_targets.add(
+            serve_utils.ServiceComponentTarget(
+                serve_utils.ServiceComponent.CONTROLLER))
+    if serve_utils.ServiceComponent.LOAD_BALANCER in requested_components:
+        normalized_targets.add(
+            serve_utils.ServiceComponentTarget(
+                serve_utils.ServiceComponent.LOAD_BALANCER))
+    if serve_utils.ServiceComponent.REPLICA in requested_components:
+        with rich_utils.safe_status(
+                ux_utils.spinner_message(f'Getting live {repnoun} infos...')):
+            replica_targets = _get_all_replica_targets(service_name, backend,
+                                                       handle, pool)
+        if not replica_ids:
+            # Replica target requested but no specific IDs
+            # -> Get all replica logs
+            normalized_targets.update(replica_targets)
+        else:
+            # Replica target requested with specific IDs
+            requested_replica_targets = [
+                serve_utils.ServiceComponentTarget(
+                    serve_utils.ServiceComponent.REPLICA, rid)
+                for rid in replica_ids
+            ]
+            for target in requested_replica_targets:
+                if target not in replica_targets:
+                    logger.warning(f'{caprepnoun} ID {target.replica_id} not '
+                                   f'found for {service_name}. Skipping...')
+                else:
+                    normalized_targets.add(target)
+
+    def sync_down_logs_by_target(target: serve_utils.ServiceComponentTarget):
+        component = target.component
+        # We need to set one side of the pipe to a logs stream, and the other
+        # side to a file.
+        log_path = str(pathlib.Path(local_dir) / f'{target}.log')
+        stream_logs_code: str
+
+        if component == serve_utils.ServiceComponent.CONTROLLER:
+            stream_logs_code = (
+                serve_utils.ServeCodeGen.stream_serve_process_logs(
+                    service_name,
+                    stream_controller=True,
+                    follow=False,
+                    tail=tail,
+                    pool=pool))
+        elif component == serve_utils.ServiceComponent.LOAD_BALANCER:
+            stream_logs_code = (
+                serve_utils.ServeCodeGen.stream_serve_process_logs(
+                    service_name,
+                    stream_controller=False,
+                    follow=False,
+                    tail=tail,
+                    pool=pool))
+        elif component == serve_utils.ServiceComponent.REPLICA:
+            replica_id = target.replica_id
+            assert replica_id is not None, service_name
+            stream_logs_code = serve_utils.ServeCodeGen.stream_replica_logs(
+                service_name, replica_id, follow=False, tail=tail, pool=pool)
+        else:
+            assert False, component
+
+        # Refer to the notes in
+        # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
+        backend.run_on_head(handle,
+                            stream_logs_code,
+                            stream_logs=False,
+                            process_stream=False,
+                            ssh_mode=command_runner.SshMode.INTERACTIVE,
+                            log_path=log_path)
+
+    subprocess_utils.run_in_parallel(sync_down_logs_by_target,
+                                     list(normalized_targets))
+
+    return local_dir
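
The consolidation-mode job id above is derived from the request id instead of a Ray job id. A minimal standalone sketch of that derivation (the helper name here is illustrative, not part of SkyPilot):

    import uuid

    def derive_controller_job_id(request_id: str) -> int:
        # CPython's hash() of an int is deterministic (only str/bytes
        # hashing is randomized), and masking with 0x7FFFFFFF keeps the
        # result in the signed 32-bit range the sqlalchemy column expects.
        return hash(uuid.UUID(request_id).int) & 0x7FFFFFFF

    job_id = derive_controller_job_id(str(uuid.uuid4()))
    assert 0 <= job_id <= 0x7FFFFFFF
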
sky/serve/service.py CHANGED
@@ -176,7 +176,7 @@ def _cleanup_task_run_script(job_id: int) -> None:
         logger.warning(f'Task run script {this_task_run_script} not found')
 
 
-def _start(service_name: str, tmp_task_yaml: str, job_id: int):
+def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
     """Starts the service.
     This including the controller and load balancer.
     """
@@ -228,7 +228,8 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
         status=serve_state.ServiceStatus.CONTROLLER_INIT,
         tls_encrypted=service_spec.tls_credential is not None,
         pool=service_spec.pool,
-        controller_pid=os.getpid())
+        controller_pid=os.getpid(),
+        entrypoint=entrypoint)
     # Directly throw an error here. See sky/serve/api.py::up
     # for more details.
     if not success:
@@ -365,8 +366,12 @@ if __name__ == '__main__':
                         required=True,
                         type=int,
                         help='Job id for the service job.')
+    parser.add_argument('--entrypoint',
+                        type=str,
+                        help='Entrypoint to launch the service',
+                        required=True)
    args = parser.parse_args()
     # We start process with 'spawn', because 'fork' could result in weird
     # behaviors; 'spawn' is also cross-platform.
     multiprocessing.set_start_method('spawn', force=True)
-    _start(args.service_name, args.task_yaml, args.job_id)
+    _start(args.service_name, args.task_yaml, args.job_id, args.entrypoint)
sky/server/common.py CHANGED
@@ -16,13 +16,15 @@ import tempfile
 import threading
 import time
 import typing
-from typing import Any, Dict, Literal, Optional, Tuple, Union
+from typing import (Any, Callable, cast, Dict, Generic, Literal, Optional,
+                    Tuple, TypeVar, Union)
 from urllib import parse
 import uuid
 
 import cachetools
 import colorama
 import filelock
+from typing_extensions import ParamSpec
 
 from sky import exceptions
 from sky import sky_logging
@@ -87,7 +89,14 @@ _SERVER_INSTALL_VERSION_MISMATCH_WARNING = (
     'restarting the API server.'
     f'{colorama.Style.RESET_ALL}')
 
-RequestId = str
+T = TypeVar('T')
+P = ParamSpec('P')
+
+
+class RequestId(str, Generic[T]):
+    pass
+
+
 ApiVersion = Optional[str]
 
 logger = sky_logging.init_logger(__name__)
@@ -486,7 +495,7 @@ def handle_request_error(response: 'requests.Response') -> None:
                 f'{response.text}')
 
 
-def get_request_id(response: 'requests.Response') -> RequestId:
+def get_request_id(response: 'requests.Response') -> RequestId[T]:
     handle_request_error(response)
     request_id = response.headers.get('X-Skypilot-Request-ID')
     if request_id is None:
@@ -497,7 +506,7 @@ def get_request_id(response: 'requests.Response') -> RequestId:
             'Failed to get request ID from SkyPilot API server at '
             f'{get_server_url()}. Response: {response.status_code} '
             f'{response.text}')
-    return request_id
+    return RequestId[T](request_id)
 
 
 def _start_api_server(deploy: bool = False,
@@ -753,14 +762,14 @@ def check_server_healthy_or_start_fn(deploy: bool = False,
                                      metrics_port, enable_basic_auth)
 
 
-def check_server_healthy_or_start(func):
+def check_server_healthy_or_start(func: Callable[P, T]) -> Callable[P, T]:
 
     @functools.wraps(func)
     def wrapper(*args, deploy: bool = False, host: str = '127.0.0.1', **kwargs):
         check_server_healthy_or_start_fn(deploy, host)
         return func(*args, **kwargs)
 
-    return wrapper
+    return cast(Callable[P, T], wrapper)
 
 
 def process_mounts_in_task_on_api_server(task: str, env_vars: Dict[str, str],
@@ -878,7 +887,8 @@ def request_body_to_params(body: 'pydantic.BaseModel') -> Dict[str, Any]:
 
 def reload_for_new_request(client_entrypoint: Optional[str],
                            client_command: Optional[str],
-                           using_remote_api_server: bool, user: 'models.User'):
+                           using_remote_api_server: bool, user: 'models.User',
+                           request_id: str) -> None:
     """Reload modules, global variables, and usage message for a new request."""
     # This should be called first to make sure the logger is up-to-date.
     sky_logging.reload_logger()
@@ -892,6 +902,7 @@ def reload_for_new_request(client_entrypoint: Optional[str],
         client_command=client_command,
         using_remote_api_server=using_remote_api_server,
         user=user,
+        request_id=request_id,
     )
 
     # Clear cache should be called before reload_logger and usage reset,
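
The RequestId change above replaces a plain `str` alias with a phantom-typed str subclass: the parameter T records the result type a request resolves to, while the runtime value stays an ordinary string. A minimal sketch of the pattern (the submit/wait names are illustrative, not the actual SDK surface):

    from typing import Generic, TypeVar

    T = TypeVar('T')

    class RequestId(str, Generic[T]):
        pass

    def submit() -> RequestId[int]:
        # The runtime value is still just a str; the type parameter
        # exists only for the checker.
        return RequestId[int]('req-0001')

    def wait(request_id: RequestId[T]) -> T:
        # A type checker now infers wait(submit()) -> int.
        raise NotImplementedError
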
sky/server/constants.py CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION = 14
+API_VERSION = 16
 
 # The minimum peer API version that the code should still work with.
 # Notes (dev):
sky/server/requests/executor.py CHANGED
@@ -271,7 +271,8 @@ def _get_queue(schedule_type: api_requests.ScheduleType) -> RequestQueue:
 
 @contextlib.contextmanager
 def override_request_env_and_config(
-        request_body: payloads.RequestBody) -> Generator[None, None, None]:
+        request_body: payloads.RequestBody,
+        request_id: str) -> Generator[None, None, None]:
     """Override the environment and SkyPilot config for a request."""
     original_env = os.environ.copy()
     os.environ.update(request_body.env_vars)
@@ -292,7 +293,8 @@ def override_request_env_and_config(
         client_entrypoint=request_body.entrypoint,
         client_command=request_body.entrypoint_command,
         using_remote_api_server=request_body.using_remote_api_server,
-        user=user)
+        user=user,
+        request_id=request_id)
     try:
         logger.debug(
             f'override path: {request_body.override_skypilot_config_path}')
@@ -376,7 +378,7 @@ def _request_execution_wrapper(request_id: str,
     # config, as there can be some logs during override that needs to be
     # captured in the log file.
     try:
-        with override_request_env_and_config(request_body), \
+        with override_request_env_and_config(request_body, request_id), \
                 tempstore.tempdir():
             if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
                 config = skypilot_config.to_dict()
sky/server/requests/payloads.py CHANGED
@@ -707,6 +707,25 @@ class JobsPoolStatusBody(RequestBody):
     pool_names: Optional[Union[str, List[str]]]
 
 
+class JobsPoolLogsBody(RequestBody):
+    """The request body for the jobs pool logs endpoint."""
+    pool_name: str
+    target: Union[str, serve.ServiceComponent]
+    worker_id: Optional[int] = None
+    follow: bool = True
+    tail: Optional[int] = None
+
+
+class JobsPoolDownloadLogsBody(RequestBody):
+    """The request body for the jobs pool download logs endpoint."""
+    pool_name: str
+    local_dir: str
+    targets: Optional[Union[str, serve.ServiceComponent,
+                            List[Union[str, serve.ServiceComponent]]]]
+    worker_ids: Optional[List[int]] = None
+    tail: Optional[int] = None
+
+
 class UploadZipFileResponse(pydantic.BaseModel):
     """The response body for the upload zip file endpoint."""
     status: str
sky/setup_files/alembic.ini CHANGED
@@ -94,6 +94,10 @@ version_table = alembic_version_state_db
 version_locations = %(here)s/../schemas/db/spot_jobs
 version_table = alembic_version_spot_jobs_db
 
+[serve_db]
+version_locations = %(here)s/../schemas/db/serve_state
+version_table = alembic_version_serve_state_db
+
 [post_write_hooks]
 # post_write_hooks defines scripts or Python functions that are run
 # on newly generated revision scripts. See the documentation for further
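
The new [serve_db] section gives the serve-state database its own Alembic version table and migration scripts alongside the existing sections. When driving Alembic programmatically, a named ini section can be selected as below; whether SkyPilot's migration_utils does exactly this is an assumption:

    from alembic.config import Config

    # version_locations and version_table are then read from [serve_db],
    # letting several databases share a single alembic.ini.
    cfg = Config('sky/setup_files/alembic.ini', ini_section='serve_db')
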
sky/task.py CHANGED
@@ -241,8 +241,8 @@ class Task:
         self,
         name: Optional[str] = None,
         *,
-        setup: Optional[str] = None,
-        run: Optional[CommandOrCommandGen] = None,
+        setup: Optional[Union[str, List[str]]] = None,
+        run: Optional[Union[CommandOrCommandGen, List[str]]] = None,
         envs: Optional[Dict[str, str]] = None,
         secrets: Optional[Dict[str, str]] = None,
         workdir: Optional[Union[str, Dict[str, Any]]] = None,
@@ -293,15 +293,15 @@ class Task:
 
         Args:
             name: A string name for the Task for display purposes.
-            setup: A setup command, which will be run before executing the run
+            setup: A setup command(s), which will be run before executing the run
                 commands ``run``, and executed under ``workdir``.
             run: The actual command for the task. If not None, either a shell
-                command (str) or a command generator (callable). If latter, it
-                must take a node rank and a list of node addresses as input and
-                return a shell command (str) (valid to return None for some nodes,
-                in which case no commands are run on them). Run commands will be
-                run under ``workdir``. Note the command generator should be a
-                self-contained lambda.
+                command(s) (str, list(str)) or a command generator (callable). If
+                latter, it must take a node rank and a list of node addresses as
+                input and return a shell command (str) (valid to return None for
+                some nodes, in which case no commands are run on them). Run
+                commands will be run under ``workdir``. Note the command generator
+                should be a self-contained lambda.
             envs: A dictionary of environment variables to set before running the
                 setup and run commands.
             secrets: A dictionary of secret environment variables to set before
@@ -347,15 +347,22 @@ class Task:
             YAML config.
         """
         self.name = name
-        self.run = run
         self.storage_mounts: Dict[str, storage_lib.Storage] = {}
         self.storage_plans: Dict[storage_lib.Storage,
                                  storage_lib.StoreType] = {}
-        self.setup = setup
         self._envs = envs or {}
         self._secrets = secrets or {}
         self._volumes = volumes or {}
 
+        # concatenate commands if given as list
+        def _concat(commands):
+            if isinstance(commands, list):
+                return '\n'.join(commands)
+            return commands
+
+        self.run = _concat(run)
+        self.setup = _concat(setup)
+
 
         # Validate Docker login configuration early if both envs and secrets
         if self._envs or self._secrets:
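
With this change, `setup` and `run` accept either a single shell string or a list of commands joined with newlines. A short usage sketch of the new list form (the task contents are illustrative):

    import sky

    task = sky.Task(
        name='train',
        setup=['pip install -r requirements.txt', 'echo setup done'],
        run=['echo "step 1"', 'echo "step 2"'],
    )
    # Equivalent to passing one multi-line string:
    assert task.run == 'echo "step 1"\necho "step 2"'
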
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -777,6 +777,11 @@
 {{ ray_installation_commands }}
 
 VIRTUAL_ENV=~/skypilot-runtime ~/.local/bin/uv pip install skypilot[kubernetes,remote]
+# Wait for `patch` package to be installed before applying ray patches
+until dpkg -l | grep -q "^ii  patch "; do
+  sleep 0.1
+  echo "Waiting for patch package to be installed..."
+done
 # Apply Ray patches for progress bar fix
 ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
     VIRTUAL_ENV=~/skypilot-runtime python -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
sky/templates/sky-serve-controller.yaml.j2 CHANGED
@@ -57,6 +57,7 @@ run: |
     -u -m sky.serve.service \
     --service-name {{service_name}} \
     --task-yaml {{remote_task_yaml_path}} \
+    --entrypoint {{entrypoint}} \
     {%- if consolidation_mode_job_id is not none %}
     --job-id {{consolidation_mode_job_id}} \
     {%- else %}
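
The template splices the client entrypoint into a single shell line, which is why impl.py wraps it in shlex.quote() above. A minimal sketch of the quoting behavior (the command string is a stand-in):

    import shlex

    entrypoint = 'sky serve up service.yaml --name demo'
    flag = f'--entrypoint {shlex.quote(entrypoint)}'
    # Prints: --entrypoint 'sky serve up service.yaml --name demo'
    print(flag)
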
sky/usage/usage_lib.py CHANGED
@@ -10,6 +10,8 @@ import traceback
 import typing
 from typing import Any, Callable, Dict, List, Optional, Union
 
+from typing_extensions import ParamSpec
+
 import sky
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
@@ -517,26 +519,26 @@ def entrypoint_context(name: str, fallback: bool = False):
 
 
 T = typing.TypeVar('T')
+P = ParamSpec('P')
 
 
 @typing.overload
 def entrypoint(
     name_or_fn: str,
-    fallback: bool = False
-) -> Callable[[Callable[..., T]], Callable[..., T]]:
+    fallback: bool = False) -> Callable[[Callable[P, T]], Callable[P, T]]:
     ...
 
 
 @typing.overload
-def entrypoint(name_or_fn: Callable[..., T],
-               fallback: bool = False) -> Callable[..., T]:
+def entrypoint(name_or_fn: Callable[P, T],
+               fallback: bool = False) -> Callable[P, T]:
     ...
 
 
 def entrypoint(
-    name_or_fn: Union[str, Callable[..., T]],
+    name_or_fn: Union[str, Callable[P, T]],
     fallback: bool = False
-) -> Union[Callable[..., T], Callable[[Callable[..., T]], Callable[..., T]]]:
+) -> Union[Callable[P, T], Callable[[Callable[P, T]], Callable[P, T]]]:
     return common_utils.make_decorator(entrypoint_context,
                                        name_or_fn,
                                        fallback=fallback)
sky/utils/annotations.py CHANGED
@@ -1,14 +1,19 @@
 """Annotations for public APIs."""
 
 import functools
-from typing import Callable, Literal
+from typing import Callable, Literal, TypeVar
+
+from typing_extensions import ParamSpec
 
 # Whether the current process is a SkyPilot API server process.
 is_on_api_server = True
 FUNCTIONS_NEED_RELOAD_CACHE = []
 
+T = TypeVar('T')
+P = ParamSpec('P')
+
 
-def client_api(func):
+def client_api(func: Callable[P, T]) -> Callable[P, T]:
     """Mark a function as a client-side API.
 
     Code invoked by server-side functions will find annotations.is_on_api_server
@@ -38,7 +43,7 @@ def lru_cache(scope: Literal['global', 'request'], *lru_cache_args,
         lru_cache_kwargs: Keyword arguments for functools.lru_cache.
     """
 
-    def decorator(func: Callable) -> Callable:
+    def decorator(func: Callable[P, T]) -> Callable[P, T]:
         if scope == 'global':
             return functools.lru_cache(*lru_cache_args,
                                        **lru_cache_kwargs)(func)
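
These annotations apply the standard ParamSpec pattern: typing a decorator as Callable[P, T] -> Callable[P, T] lets checkers see through it and preserve the wrapped function's exact parameter list instead of degrading it to Callable[..., T]. A self-contained sketch of the pattern:

    import functools
    from typing import Callable, TypeVar

    from typing_extensions import ParamSpec

    T = TypeVar('T')
    P = ParamSpec('P')

    def client_api(func: Callable[P, T]) -> Callable[P, T]:
        @functools.wraps(func)
        def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
            return func(*args, **kwargs)
        return wrapper

    @client_api
    def add(x: int, y: int) -> int:
        return x + y

    # A checker still sees add as (x: int, y: int) -> int, so a call
    # like add('a', 'b') is flagged as a type error.
    assert add(1, 2) == 3
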
sky/utils/cli_utils/status_utils.py CHANGED
@@ -401,7 +401,7 @@ def _get_estimated_cost_for_cost_report(
 
 
 def show_kubernetes_cluster_status_table(
-        clusters: List['kubernetes_utils.KubernetesSkyPilotClusterInfo'],
+        clusters: List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
         show_all: bool) -> None:
     """Compute cluster table values and display for Kubernetes clusters."""
     status_columns = [
sky/utils/common_utils.py CHANGED
@@ -271,12 +271,13 @@ _current_command: Optional[str] = None
 _current_client_entrypoint: Optional[str] = None
 _using_remote_api_server: Optional[bool] = None
 _current_user: Optional['models.User'] = None
+_current_request_id: Optional[str] = None
 
 
 def set_request_context(client_entrypoint: Optional[str],
                         client_command: Optional[str],
                         using_remote_api_server: bool,
-                        user: Optional['models.User']):
+                        user: Optional['models.User'], request_id: str) -> None:
     """Override the current client entrypoint and command.
 
     This is useful when we are on the SkyPilot API server side and we have a
@@ -286,10 +287,19 @@ def set_request_context(client_entrypoint: Optional[str],
     global _current_client_entrypoint
     global _using_remote_api_server
     global _current_user
+    global _current_request_id
     _current_command = client_command
     _current_client_entrypoint = client_entrypoint
     _using_remote_api_server = using_remote_api_server
     _current_user = user
+    _current_request_id = request_id
+
+
+def get_current_request_id() -> str:
+    """Returns the current request id."""
+    if _current_request_id is not None:
+        return _current_request_id
+    return 'dummy-request-id'
 
 
 def get_current_command() -> str:
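
Together with the executor changes above, this threads the request id from the HTTP layer into per-request globals. A small sketch of the resulting behavior, using stand-in values:

    from sky.utils import common_utils

    common_utils.set_request_context(
        client_entrypoint=None,
        client_command='sky serve up service.yaml',  # stand-in command
        using_remote_api_server=False,
        user=None,
        request_id='11111111-2222-3333-4444-555555555555')  # stand-in id

    assert common_utils.get_current_request_id() == (
        '11111111-2222-3333-4444-555555555555')
    # Outside any request, get_current_request_id() falls back to
    # 'dummy-request-id'.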