skypilot-nightly 1.0.0.dev20250806__py3-none-any.whl → 1.0.0.dev20250808__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (137)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +20 -1
  3. sky/backends/cloud_vm_ray_backend.py +42 -6
  4. sky/check.py +11 -1
  5. sky/client/cli/command.py +248 -119
  6. sky/client/sdk.py +146 -66
  7. sky/client/sdk_async.py +5 -1
  8. sky/core.py +5 -2
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/-DXZksWqf2waNHeU9YTQe/_buildManifest.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +11 -0
  12. sky/dashboard/out/_next/static/chunks/1871-980a395e92633a5c.js +6 -0
  13. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{4937.d6bf67771e353356.js → 4937.a2baa2df5572a276.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/{691.6d99cbfba347cebf.js → 691.5eeedf82cc243343.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/6989-6129c1cfbcf51063.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +16 -0
  24. sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/9025.a1bef12d672bb66d.js +6 -0
  26. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +31 -0
  28. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/{9847.4c46c5e229c78704.js → 9847.757720f3b40c0aa5.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/{_app-2a43ea3241bbdacd.js → _app-491a4d699d95e808.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +11 -0
  32. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/pages/{clusters-47f1ddae13a2f8e4.js → clusters-b30460f683e6ba96.js} +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-2a44e70b500b6b70.js → [context]-13d53fffc03ccb52.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/pages/{infra-22faac9325016d83.js → infra-fc9222e26c8e2f0d.js} +1 -1
  37. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +11 -0
  38. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-f5ccf5d39d87aebe.js +21 -0
  39. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/pages/{users-b90c865a690bfe84.js → users-7ed36e44e779d5c7.js} +1 -1
  41. sky/dashboard/out/_next/static/chunks/pages/{volumes-7af733f5d7b6ed1c.js → volumes-c9695d657f78b5dc.js} +1 -1
  42. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-f72f73bcef9541dc.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/webpack-339efec49c0cc7d0.js +1 -0
  46. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +3 -0
  47. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  48. sky/dashboard/out/clusters/[cluster].html +1 -1
  49. sky/dashboard/out/clusters.html +1 -1
  50. sky/dashboard/out/config.html +1 -1
  51. sky/dashboard/out/index.html +1 -1
  52. sky/dashboard/out/infra/[context].html +1 -1
  53. sky/dashboard/out/infra.html +1 -1
  54. sky/dashboard/out/jobs/[job].html +1 -1
  55. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  56. sky/dashboard/out/jobs.html +1 -1
  57. sky/dashboard/out/users.html +1 -1
  58. sky/dashboard/out/volumes.html +1 -1
  59. sky/dashboard/out/workspace/new.html +1 -1
  60. sky/dashboard/out/workspaces/[name].html +1 -1
  61. sky/dashboard/out/workspaces.html +1 -1
  62. sky/execution.py +6 -4
  63. sky/global_user_state.py +22 -3
  64. sky/jobs/__init__.py +2 -0
  65. sky/jobs/client/sdk.py +67 -19
  66. sky/jobs/controller.py +2 -1
  67. sky/jobs/server/core.py +48 -1
  68. sky/jobs/server/server.py +52 -3
  69. sky/jobs/state.py +5 -1
  70. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  71. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  72. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  73. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  74. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  75. sky/serve/client/impl.py +93 -6
  76. sky/serve/client/sdk.py +22 -53
  77. sky/serve/constants.py +2 -1
  78. sky/serve/controller.py +4 -2
  79. sky/serve/serve_state.py +444 -324
  80. sky/serve/serve_utils.py +77 -46
  81. sky/serve/server/core.py +13 -197
  82. sky/serve/server/impl.py +239 -2
  83. sky/serve/service.py +8 -3
  84. sky/server/common.py +18 -7
  85. sky/server/constants.py +1 -1
  86. sky/server/requests/executor.py +5 -3
  87. sky/server/requests/payloads.py +19 -0
  88. sky/setup_files/alembic.ini +4 -0
  89. sky/task.py +18 -11
  90. sky/templates/kubernetes-ray.yml.j2 +5 -0
  91. sky/templates/sky-serve-controller.yaml.j2 +1 -0
  92. sky/usage/usage_lib.py +8 -6
  93. sky/utils/annotations.py +8 -3
  94. sky/utils/cli_utils/status_utils.py +1 -1
  95. sky/utils/common_utils.py +11 -1
  96. sky/utils/db/db_utils.py +31 -0
  97. sky/utils/db/migration_utils.py +6 -2
  98. sky/utils/kubernetes/deploy_remote_cluster.py +3 -1
  99. sky/utils/resource_checker.py +162 -21
  100. sky/volumes/client/sdk.py +4 -4
  101. sky/workspaces/core.py +210 -6
  102. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/METADATA +19 -14
  103. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/RECORD +109 -103
  104. sky/client/sdk.pyi +0 -301
  105. sky/dashboard/out/_next/static/Gelsd19kVxXcX7aQQGsGu/_buildManifest.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/1043-75af48ca5d5aaf57.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/1141-8678a9102cc5f67e.js +0 -11
  108. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +0 -1
  109. sky/dashboard/out/_next/static/chunks/1871-ced1c14230cad6e1.js +0 -6
  110. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +0 -1
  111. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  112. sky/dashboard/out/_next/static/chunks/2622-951867535095b0eb.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/3785.0a173cd4393f0fef.js +0 -1
  114. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +0 -16
  116. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/6601-2109d22e7861861c.js +0 -1
  118. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  119. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/9025.99f29acb7617963e.js +0 -6
  121. sky/dashboard/out/_next/static/chunks/938-bda2685db5eae6cf.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-7cb24da04ca00956.js +0 -11
  123. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-1e95993124dbfc57.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/pages/config-d56e64f30db7b42e.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90693cb88b5599a7.js +0 -11
  126. sky/dashboard/out/_next/static/chunks/pages/jobs-ab318e52eb4424a7.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +0 -1
  128. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-35e0de5bca55e594.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/pages/workspaces-062525fb5462acb6.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/webpack-387626669badf82e.js +0 -1
  131. sky/dashboard/out/_next/static/css/b3227360726f12eb.css +0 -3
  132. /sky/dashboard/out/_next/static/{Gelsd19kVxXcX7aQQGsGu → -DXZksWqf2waNHeU9YTQe}/_ssgManifest.js +0 -0
  133. /sky/dashboard/out/_next/static/chunks/{6135-2d7ed3350659d073.js → 6135-85426374db04811e.js} +0 -0
  134. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/WHEEL +0 -0
  135. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/entry_points.txt +0 -0
  136. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/licenses/LICENSE +0 -0
  137. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/top_level.txt +0 -0
sky/client/cli/command.py CHANGED
@@ -35,7 +35,7 @@ import sys
 import traceback
 import typing
 from typing import (Any, Callable, Dict, Generator, List, Optional, Set, Tuple,
-                    Union)
+                    TypeVar, Union)
 
 import click
 import colorama
@@ -116,6 +116,8 @@ _DAG_NOT_SUPPORTED_MESSAGE = ('YAML specifies a DAG which is only supported by '
                              '`sky jobs launch`. `{command}` supports a '
                              'single task only.')
 
+T = TypeVar('T')
+
 
 def _get_cluster_records_and_set_ssh_config(
     clusters: Optional[List[str]],
@@ -224,8 +226,8 @@ def _get_glob_matches(candidate_names: List[str],
     return list(set(glob_storages))
 
 
-def _async_call_or_wait(request_id: str, async_call: bool,
-                        request_name: str) -> Any:
+def _async_call_or_wait(request_id: server_common.RequestId[T],
+                        async_call: bool, request_name: str) -> Any:
     short_request_id = request_id[:8]
     if not async_call:
         try:
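Note: the signature changes above thread a typed request ID (server_common.RequestId[T], with T = TypeVar('T')) through the CLI helpers in place of plain str. The actual RequestId implementation lives in sky/server/common.py, which is not part of this file's diff; the sketch below is only an illustration of the idea, under the assumption that RequestId behaves like a str subclass parameterized by its result type, so existing string operations (slicing for short_request_id, empty-string sentinels) keep working while type checkers learn what the request resolves to.

    # Illustrative sketch only; names and behavior here are assumptions,
    # not the actual sky/server/common.py implementation.
    from typing import Generic, List, TypeVar

    T = TypeVar('T')


    class RequestId(str, Generic[T]):
        """A request ID string tagged with the type of its eventual result.

        Subclassing str keeps existing call sites working (slicing,
        truthiness, RequestId() as an empty sentinel), while Generic[T]
        lets a typed get()/stream_and_get() return the right result type.
        """


    def get(request_id: RequestId[T]) -> T:
        """Hypothetical getter: waits for the request, returns its result."""
        raise NotImplementedError


    # A request known to resolve to a list of job records:
    job_queue_request: RequestId[List[dict]] = RequestId('abc12345')
    short_request_id = job_queue_request[:8]  # str operations still work.

Read this way, annotations such as Optional[server_common.RequestId[List[Dict[str, Any]]]] elsewhere in the diff document what sdk.get() on that request ID is expected to return.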
@@ -1411,7 +1413,7 @@ def exec(
 
 
 def _handle_jobs_queue_request(
-    request_id: str,
+    request_id: server_common.RequestId[List[Dict[str, Any]]],
     show_all: bool,
    show_user: bool,
    max_num_jobs_to_show: Optional[int],
@@ -1492,7 +1494,7 @@ def _handle_jobs_queue_request(
 
 
 def _handle_services_request(
-    request_id: str,
+    request_id: server_common.RequestId[List[Dict[str, Any]]],
    service_names: Optional[List[str]],
    show_all: bool,
    show_endpoint: bool,
@@ -1879,17 +1881,19 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
                                   skip_finished=True,
                                   all_users=all_users)
 
-    def submit_services() -> Optional[str]:
+    def submit_services(
+    ) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
         return serve_lib.status(service_names=None)
 
-    def submit_pools() -> Optional[str]:
+    def submit_pools(
+    ) -> Optional[server_common.RequestId[List[Dict[str, Any]]]]:
         try:
             return managed_jobs.pool_status(pool_names=None)
         except exceptions.APINotSupportedError as e:
             logger.debug(f'Pools are not supported in the remote server: {e}')
             return None
 
-    def submit_workspace() -> Optional[str]:
+    def submit_workspace() -> Optional[server_common.RequestId[Dict[str, Any]]]:
         try:
             return sdk.workspaces()
         except RuntimeError:
@@ -1928,11 +1932,14 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
     if not (ip or show_endpoints):
         workspace_request_id = workspace_request_future.result()
 
-    managed_jobs_queue_request_id = ('' if not managed_jobs_queue_request_id
-                                     else managed_jobs_queue_request_id)
-    service_status_request_id = ('' if not service_status_request_id else
+    managed_jobs_queue_request_id = (server_common.RequestId()
+                                     if not managed_jobs_queue_request_id else
+                                     managed_jobs_queue_request_id)
+    service_status_request_id = (server_common.RequestId()
+                                 if not service_status_request_id else
                                  service_status_request_id)
-    pool_status_request_id = ('' if not pool_status_request_id else
+    pool_status_request_id = (server_common.RequestId()
+                              if not pool_status_request_id else
                               pool_status_request_id)
 
     # Phase 3: Get cluster records and handle special cases
@@ -1957,7 +1964,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
     if workspace_request_id is not None:
         all_workspaces = sdk.get(workspace_request_id)
     else:
-        all_workspaces = [constants.SKYPILOT_DEFAULT_WORKSPACE]
+        all_workspaces = {constants.SKYPILOT_DEFAULT_WORKSPACE: {}}
     active_workspace = skypilot_config.get_active_workspace()
     show_workspace = len(all_workspaces) > 1
     _show_enabled_infra(active_workspace, show_workspace)
@@ -3836,7 +3843,7 @@ def show_gpus(
             yield k8s_messages
             yield '\n\n'
 
-        result = sdk.stream_and_get(
+        list_accelerator_counts_result = sdk.stream_and_get(
             sdk.list_accelerator_counts(
                 gpus_only=True,
                 clouds=clouds_to_list,
@@ -3853,14 +3860,20 @@ def show_gpus(
 
         # "Common" GPUs
         for gpu in catalog.get_common_gpus():
-            if gpu in result:
-                gpu_table.add_row([gpu, _list_to_str(result.pop(gpu))])
+            if gpu in list_accelerator_counts_result:
+                gpu_table.add_row([
+                    gpu,
+                    _list_to_str(list_accelerator_counts_result.pop(gpu))
+                ])
         yield from gpu_table.get_string()
 
         # Google TPUs
         for tpu in catalog.get_tpus():
-            if tpu in result:
-                tpu_table.add_row([tpu, _list_to_str(result.pop(tpu))])
+            if tpu in list_accelerator_counts_result:
+                tpu_table.add_row([
+                    tpu,
+                    _list_to_str(list_accelerator_counts_result.pop(tpu))
+                ])
         if tpu_table.get_string():
             yield '\n\n'
             yield from tpu_table.get_string()
@@ -3868,7 +3881,7 @@ def show_gpus(
         # Other GPUs
         if show_all:
             yield '\n\n'
-            for gpu, qty in sorted(result.items()):
+            for gpu, qty in sorted(list_accelerator_counts_result.items()):
                 other_table.add_row([gpu, _list_to_str(qty)])
             yield from other_table.get_string()
             yield '\n\n'
@@ -3919,7 +3932,7 @@ def show_gpus(
 
         # For clouds other than Kubernetes, get the accelerator details
         # Case-sensitive
-        result = sdk.stream_and_get(
+        list_accelerators_result = sdk.stream_and_get(
             sdk.list_accelerators(gpus_only=True,
                                   name_filter=name,
                                   quantity_filter=quantity,
@@ -3935,8 +3948,8 @@ def show_gpus(
         # - Group by cloud
         # - Sort within each group by prices
         # - Sort groups by each cloud's (min price, min spot price)
-        new_result = {}
-        for i, (gpu, items) in enumerate(result.items()):
+        new_result: Dict[str, List[catalog_common.InstanceTypeInfo]] = {}
+        for i, (gpu, items) in enumerate(list_accelerators_result.items()):
             df = pd.DataFrame([t._asdict() for t in items])
             # Determine the minimum prices for each cloud.
             min_price_df = df.groupby('cloud').agg(min_price=('price', 'min'),
@@ -3954,14 +3967,14 @@ def show_gpus(
                 for row in df.to_records(index=False)
             ]
             new_result[gpu] = sorted_dataclasses
-        result = new_result
+        list_accelerators_result = new_result
 
         if print_section_titles and not show_all:
             yield '\n\n'
             yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
                    f'Cloud GPUs{colorama.Style.RESET_ALL}\n')
 
-        if not result:
+        if not list_accelerators_result:
             quantity_str = (f' with requested quantity {quantity}'
                             if quantity else '')
             cloud_str = f' on {cloud_obj}.' if cloud_name else ' in cloud catalogs.'
@@ -3969,7 +3982,7 @@ def show_gpus(
             yield 'To show available accelerators, run: sky show-gpus --all'
             return
 
-        for i, (gpu, items) in enumerate(result.items()):
+        for i, (gpu, items) in enumerate(list_accelerators_result.items()):
             accelerator_table_headers = [
                 'GPU',
                 'QTY',
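For context on the sorting these renamed variables feed into (the "# - Group by cloud / sort within each group by prices / sort groups by each cloud's (min price, min spot price)" comments above), here is a small self-contained pandas illustration. The sample rows and column names are invented for the example; in the real code the rows come from sdk.list_accelerators() results converted with t._asdict().

    # Standalone illustration of the per-cloud price sorting described above.
    # Sample data is made up; only the grouping/sorting scheme mirrors the
    # comments in show_gpus.
    import pandas as pd

    offerings = pd.DataFrame([
        {'cloud': 'aws', 'instance_type': 'p3.16xlarge', 'price': 24.5, 'spot_price': 7.3},
        {'cloud': 'gcp', 'instance_type': 'a2-highgpu-8g', 'price': 23.5, 'spot_price': 7.4},
        {'cloud': 'aws', 'instance_type': 'p4d.24xlarge', 'price': 32.8, 'spot_price': 9.8},
    ])

    # Each cloud's minimum (price, spot price) decides the order of the groups.
    min_price_df = offerings.groupby('cloud').agg(
        min_price=('price', 'min'), min_spot_price=('spot_price', 'min'))
    df = offerings.merge(min_price_df, left_on='cloud', right_index=True)

    # Order groups by the per-cloud minima, then rows within a group by price.
    df = df.sort_values(by=['min_price', 'min_spot_price', 'cloud', 'price'])
    print(df[['cloud', 'instance_type', 'price', 'spot_price']])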
@@ -4972,6 +4985,205 @@ def jobs_pool_down(
     _async_call_or_wait(request_id, async_call, 'sky.jobs.pool_down')
 
 
+def _handle_serve_logs(
+    service_name: str,
+    follow: bool,
+    controller: bool,
+    load_balancer: bool,
+    replica_ids: Tuple[int, ...],
+    sync_down: bool,
+    tail: Optional[int],
+    pool: bool,  # pylint: disable=redefined-outer-name
+):
+    noun = 'pool' if pool else 'service'
+    capnoun = noun.capitalize()
+    repnoun = 'worker' if pool else 'replica'
+    if tail is not None:
+        if tail < 0:
+            raise click.UsageError('--tail must be a non-negative integer.')
+        # TODO(arda): We could add ability to tail and follow logs together.
+        if follow:
+            follow = False
+            logger.warning(
+                f'{colorama.Fore.YELLOW}'
+                '--tail and --follow cannot be used together. '
+                f'Changed the mode to --no-follow.{colorama.Style.RESET_ALL}')
+
+    chosen_components: Set[serve_lib.ServiceComponent] = set()
+    if controller:
+        chosen_components.add(serve_lib.ServiceComponent.CONTROLLER)
+    if load_balancer:
+        assert not pool, 'Load balancer is not supported for pools.'
+        chosen_components.add(serve_lib.ServiceComponent.LOAD_BALANCER)
+    # replica_ids contains the specific replica IDs provided by the user.
+    # If it's not empty, it implies the user wants replica logs.
+    if replica_ids:
+        chosen_components.add(serve_lib.ServiceComponent.REPLICA)
+
+    if sync_down:
+        # For sync-down, multiple targets are allowed.
+        # If no specific components/replicas are mentioned, sync all.
+        # Note: Multiple replicas or targets can only be specified when
+        # using --sync-down.
+        targets_to_sync = list(chosen_components)
+        if not targets_to_sync and not replica_ids:
+            # Default to all components if nothing specific is requested
+            targets_to_sync = [
+                serve_lib.ServiceComponent.CONTROLLER,
+                serve_lib.ServiceComponent.REPLICA,
+            ]
+            if not pool:
+                targets_to_sync.append(serve_lib.ServiceComponent.LOAD_BALANCER)
+
+        timestamp = sky_logging.get_run_timestamp()
+        log_dir = (pathlib.Path(constants.SKY_LOGS_DIRECTORY) / noun /
+                   f'{service_name}_{timestamp}').expanduser()
+        log_dir.mkdir(parents=True, exist_ok=True)
+
+        with rich_utils.client_status(
+                ux_utils.spinner_message(f'Downloading {noun} logs...')):
+            if pool:
+                managed_jobs.pool_sync_down_logs(service_name,
+                                                 str(log_dir),
+                                                 targets=targets_to_sync,
+                                                 worker_ids=list(replica_ids),
+                                                 tail=tail)
+            else:
+                serve_lib.sync_down_logs(service_name,
+                                         str(log_dir),
+                                         targets=targets_to_sync,
+                                         replica_ids=list(replica_ids),
+                                         tail=tail)
+        style = colorama.Style
+        fore = colorama.Fore
+        logger.info(f'{fore.CYAN}{capnoun} {service_name} logs: '
+                    f'{log_dir}{style.RESET_ALL}')
+        return
+
+    # Tailing requires exactly one target.
+    num_targets = len(chosen_components)
+    # If REPLICA component is chosen, len(replica_ids) must be 1 for tailing.
+    if serve_lib.ServiceComponent.REPLICA in chosen_components:
+        if len(replica_ids) != 1:
+            raise click.UsageError(
+                f'Can only tail logs from a single {repnoun} at a time. '
+                f'Provide exactly one {repnoun.upper()}_ID or use --sync-down '
+                f'to download logs from multiple {repnoun}s.')
+        # If replica is chosen and len is 1, num_targets effectively counts it.
+        # We need to ensure no other component (controller/LB) is selected.
+        if num_targets > 1:
+            raise click.UsageError(
+                'Can only tail logs from one target at a time (controller, '
+                f'load balancer, or a single {repnoun}). Use --sync-down '
+                'to download logs from multiple sources.')
+    elif num_targets == 0:
+        raise click.UsageError(
+            'Specify a target to tail: --controller, --load-balancer, or '
+            f'a {repnoun.upper()}_ID.')
+    elif num_targets > 1:
+        raise click.UsageError(
+            'Can only tail logs from one target at a time. Use --sync-down '
+            'to download logs from multiple sources.')
+
+    # At this point, we have exactly one target for tailing.
+    assert len(chosen_components) == 1
+    assert len(replica_ids) in [0, 1]
+    target_component = chosen_components.pop()
+    target_replica_id: Optional[int] = replica_ids[0] if replica_ids else None
+
+    try:
+        if pool:
+            managed_jobs.pool_tail_logs(service_name,
+                                        target=target_component,
+                                        worker_id=target_replica_id,
+                                        follow=follow,
+                                        tail=tail)
+        else:
+            serve_lib.tail_logs(service_name,
+                                target=target_component,
+                                replica_id=target_replica_id,
+                                follow=follow,
+                                tail=tail)
+    except exceptions.ClusterNotUpError:
+        with ux_utils.print_exception_no_traceback():
+            raise
+
+
+@pool.command('logs', cls=_DocumentedCodeCommand)
+@flags.config_option(expose_value=False)
+@click.option(
+    '--follow/--no-follow',
+    is_flag=True,
+    default=True,
+    help=('Follow the logs of the job. [default: --follow] '
+          'If --no-follow is specified, print the log so far and exit.'))
+@click.option('--controller',
+              is_flag=True,
+              default=False,
+              required=False,
+              help='Show the controller logs of this pool.')
+@click.option('--sync-down',
+              '-s',
+              is_flag=True,
+              default=False,
+              help='Sync down logs to the local machine. Can be combined with '
+              '--controller or worker ID to narrow scope.')
+@click.option(
+    '--tail',
+    default=None,
+    type=int,
+    help='The number of lines to display from the end of the log file. '
+    'Default is None, which means print all lines.')
+@click.argument('pool_name', required=True, type=str)
+@click.argument('worker_ids', required=False, type=int, nargs=-1)
+@usage_lib.entrypoint
+# TODO(tian): Add default argument for this CLI if none of the flags are
+# specified.
+def pool_logs(
+    pool_name: str,
+    follow: bool,
+    controller: bool,
+    worker_ids: Tuple[int, ...],
+    sync_down: bool,
+    tail: Optional[int],
+):
+    """Tail or sync down logs of a pool.
+
+    Logs can be tailed from one target (controller, or a single worker) or
+    synced down from multiple targets simultaneously.
+
+    Example:
+
+    .. code-block:: bash
+
+        # Tail the controller logs of a pool
+        sky pool logs --controller [POOL_NAME]
+        \b
+        # Print the worker logs so far and exit
+        sky pool logs --no-follow [POOL_NAME]
+        \b
+        # Tail the logs of worker 1
+        sky pool logs [POOL_NAME] 1
+        \b
+        # Show the last 100 lines of the controller logs
+        sky pool logs --controller --tail 100 [POOL_NAME]
+        \b
+        # Sync down all logs of the pool (controller, all workers)
+        sky pool logs [POOL_NAME] --sync-down
+        \b
+        # Sync down controller logs and logs for workers 1 and 3
+        sky pool logs [POOL_NAME] 1 3 --controller --sync-down
+    """
+    _handle_serve_logs(pool_name,
+                       follow=follow,
+                       controller=controller,
+                       load_balancer=False,
+                       replica_ids=worker_ids,
+                       sync_down=sync_down,
+                       tail=tail,
+                       pool=True)
+
+
 @cli.command(cls=_DocumentedCodeCommand)
 @flags.config_option(expose_value=False)
 @usage_lib.entrypoint
@@ -5555,6 +5767,7 @@ def serve_down(
                      show_default=True)
 
     if replica_id_is_defined:
+        assert replica_id is not None
         request_id = serve_lib.terminate_replica(service_names[0], replica_id,
                                                  purge)
     else:
@@ -5635,99 +5848,14 @@ def serve_logs(
         # Sync down controller logs and logs for replicas 1 and 3
         sky serve logs [SERVICE_NAME] 1 3 --controller --sync-down
     """
-    if tail is not None:
-        if tail < 0:
-            raise click.UsageError('--tail must be a non-negative integer.')
-        # TODO(arda): We could add ability to tail and follow logs together.
-        if follow:
-            follow = False
-            logger.warning(
-                f'{colorama.Fore.YELLOW}'
-                '--tail and --follow cannot be used together. '
-                f'Changed the mode to --no-follow.{colorama.Style.RESET_ALL}')
-
-    chosen_components: Set[serve_lib.ServiceComponent] = set()
-    if controller:
-        chosen_components.add(serve_lib.ServiceComponent.CONTROLLER)
-    if load_balancer:
-        chosen_components.add(serve_lib.ServiceComponent.LOAD_BALANCER)
-    # replica_ids contains the specific replica IDs provided by the user.
-    # If it's not empty, it implies the user wants replica logs.
-    if replica_ids:
-        chosen_components.add(serve_lib.ServiceComponent.REPLICA)
-
-    if sync_down:
-        # For sync-down, multiple targets are allowed.
-        # If no specific components/replicas are mentioned, sync all.
-        # Note: Multiple replicas or targets can only be specified when
-        # using --sync-down.
-        targets_to_sync = list(chosen_components)
-        if not targets_to_sync and not replica_ids:
-            # Default to all components if nothing specific is requested
-            targets_to_sync = [
-                serve_lib.ServiceComponent.CONTROLLER,
-                serve_lib.ServiceComponent.LOAD_BALANCER,
-                serve_lib.ServiceComponent.REPLICA,
-            ]
-
-        timestamp = sky_logging.get_run_timestamp()
-        log_dir = (pathlib.Path(constants.SKY_LOGS_DIRECTORY) / 'service' /
-                   f'{service_name}_{timestamp}').expanduser()
-        log_dir.mkdir(parents=True, exist_ok=True)
-
-        with rich_utils.client_status(
-                ux_utils.spinner_message('Downloading service logs...')):
-            serve_lib.sync_down_logs(service_name,
-                                     local_dir=str(log_dir),
-                                     targets=targets_to_sync,
-                                     replica_ids=list(replica_ids),
-                                     tail=tail)
-        style = colorama.Style
-        fore = colorama.Fore
-        logger.info(f'{fore.CYAN}Service {service_name} logs: '
-                    f'{log_dir}{style.RESET_ALL}')
-        return
-
-    # Tailing requires exactly one target.
-    num_targets = len(chosen_components)
-    # If REPLICA component is chosen, len(replica_ids) must be 1 for tailing.
-    if serve_lib.ServiceComponent.REPLICA in chosen_components:
-        if len(replica_ids) != 1:
-            raise click.UsageError(
-                'Can only tail logs from a single replica at a time. '
-                'Provide exactly one REPLICA_ID or use --sync-down '
-                'to download logs from multiple replicas.')
-        # If replica is chosen and len is 1, num_targets effectively counts it.
-        # We need to ensure no other component (controller/LB) is selected.
-        if num_targets > 1:
-            raise click.UsageError(
-                'Can only tail logs from one target at a time (controller, '
-                'load balancer, or a single replica). Use --sync-down '
-                'to download logs from multiple sources.')
-    elif num_targets == 0:
-        raise click.UsageError(
-            'Specify a target to tail: --controller, --load-balancer, or '
-            'a REPLICA_ID.')
-    elif num_targets > 1:
-        raise click.UsageError(
-            'Can only tail logs from one target at a time. Use --sync-down '
-            'to download logs from multiple sources.')
-
-    # At this point, we have exactly one target for tailing.
-    assert len(chosen_components) == 1
-    assert len(replica_ids) in [0, 1]
-    target_component = chosen_components.pop()
-    target_replica_id: Optional[int] = replica_ids[0] if replica_ids else None
-
-    try:
-        serve_lib.tail_logs(service_name,
-                            target=target_component,
-                            replica_id=target_replica_id,
-                            follow=follow,
-                            tail=tail)
-    except exceptions.ClusterNotUpError:
-        with ux_utils.print_exception_no_traceback():
-            raise
+    _handle_serve_logs(service_name,
+                       follow=follow,
+                       controller=controller,
+                       load_balancer=load_balancer,
+                       replica_ids=replica_ids,
+                       sync_down=sync_down,
+                       tail=tail,
+                       pool=False)
 
 
 @cli.group(cls=_NaturalOrderGroup, hidden=True)
@@ -5924,7 +6052,8 @@ def api_logs(request_id: Optional[str], server_logs: bool,
     if request_id is not None and log_path is not None:
         raise click.BadParameter(
             'Only one of request ID and log path can be provided.')
-    sdk.stream_and_get(request_id, log_path, tail)
+    sdk.stream_and_get(server_common.RequestId[None](request_id), log_path,
+                       tail)
 
 
 @api.command('cancel', cls=_DocumentedCodeCommand)