skypilot-nightly 1.0.0.dev20250806__py3-none-any.whl → 1.0.0.dev20250808__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.

Note: this version of skypilot-nightly has been flagged as potentially problematic.

Files changed (137)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +20 -1
  3. sky/backends/cloud_vm_ray_backend.py +42 -6
  4. sky/check.py +11 -1
  5. sky/client/cli/command.py +248 -119
  6. sky/client/sdk.py +146 -66
  7. sky/client/sdk_async.py +5 -1
  8. sky/core.py +5 -2
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/-DXZksWqf2waNHeU9YTQe/_buildManifest.js +1 -0
  11. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +11 -0
  12. sky/dashboard/out/_next/static/chunks/1871-980a395e92633a5c.js +6 -0
  13. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +1 -0
  14. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +1 -0
  16. sky/dashboard/out/_next/static/chunks/{4937.d6bf67771e353356.js → 4937.a2baa2df5572a276.js} +1 -1
  17. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/{691.6d99cbfba347cebf.js → 691.5eeedf82cc243343.js} +1 -1
  20. sky/dashboard/out/_next/static/chunks/6989-6129c1cfbcf51063.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/8056-34d27f51e6d1c631.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +16 -0
  24. sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/9025.a1bef12d672bb66d.js +6 -0
  26. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +31 -0
  28. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +1 -0
  29. sky/dashboard/out/_next/static/chunks/{9847.4c46c5e229c78704.js → 9847.757720f3b40c0aa5.js} +1 -1
  30. sky/dashboard/out/_next/static/chunks/pages/{_app-2a43ea3241bbdacd.js → _app-491a4d699d95e808.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-ae17cec0fc6483d9.js +11 -0
  32. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-155d477a6c3e04e2.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/pages/{clusters-47f1ddae13a2f8e4.js → clusters-b30460f683e6ba96.js} +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-2a44e70b500b6b70.js → [context]-13d53fffc03ccb52.js} +1 -1
  36. sky/dashboard/out/_next/static/chunks/pages/{infra-22faac9325016d83.js → infra-fc9222e26c8e2f0d.js} +1 -1
  37. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +11 -0
  38. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-f5ccf5d39d87aebe.js +21 -0
  39. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/pages/{users-b90c865a690bfe84.js → users-7ed36e44e779d5c7.js} +1 -1
  41. sky/dashboard/out/_next/static/chunks/pages/{volumes-7af733f5d7b6ed1c.js → volumes-c9695d657f78b5dc.js} +1 -1
  42. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-f72f73bcef9541dc.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/webpack-339efec49c0cc7d0.js +1 -0
  46. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +3 -0
  47. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  48. sky/dashboard/out/clusters/[cluster].html +1 -1
  49. sky/dashboard/out/clusters.html +1 -1
  50. sky/dashboard/out/config.html +1 -1
  51. sky/dashboard/out/index.html +1 -1
  52. sky/dashboard/out/infra/[context].html +1 -1
  53. sky/dashboard/out/infra.html +1 -1
  54. sky/dashboard/out/jobs/[job].html +1 -1
  55. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  56. sky/dashboard/out/jobs.html +1 -1
  57. sky/dashboard/out/users.html +1 -1
  58. sky/dashboard/out/volumes.html +1 -1
  59. sky/dashboard/out/workspace/new.html +1 -1
  60. sky/dashboard/out/workspaces/[name].html +1 -1
  61. sky/dashboard/out/workspaces.html +1 -1
  62. sky/execution.py +6 -4
  63. sky/global_user_state.py +22 -3
  64. sky/jobs/__init__.py +2 -0
  65. sky/jobs/client/sdk.py +67 -19
  66. sky/jobs/controller.py +2 -1
  67. sky/jobs/server/core.py +48 -1
  68. sky/jobs/server/server.py +52 -3
  69. sky/jobs/state.py +5 -1
  70. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  71. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  72. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  73. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  74. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  75. sky/serve/client/impl.py +93 -6
  76. sky/serve/client/sdk.py +22 -53
  77. sky/serve/constants.py +2 -1
  78. sky/serve/controller.py +4 -2
  79. sky/serve/serve_state.py +444 -324
  80. sky/serve/serve_utils.py +77 -46
  81. sky/serve/server/core.py +13 -197
  82. sky/serve/server/impl.py +239 -2
  83. sky/serve/service.py +8 -3
  84. sky/server/common.py +18 -7
  85. sky/server/constants.py +1 -1
  86. sky/server/requests/executor.py +5 -3
  87. sky/server/requests/payloads.py +19 -0
  88. sky/setup_files/alembic.ini +4 -0
  89. sky/task.py +18 -11
  90. sky/templates/kubernetes-ray.yml.j2 +5 -0
  91. sky/templates/sky-serve-controller.yaml.j2 +1 -0
  92. sky/usage/usage_lib.py +8 -6
  93. sky/utils/annotations.py +8 -3
  94. sky/utils/cli_utils/status_utils.py +1 -1
  95. sky/utils/common_utils.py +11 -1
  96. sky/utils/db/db_utils.py +31 -0
  97. sky/utils/db/migration_utils.py +6 -2
  98. sky/utils/kubernetes/deploy_remote_cluster.py +3 -1
  99. sky/utils/resource_checker.py +162 -21
  100. sky/volumes/client/sdk.py +4 -4
  101. sky/workspaces/core.py +210 -6
  102. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/METADATA +19 -14
  103. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/RECORD +109 -103
  104. sky/client/sdk.pyi +0 -301
  105. sky/dashboard/out/_next/static/Gelsd19kVxXcX7aQQGsGu/_buildManifest.js +0 -1
  106. sky/dashboard/out/_next/static/chunks/1043-75af48ca5d5aaf57.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/1141-8678a9102cc5f67e.js +0 -11
  108. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +0 -1
  109. sky/dashboard/out/_next/static/chunks/1871-ced1c14230cad6e1.js +0 -6
  110. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +0 -1
  111. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +0 -1
  112. sky/dashboard/out/_next/static/chunks/2622-951867535095b0eb.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/3785.0a173cd4393f0fef.js +0 -1
  114. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +0 -1
  115. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +0 -16
  116. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +0 -1
  117. sky/dashboard/out/_next/static/chunks/6601-2109d22e7861861c.js +0 -1
  118. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  119. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +0 -1
  120. sky/dashboard/out/_next/static/chunks/9025.99f29acb7617963e.js +0 -6
  121. sky/dashboard/out/_next/static/chunks/938-bda2685db5eae6cf.js +0 -1
  122. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-7cb24da04ca00956.js +0 -11
  123. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-1e95993124dbfc57.js +0 -1
  124. sky/dashboard/out/_next/static/chunks/pages/config-d56e64f30db7b42e.js +0 -1
  125. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-90693cb88b5599a7.js +0 -11
  126. sky/dashboard/out/_next/static/chunks/pages/jobs-ab318e52eb4424a7.js +0 -1
  127. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +0 -1
  128. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-35e0de5bca55e594.js +0 -1
  129. sky/dashboard/out/_next/static/chunks/pages/workspaces-062525fb5462acb6.js +0 -1
  130. sky/dashboard/out/_next/static/chunks/webpack-387626669badf82e.js +0 -1
  131. sky/dashboard/out/_next/static/css/b3227360726f12eb.css +0 -3
  132. /sky/dashboard/out/_next/static/{Gelsd19kVxXcX7aQQGsGu → -DXZksWqf2waNHeU9YTQe}/_ssgManifest.js +0 -0
  133. /sky/dashboard/out/_next/static/chunks/{6135-2d7ed3350659d073.js → 6135-85426374db04811e.js} +0 -0
  134. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/WHEEL +0 -0
  135. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/entry_points.txt +0 -0
  136. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/licenses/LICENSE +0 -0
  137. {skypilot_nightly-1.0.0.dev20250806.dist-info → skypilot_nightly-1.0.0.dev20250808.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py CHANGED
@@ -20,6 +20,7 @@ import uuid
 
 import colorama
 import filelock
+import yaml
 
 from sky import backends
 from sky import exceptions
@@ -65,13 +66,12 @@ def get_num_service_threshold():
 
 _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
 
-# NOTE(dev): We assume log paths are either in ~/sky_logs/... or ~/.sky/...
-# and always appear after a space. Be careful when changing UX as this
-# assumption is used to expand some log files while ignoring others.
-_SKYPILOT_LOG_DIRS = r'~/(sky_logs|\.sky)'
-_SKYPILOT_PROVISION_LOG_PATTERN = (
-    fr'.* ({_SKYPILOT_LOG_DIRS}/.*provision\.log)')
-_SKYPILOT_LOG_PATTERN = fr'.* ({_SKYPILOT_LOG_DIRS}/.*\.log)'
+# NOTE(dev): We assume logs are printed with the hint 'sky api logs -l'. Be
+# careful when changing UX as this assumption is used to expand some log files
+# while ignoring others.
+_SKYPILOT_LOG_HINT = r'.*sky api logs -l'
+_SKYPILOT_PROVISION_LOG_PATTERN = (fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
+_SKYPILOT_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*\.log)'
 
 # TODO(tian): Find all existing replica id and print here.
 _FAILED_TO_FIND_REPLICA_MSG = (
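The new constants key off the `sky api logs -l` hint emitted in SkyPilot output instead of hard-coded `~/sky_logs`/`~/.sky` path prefixes. A minimal sketch of how a hint line would be matched and its path extracted, using the same pattern shapes as above (the sample log line is invented for illustration):

```python
import re

# Same shapes as the new constants in this diff.
_SKYPILOT_LOG_HINT = r'.*sky api logs -l'
_SKYPILOT_PROVISION_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)'
_SKYPILOT_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*\.log)'

# Hypothetical controller output line that carries the hint.
line = ('To stream logs: sky api logs -l '
        'sky-serve/my-svc/replica-1/provision.log')

match = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
if match is not None:
    # Group 1 is now a path relative to the logs directory, not an
    # absolute ~/sky_logs or ~/.sky path as in the old patterns.
    print(match.group(1))  # sky-serve/my-svc/replica-1/provision.log
```

Because both patterns are applied with `re.match`, a line is expanded only when the hint actually appears in it; everything else streams through untouched.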
@@ -668,12 +668,18 @@ def _get_service_status(
     if record['pool']:
         latest_yaml_path = generate_task_yaml_file_name(service_name,
                                                         record['version'])
-        original_config = common_utils.read_yaml(latest_yaml_path)
-        original_config.pop('run', None)
-        svc: Dict[str, Any] = original_config.pop('service')
-        if svc is not None:
-            svc.pop('pool', None)
-            original_config['pool'] = svc
+        raw_yaml_config = common_utils.read_yaml(latest_yaml_path)
+        original_config = raw_yaml_config.get('_user_specified_yaml')
+        if original_config is None:
+            # Fall back to old display format.
+            original_config = raw_yaml_config
+            original_config.pop('run', None)
+            svc: Dict[str, Any] = original_config.pop('service')
+            if svc is not None:
+                svc.pop('pool', None)  # Remove pool from service config
+                original_config['pool'] = svc  # Add pool to root config
+        else:
+            original_config = yaml.safe_load(original_config)
         record['pool_yaml'] = common_utils.dump_yaml_str(original_config)
 
     record['target_num_replicas'] = 0
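The display path now prefers a verbatim copy of the user's YAML (stored under a `_user_specified_yaml` key) and only reconstructs a pool config from the merged task YAML for services written by older controllers. A rough standalone sketch of that branch; the helper name and sample configs are invented, and `yaml.safe_dump` stands in for `common_utils.dump_yaml_str`:

```python
from typing import Any, Dict

import yaml


def pool_yaml_for_display(raw_yaml_config: Dict[str, Any]) -> str:
    """Mirrors the fallback logic in _get_service_status above."""
    original_config = raw_yaml_config.get('_user_specified_yaml')
    if original_config is None:
        # Old format: rebuild a pool config from the merged task YAML.
        original_config = raw_yaml_config
        original_config.pop('run', None)
        svc = original_config.pop('service', None)
        if svc is not None:
            svc.pop('pool', None)          # Remove pool from service config
            original_config['pool'] = svc  # Add pool to root config
    else:
        # New format: the user's YAML was stored verbatim as a string.
        original_config = yaml.safe_load(original_config)
    return yaml.safe_dump(original_config)


old_style = {'run': 'echo hi', 'service': {'pool': True, 'workers': 2}}
new_style = {'_user_specified_yaml': 'pool:\n  workers: 2\n'}
print(pool_yaml_for_display(old_style))
print(pool_yaml_for_display(new_style))
```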
@@ -959,8 +965,10 @@ def wait_service_registration(service_name: str, job_id: int,
     """
     start_time = time.time()
     setup_completed = False
+    noun = 'pool' if pool else 'service'
     while True:
-        # TODO(tian): PID-based tracking.
+        # Only do this check for non-consolidation mode as consolidation mode
+        # has no setup process.
         if not is_consolidation_mode(pool):
             job_status = job_lib.get_status(job_id)
             if job_status is None or job_status < job_lib.JobStatus.RUNNING:
@@ -971,7 +979,7 @@ def wait_service_registration(service_name: str, job_id: int,
                 with ux_utils.print_exception_no_traceback():
                     raise RuntimeError(
                         f'Failed to start the controller process for '
-                        f'the service {service_name!r} within '
+                        f'the {noun} {service_name!r} within '
                         f'{constants.CONTROLLER_SETUP_TIMEOUT_SECONDS}'
                         f' seconds.')
             # No need to check the service status as the controller process
@@ -979,22 +987,26 @@ def wait_service_registration(service_name: str, job_id: int,
                 time.sleep(1)
                 continue
 
-        if not setup_completed:
-            setup_completed = True
-            # Reset the start time to wait for the service to be registered.
-            start_time = time.time()
+            if not setup_completed:
+                setup_completed = True
+                # Reset the start time to wait for the service to be registered.
+                start_time = time.time()
 
-        record = serve_state.get_service_from_name(service_name)
+        record = _get_service_status(service_name,
+                                     pool=pool,
+                                     with_replica_info=False)
         if record is not None:
-            # TODO(tian): PID-based tracking.
-            if (not is_consolidation_mode(pool) and
-                    job_id != record['controller_job_id']):
+            if job_id != record['controller_job_id']:
+                if pool:
+                    command_to_run = 'sky jobs pool apply --pool'
+                else:
+                    command_to_run = 'sky serve update'
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
-                        f'The service {service_name!r} is already running. '
-                        'Please specify a different name for your service. '
-                        'To update an existing service, run: sky serve update '
-                        f'{service_name} <new-service-yaml>')
+                        f'The {noun} {service_name!r} is already running. '
+                        f'Please specify a different name for your {noun}. '
+                        f'To update an existing {noun}, run: {command_to_run}'
+                        f' {service_name} <new-{noun}-yaml>')
             lb_port = record['load_balancer_port']
             if lb_port is not None:
                 return message_utils.encode_payload(lb_port)
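Note the `start_time` reset: the loop effectively runs two budgets back to back, one for the controller process to come up and a fresh one for the service record to appear. A generic sketch of that two-phase wait, with made-up callables standing in for the job-status and service-record checks in the real loop:

```python
import time


def wait_two_phase(setup_done, get_record, setup_timeout, register_timeout):
    """Wait for setup, then restart the clock and wait for registration."""
    start_time = time.time()
    setup_completed = False
    while True:
        if not setup_completed:
            if setup_done():
                setup_completed = True
                # Reset the clock: registration gets its own budget.
                start_time = time.time()
            elif time.time() - start_time > setup_timeout:
                raise RuntimeError('Controller process did not start in time.')
        else:
            record = get_record()
            if record is not None:
                return record
            if time.time() - start_time > register_timeout:
                raise RuntimeError('Service was not registered in time.')
        time.sleep(1)
```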
@@ -1023,12 +1035,16 @@ def load_service_initialization_result(payload: str) -> int:
     return message_utils.decode_payload(payload)
 
 
-def check_service_status_healthy(service_name: str) -> Optional[str]:
-    service_record = serve_state.get_service_from_name(service_name)
+def _check_service_status_healthy(service_name: str,
+                                  pool: bool) -> Optional[str]:
+    service_record = _get_service_status(service_name,
+                                         pool,
+                                         with_replica_info=False)
+    capnoun = 'Service' if not pool else 'Pool'
     if service_record is None:
-        return f'Service {service_name!r} does not exist.'
+        return f'{capnoun} {service_name!r} does not exist.'
     if service_record['status'] == serve_state.ServiceStatus.CONTROLLER_INIT:
-        return (f'Service {service_name!r} is still initializing its '
+        return (f'{capnoun} {service_name!r} is still initializing its '
                 'controller. Please try again later.')
     return None
 
@@ -1067,7 +1083,10 @@ def _process_line(line: str,
     log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
 
     if provision_log_prompt is not None:
-        nested_log_path = os.path.expanduser(provision_log_prompt.group(1))
+        log_path = provision_log_prompt.group(1)
+        nested_log_path = pathlib.Path(
+            skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
+                log_path).resolve()
 
         try:
             with open(nested_log_path, 'r', newline='', encoding='utf-8') as f:
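Since the extracted path is now relative to the logs directory, the expansion joins it onto `SKY_LOGS_DIRECTORY` rather than calling `os.path.expanduser` on an absolute path. A minimal sketch of that resolution; the directory value here is an assumption, the real constant comes from skylet's constants module:

```python
import pathlib

# Assumed value; the real one is skylet_constants.SKY_LOGS_DIRECTORY.
SKY_LOGS_DIRECTORY = '~/sky_logs'

log_path = 'sky-serve/my-svc/replica-1/provision.log'  # regex group(1)
nested_log_path = pathlib.Path(
    SKY_LOGS_DIRECTORY).expanduser().joinpath(log_path).resolve()
print(nested_log_path)  # e.g. /home/user/sky_logs/sky-serve/.../provision.log
```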
@@ -1159,12 +1178,14 @@ def _capped_follow_logs_with_provision_expanding(
 
 
 def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
-                        tail: Optional[int]) -> str:
-    msg = check_service_status_healthy(service_name)
+                        tail: Optional[int], pool: bool) -> str:
+    msg = _check_service_status_healthy(service_name, pool=pool)
     if msg is not None:
         return msg
+    repnoun = 'worker' if pool else 'replica'
+    caprepnoun = repnoun.capitalize()
     print(f'{colorama.Fore.YELLOW}Start streaming logs for launching process '
-          f'of replica {replica_id}.{colorama.Style.RESET_ALL}')
+          f'of {repnoun} {replica_id}.{colorama.Style.RESET_ALL}')
     log_file_name = generate_replica_log_file_name(service_name, replica_id)
     if os.path.exists(log_file_name):
         if tail is not None:
@@ -1181,7 +1202,7 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
     launch_log_file_name = generate_replica_launch_log_file_name(
         service_name, replica_id)
     if not os.path.exists(launch_log_file_name):
-        return (f'{colorama.Fore.RED}Replica {replica_id} doesn\'t exist.'
+        return (f'{colorama.Fore.RED}{caprepnoun} {replica_id} doesn\'t exist.'
                 f'{colorama.Style.RESET_ALL}')
 
     replica_cluster_name = generate_replica_cluster_name(
@@ -1231,6 +1252,10 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
             print(line, end='', flush=True)
         return ''
 
+    # For pools, we don't stream the job logs as the run section is ignored.
+    if pool:
+        return ''
+
     backend = backends.CloudVmRayBackend()
     handle = global_user_state.get_handle_from_cluster_name(
         replica_cluster_name)
@@ -1245,13 +1270,13 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
 
     # Notify user here to make sure user won't think the log is finished.
     print(f'{colorama.Fore.YELLOW}Start streaming logs for task job '
-          f'of replica {replica_id}...{colorama.Style.RESET_ALL}')
+          f'of {repnoun} {replica_id}...{colorama.Style.RESET_ALL}')
 
     # Always tail the latest logs, which represent user setup & run.
     if tail is None:
         returncode = backend.tail_logs(handle, job_id=None, follow=follow)
         if returncode != 0:
-            return (f'{colorama.Fore.RED}Failed to stream logs for replica '
+            return (f'{colorama.Fore.RED}Failed to stream logs for {repnoun} '
                     f'{replica_id}.{colorama.Style.RESET_ALL}')
     elif not follow and tail > 0:
         final = backend.tail_logs(handle,
@@ -1278,8 +1303,9 @@ def stream_replica_logs(service_name: str, replica_id: int, follow: bool,
 
 
 def stream_serve_process_logs(service_name: str, stream_controller: bool,
-                              follow: bool, tail: Optional[int]) -> str:
-    msg = check_service_status_healthy(service_name)
+                              follow: bool, tail: Optional[int],
+                              pool: bool) -> str:
+    msg = _check_service_status_healthy(service_name, pool)
     if msg is not None:
         return msg
     if stream_controller:
@@ -1288,7 +1314,9 @@ def stream_serve_process_logs(service_name: str, stream_controller: bool,
     log_file = generate_remote_load_balancer_log_file_name(service_name)
 
     def _service_is_terminal() -> bool:
-        record = serve_state.get_service_from_name(service_name)
+        record = _get_service_status(service_name,
+                                     pool,
+                                     with_replica_info=False)
         if record is None:
             return True
         return record['status'] in serve_state.ServiceStatus.failed_statuses()
@@ -1531,21 +1559,24 @@ class ServeCodeGen:
 
     @classmethod
     def stream_replica_logs(cls, service_name: str, replica_id: int,
-                            follow: bool, tail: Optional[int]) -> str:
+                            follow: bool, tail: Optional[int],
+                            pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
            'msg = serve_utils.stream_replica_logs('
-            f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail})',
-            'print(msg, flush=True)'
+            f'{service_name!r}, {replica_id!r}, follow={follow}, tail={tail}, '
+            '**kwargs)', 'print(msg, flush=True)'
         ]
         return cls._build(code)
 
     @classmethod
     def stream_serve_process_logs(cls, service_name: str,
                                   stream_controller: bool, follow: bool,
-                                  tail: Optional[int]) -> str:
+                                  tail: Optional[int], pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 5 else {{"pool": {pool}}}',
             f'msg = serve_utils.stream_serve_process_logs({service_name!r}, '
-            f'{stream_controller}, follow={follow}, tail={tail})',
+            f'{stream_controller}, follow={follow}, tail={tail}, **kwargs)',
             'print(msg, flush=True)'
         ]
         return cls._build(code)
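Because the generated snippet executes on the controller, which may run an older SkyPilot, the new `pool` kwarg is gated on `serve_version` (presumably defined by the prologue that `_build` prepends): controllers older than version 5 get an empty kwargs dict and the pre-pool call signature. A toy sketch of the same guard, with an invented stand-in for the real serve_utils call:

```python
serve_version = 4  # e.g. what an older controller would report


def stream_replica_logs(name, replica_id, follow, tail, pool=False):
    """Stand-in for serve_utils.stream_replica_logs."""
    return f'{name} r{replica_id} follow={follow} tail={tail} pool={pool}'


# The guard emitted by ServeCodeGen: only pass `pool` to new controllers.
kwargs = {} if serve_version < 5 else {'pool': True}
msg = stream_replica_logs('my-pool', 1, follow=False, tail=10, **kwargs)
print(msg, flush=True)
```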
sky/serve/server/core.py CHANGED
@@ -1,9 +1,6 @@
 """SkyServe core APIs."""
-import pathlib
-import signal
-import threading
 import typing
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import Any, Dict, List, Optional, Tuple, Union
 
 from sky import backends
 from sky import exceptions
@@ -12,11 +9,8 @@ from sky.backends import backend_utils
 from sky.serve import serve_utils
 from sky.serve.server import impl
 from sky.usage import usage_lib
-from sky.utils import command_runner
 from sky.utils import controller_utils
-from sky.utils import rich_utils
 from sky.utils import subprocess_utils
-from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
     import sky
@@ -24,42 +18,6 @@ if typing.TYPE_CHECKING:
 logger = sky_logging.init_logger(__name__)
 
 
-def _get_all_replica_targets(
-        service_name: str, backend: backends.CloudVmRayBackend,
-        handle: backends.CloudVmRayResourceHandle
-) -> Set[serve_utils.ServiceComponentTarget]:
-    """Helper function to get targets for all live replicas."""
-    code = serve_utils.ServeCodeGen.get_service_status([service_name],
-                                                       pool=False)
-    returncode, serve_status_payload, stderr = backend.run_on_head(
-        handle,
-        code,
-        require_outputs=True,
-        stream_logs=False,
-        separate_stderr=True)
-
-    try:
-        subprocess_utils.handle_returncode(returncode,
-                                           code,
-                                           'Failed to fetch services',
-                                           stderr,
-                                           stream_logs=True)
-    except exceptions.CommandError as e:
-        raise RuntimeError(e.error_msg) from e
-
-    service_records = serve_utils.load_service_status(serve_status_payload)
-    if not service_records:
-        raise ValueError(f'Service {service_name!r} not found.')
-    assert len(service_records) == 1
-    service_record = service_records[0]
-
-    return {
-        serve_utils.ServiceComponentTarget(serve_utils.ServiceComponent.REPLICA,
-                                           replica_info['replica_id'])
-        for replica_info in service_record['replica_info']
-    }
-
-
 @usage_lib.entrypoint
 def up(
     task: 'sky.Task',
@@ -277,59 +235,12 @@ def tail_logs(
         sky.exceptions.ClusterNotUpError: the sky serve controller is not up.
         ValueError: arguments not valid, or failed to tail the logs.
     """
-    if isinstance(target, str):
-        target = serve_utils.ServiceComponent(target)
-    if not isinstance(target, serve_utils.ServiceComponent):
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'`target` must be a string or '
-                             f'sky.serve.ServiceComponent, got {type(target)}.')
-
-    if target == serve_utils.ServiceComponent.REPLICA:
-        if replica_id is None:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError(
-                    '`replica_id` must be specified when using target=REPLICA.')
-    else:
-        if replica_id is not None:
-            with ux_utils.print_exception_no_traceback():
-                raise ValueError('`replica_id` must be None when using '
-                                 'target=CONTROLLER/LOAD_BALANCER.')
-
-    controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
-    handle = backend_utils.is_controller_accessible(
-        controller=controller_type,
-        stopped_message=controller_type.value.default_hint_if_non_existent)
-
-    backend = backend_utils.get_backend_from_handle(handle)
-    assert isinstance(backend, backends.CloudVmRayBackend), backend
-
-    if target != serve_utils.ServiceComponent.REPLICA:
-        code = serve_utils.ServeCodeGen.stream_serve_process_logs(
-            service_name,
-            stream_controller=(
-                target == serve_utils.ServiceComponent.CONTROLLER),
-            follow=follow,
-            tail=tail)
-    else:
-        assert replica_id is not None, service_name
-        code = serve_utils.ServeCodeGen.stream_replica_logs(service_name,
-                                                            replica_id,
-                                                            follow,
-                                                            tail=tail)
-
-    # With the stdin=subprocess.DEVNULL, the ctrl-c will not directly
-    # kill the process, so we need to handle it manually here.
-    if threading.current_thread() is threading.main_thread():
-        signal.signal(signal.SIGINT, backend_utils.interrupt_handler)
-        signal.signal(signal.SIGTSTP, backend_utils.stop_handler)
-
-    # Refer to the notes in
-    # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
-    backend.run_on_head(handle,
-                        code,
-                        stream_logs=True,
-                        process_stream=False,
-                        ssh_mode=command_runner.SshMode.INTERACTIVE)
+    return impl.tail_logs(service_name,
+                          target=target,
+                          replica_id=replica_id,
+                          follow=follow,
+                          tail=tail,
+                          pool=False)
 
 
 @usage_lib.entrypoint
@@ -374,104 +285,9 @@ def sync_down_logs(
         sky.exceptions.ClusterNotUpError: If the controller is not up.
         ValueError: Arguments not valid.
     """
-    # Step 0) get the controller handle
-    with rich_utils.safe_status(
-            ux_utils.spinner_message('Checking service status...')):
-        controller_type = controller_utils.Controllers.SKY_SERVE_CONTROLLER
-        handle = backend_utils.is_controller_accessible(
-            controller=controller_type,
-            stopped_message=controller_type.value.default_hint_if_non_existent)
-        backend: backends.CloudVmRayBackend = (
-            backend_utils.get_backend_from_handle(handle))
-
-    requested_components: Set[serve_utils.ServiceComponent] = set()
-    if not targets:
-        # No targets specified -> request all components
-        requested_components = {
-            serve_utils.ServiceComponent.CONTROLLER,
-            serve_utils.ServiceComponent.LOAD_BALANCER,
-            serve_utils.ServiceComponent.REPLICA
-        }
-    else:
-        # Parse provided targets
-        if isinstance(targets, (str, serve_utils.ServiceComponent)):
-            requested_components = {serve_utils.ServiceComponent(targets)}
-        else:  # list
-            requested_components = {
-                serve_utils.ServiceComponent(t) for t in targets
-            }
-
-    normalized_targets: Set[serve_utils.ServiceComponentTarget] = set()
-    if serve_utils.ServiceComponent.CONTROLLER in requested_components:
-        normalized_targets.add(
-            serve_utils.ServiceComponentTarget(
-                serve_utils.ServiceComponent.CONTROLLER))
-    if serve_utils.ServiceComponent.LOAD_BALANCER in requested_components:
-        normalized_targets.add(
-            serve_utils.ServiceComponentTarget(
-                serve_utils.ServiceComponent.LOAD_BALANCER))
-    if serve_utils.ServiceComponent.REPLICA in requested_components:
-        with rich_utils.safe_status(
-                ux_utils.spinner_message('Getting live replica infos...')):
-            replica_targets = _get_all_replica_targets(service_name, backend,
-                                                       handle)
-        if not replica_ids:
-            # Replica target requested but no specific IDs
-            # -> Get all replica logs
-            normalized_targets.update(replica_targets)
-        else:
-            # Replica target requested with specific IDs
-            requested_replica_targets = [
-                serve_utils.ServiceComponentTarget(
-                    serve_utils.ServiceComponent.REPLICA, rid)
-                for rid in replica_ids
-            ]
-            for target in requested_replica_targets:
-                if target not in replica_targets:
-                    logger.warning(f'Replica ID {target.replica_id} not found '
-                                   f'for {service_name}. Skipping...')
-                else:
-                    normalized_targets.add(target)
-
-    def sync_down_logs_by_target(target: serve_utils.ServiceComponentTarget):
-        component = target.component
-        # We need to set one side of the pipe to a logs stream, and the other
-        # side to a file.
-        log_path = str(pathlib.Path(local_dir) / f'{target}.log')
-        stream_logs_code: str
-
-        if component == serve_utils.ServiceComponent.CONTROLLER:
-            stream_logs_code = (
-                serve_utils.ServeCodeGen.stream_serve_process_logs(
-                    service_name,
-                    stream_controller=True,
-                    follow=False,
-                    tail=tail))
-        elif component == serve_utils.ServiceComponent.LOAD_BALANCER:
-            stream_logs_code = (
-                serve_utils.ServeCodeGen.stream_serve_process_logs(
-                    service_name,
-                    stream_controller=False,
-                    follow=False,
-                    tail=tail))
-        elif component == serve_utils.ServiceComponent.REPLICA:
-            replica_id = target.replica_id
-            assert replica_id is not None, service_name
-            stream_logs_code = serve_utils.ServeCodeGen.stream_replica_logs(
-                service_name, replica_id, follow=False, tail=tail)
-        else:
-            assert False, component
-
-        # Refer to the notes in
-        # sky/backends/cloud_vm_ray_backend.py::CloudVmRayBackend::tail_logs.
-        backend.run_on_head(handle,
-                            stream_logs_code,
-                            stream_logs=False,
-                            process_stream=False,
-                            ssh_mode=command_runner.SshMode.INTERACTIVE,
-                            log_path=log_path)
-
-    subprocess_utils.run_in_parallel(sync_down_logs_by_target,
-                                     list(normalized_targets))
-
-    return local_dir
+    return impl.sync_down_logs(service_name,
+                               local_dir=local_dir,
+                               targets=targets,
+                               replica_ids=replica_ids,
+                               tail=tail,
+                               pool=False)
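Both entrypoints are now thin wrappers that forward to the shared implementation in sky/serve/server/impl.py with `pool=False`; presumably the pool-facing commands call the same impl functions with `pool=True`. A schematic sketch of the pattern; the names below are illustrative, not the actual contents of `impl`:

```python
# impl: one implementation backs both the serve and pool surfaces.
def tail_logs_impl(service_name, *, target, replica_id, follow, tail, pool):
    noun = 'pool' if pool else 'service'
    print(f'Tailing {noun} {service_name!r}: target={target}, '
          f'replica_id={replica_id}, follow={follow}, tail={tail}')


# serve-facing wrapper pins pool=False...
def serve_tail_logs(service_name, target, replica_id=None,
                    follow=True, tail=None):
    return tail_logs_impl(service_name, target=target, replica_id=replica_id,
                          follow=follow, tail=tail, pool=False)


# ...while a pool-facing wrapper would pin pool=True.
serve_tail_logs('my-service', target='controller')
```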