skypilot-nightly 1.0.0.dev20250812__py3-none-any.whl → 1.0.0.dev20250815__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.
Files changed (179)
  1. sky/__init__.py +4 -2
  2. sky/adaptors/nebius.py +43 -1
  3. sky/backends/backend_utils.py +74 -7
  4. sky/backends/cloud_vm_ray_backend.py +169 -29
  5. sky/catalog/cudo_catalog.py +1 -1
  6. sky/catalog/data_fetchers/fetch_cudo.py +1 -1
  7. sky/catalog/data_fetchers/fetch_nebius.py +6 -3
  8. sky/client/cli/command.py +62 -85
  9. sky/client/common.py +1 -1
  10. sky/client/sdk.py +69 -19
  11. sky/client/sdk_async.py +5 -4
  12. sky/clouds/aws.py +52 -1
  13. sky/clouds/kubernetes.py +15 -5
  14. sky/clouds/nebius.py +3 -1
  15. sky/dag.py +1 -0
  16. sky/dashboard/out/404.html +1 -1
  17. sky/dashboard/out/_next/static/I-djf3wB8zZl_bI67BOyZ/_buildManifest.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/1141-a96678fed5043c12.js +1 -0
  19. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/3015-77d22ae2fad4071c.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/3785.8ce85b31e5c602e9.js +1 -0
  22. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +21 -0
  23. sky/dashboard/out/_next/static/chunks/4509-fa63866741388427.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +10 -0
  25. sky/dashboard/out/_next/static/chunks/4725.68d5ce4d6bcb7991.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/6014.d466a44b73af8348.js +6 -0
  27. sky/dashboard/out/_next/static/chunks/{6135-85426374db04811e.js → 6135-4b4d5e824b7f9d3c.js} +1 -1
  28. sky/dashboard/out/_next/static/chunks/6633-efe924b9b8136699.js +40 -0
  29. sky/dashboard/out/_next/static/chunks/6856-58370d8c9a79f72b.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/{6989-6129c1cfbcf51063.js → 6989-01359c57e018caa4.js} +1 -1
  31. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +6 -0
  33. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +18 -0
  34. sky/dashboard/out/_next/static/chunks/7557-5855617d0421ed55.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/8310.4ae62d5937045bf3.js +31 -0
  36. sky/dashboard/out/_next/static/chunks/8838.e7953f42af2b0544.js +45 -0
  37. sky/dashboard/out/_next/static/chunks/8969-6d493b1e2fa45826.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/{1871-980a395e92633a5c.js → 9037-f71c3c42670a4be0.js} +2 -2
  39. sky/dashboard/out/_next/static/chunks/9277.71481d5b2e606e33.js +51 -0
  40. sky/dashboard/out/_next/static/chunks/pages/{_app-491a4d699d95e808.js → _app-ce361c6959bc2001.js} +2 -2
  41. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/{[job]-078751bad714c017.js → [job]-6d43d6a6bd1d4c77.js} +2 -2
  42. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-30c5954a7b1f67d7.js +16 -0
  43. sky/dashboard/out/_next/static/chunks/pages/clusters-fa94c3548b5834aa.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-13d53fffc03ccb52.js → [context]-5264c5645299cde9.js} +1 -1
  45. sky/dashboard/out/_next/static/chunks/pages/{infra-fc9222e26c8e2f0d.js → infra-83991650ae4bd083.js} +1 -1
  46. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-ad2cd5aab787bc15.js +6 -0
  47. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-f5ccf5d39d87aebe.js → [pool]-7d4182df6625fe10.js} +2 -7
  48. sky/dashboard/out/_next/static/chunks/pages/jobs-c6a6a8a737ad7e2d.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/pages/users-d112a9b3d854abb2.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/volumes-b87fec189298a0c0.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-f72f73bcef9541dc.js → [name]-8a86ca4c98812df9.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/pages/workspaces-74ef46fc370f7c71.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/webpack-aba778a6d6eb496d.js +1 -0
  54. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  55. sky/dashboard/out/clusters/[cluster].html +1 -1
  56. sky/dashboard/out/clusters.html +1 -1
  57. sky/dashboard/out/config.html +1 -1
  58. sky/dashboard/out/index.html +1 -1
  59. sky/dashboard/out/infra/[context].html +1 -1
  60. sky/dashboard/out/infra.html +1 -1
  61. sky/dashboard/out/jobs/[job].html +1 -1
  62. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  63. sky/dashboard/out/jobs.html +1 -1
  64. sky/dashboard/out/users.html +1 -1
  65. sky/dashboard/out/volumes.html +1 -1
  66. sky/dashboard/out/workspace/new.html +1 -1
  67. sky/dashboard/out/workspaces/[name].html +1 -1
  68. sky/dashboard/out/workspaces.html +1 -1
  69. sky/data/storage.py +11 -1
  70. sky/exceptions.py +5 -0
  71. sky/execution.py +13 -10
  72. sky/global_user_state.py +191 -8
  73. sky/jobs/constants.py +1 -1
  74. sky/jobs/controller.py +0 -1
  75. sky/jobs/recovery_strategy.py +3 -3
  76. sky/jobs/scheduler.py +35 -87
  77. sky/jobs/server/core.py +82 -22
  78. sky/jobs/server/utils.py +1 -1
  79. sky/jobs/state.py +7 -5
  80. sky/jobs/utils.py +167 -8
  81. sky/provision/__init__.py +1 -0
  82. sky/provision/aws/config.py +25 -0
  83. sky/provision/aws/instance.py +37 -13
  84. sky/provision/azure/instance.py +2 -0
  85. sky/provision/cudo/cudo_wrapper.py +1 -1
  86. sky/provision/cudo/instance.py +2 -0
  87. sky/provision/do/instance.py +2 -0
  88. sky/provision/fluidstack/instance.py +2 -0
  89. sky/provision/gcp/instance.py +2 -0
  90. sky/provision/hyperbolic/instance.py +2 -1
  91. sky/provision/kubernetes/instance.py +133 -0
  92. sky/provision/lambda_cloud/instance.py +2 -0
  93. sky/provision/nebius/instance.py +2 -0
  94. sky/provision/nebius/utils.py +101 -86
  95. sky/provision/oci/instance.py +2 -0
  96. sky/provision/paperspace/instance.py +2 -1
  97. sky/provision/paperspace/utils.py +1 -1
  98. sky/provision/provisioner.py +13 -8
  99. sky/provision/runpod/instance.py +2 -0
  100. sky/provision/runpod/utils.py +1 -1
  101. sky/provision/scp/instance.py +2 -0
  102. sky/provision/vast/instance.py +2 -0
  103. sky/provision/vsphere/instance.py +2 -0
  104. sky/resources.py +6 -7
  105. sky/schemas/__init__.py +0 -0
  106. sky/schemas/api/__init__.py +0 -0
  107. sky/schemas/api/responses.py +70 -0
  108. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  109. sky/schemas/generated/__init__.py +0 -0
  110. sky/schemas/generated/autostopv1_pb2.py +36 -0
  111. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  112. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  113. sky/serve/constants.py +3 -7
  114. sky/serve/replica_managers.py +138 -117
  115. sky/serve/serve_state.py +42 -0
  116. sky/serve/serve_utils.py +58 -36
  117. sky/serve/server/impl.py +15 -19
  118. sky/serve/service.py +82 -33
  119. sky/server/constants.py +1 -1
  120. sky/server/requests/payloads.py +6 -0
  121. sky/server/requests/serializers/decoders.py +12 -2
  122. sky/server/requests/serializers/encoders.py +10 -2
  123. sky/server/server.py +64 -16
  124. sky/setup_files/dependencies.py +11 -10
  125. sky/skylet/autostop_lib.py +38 -5
  126. sky/skylet/constants.py +3 -1
  127. sky/skylet/services.py +44 -0
  128. sky/skylet/skylet.py +49 -4
  129. sky/task.py +19 -16
  130. sky/templates/aws-ray.yml.j2 +2 -2
  131. sky/templates/jobs-controller.yaml.j2 +6 -0
  132. sky/templates/kubernetes-ray.yml.j2 +1 -0
  133. sky/utils/command_runner.py +1 -1
  134. sky/utils/common_utils.py +20 -0
  135. sky/utils/config_utils.py +29 -5
  136. sky/utils/controller_utils.py +86 -0
  137. sky/utils/db/db_utils.py +17 -0
  138. sky/utils/db/migration_utils.py +1 -1
  139. sky/utils/log_utils.py +14 -5
  140. sky/utils/resources_utils.py +25 -1
  141. sky/utils/schemas.py +6 -0
  142. sky/utils/ux_utils.py +36 -5
  143. sky/volumes/server/core.py +2 -2
  144. sky/volumes/server/server.py +2 -2
  145. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/METADATA +5 -7
  146. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/RECORD +151 -142
  147. sky/dashboard/out/_next/static/Fuy7OzApYTUMz2QgoP7dP/_buildManifest.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/1141-a8a8f1adba34c892.js +0 -11
  149. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +0 -30
  150. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +0 -15
  151. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/3785.6003d293cb83eab4.js +0 -1
  153. sky/dashboard/out/_next/static/chunks/4725.29550342bd53afd8.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +0 -15
  155. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +0 -13
  156. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/691.5eeedf82cc243343.js +0 -55
  158. sky/dashboard/out/_next/static/chunks/6990-0f886f16e0d55ff8.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/8056-5bdeda81199c0def.js +0 -1
  160. sky/dashboard/out/_next/static/chunks/8252.62b0d23aed618bb2.js +0 -16
  161. sky/dashboard/out/_next/static/chunks/8969-c9686994ddafcf01.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/9159-11421c0f2909236f.js +0 -1
  163. sky/dashboard/out/_next/static/chunks/9360.85b0b1b4054574dd.js +0 -31
  164. sky/dashboard/out/_next/static/chunks/9666.cd4273f2a5c5802c.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/9847.757720f3b40c0aa5.js +0 -30
  166. sky/dashboard/out/_next/static/chunks/9984.c5564679e467d245.js +0 -1
  167. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-da9cc0901349c2e9.js +0 -1
  168. sky/dashboard/out/_next/static/chunks/pages/clusters-b30460f683e6ba96.js +0 -1
  169. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-154f55cf8af55be5.js +0 -11
  170. sky/dashboard/out/_next/static/chunks/pages/jobs-cdc60fb5d371e16a.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/pages/users-7ed36e44e779d5c7.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/pages/volumes-c9695d657f78b5dc.js +0 -1
  173. sky/dashboard/out/_next/static/chunks/pages/workspaces-8f67be60165724cc.js +0 -1
  174. sky/dashboard/out/_next/static/chunks/webpack-7fd0cf9dbecff10f.js +0 -1
  175. /sky/dashboard/out/_next/static/{Fuy7OzApYTUMz2QgoP7dP → I-djf3wB8zZl_bI67BOyZ}/_ssgManifest.js +0 -0
  176. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/WHEEL +0 -0
  177. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/entry_points.txt +0 -0
  178. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/licenses/LICENSE +0 -0
  179. {skypilot_nightly-1.0.0.dev20250812.dist-info → skypilot_nightly-1.0.0.dev20250815.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py CHANGED
@@ -57,21 +57,16 @@ else:
 
 logger = sky_logging.init_logger(__name__)
 
-
-@annotations.lru_cache(scope='request')
-def get_num_service_threshold():
-    """Get number of services threshold, calculating it only when needed."""
-    system_memory_gb = psutil.virtual_memory().total // (1024**3)
-    return system_memory_gb // constants.CONTROLLER_MEMORY_USAGE_GB
-
-
 _CONTROLLER_URL = 'http://localhost:{CONTROLLER_PORT}'
 
 # NOTE(dev): We assume log are print with the hint 'sky api logs -l'. Be careful
 # when changing UX as this assumption is used to expand some log files while
 # ignoring others.
 _SKYPILOT_LOG_HINT = r'.*sky api logs -l'
-_SKYPILOT_PROVISION_LOG_PATTERN = (fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
+_SKYPILOT_PROVISION_API_LOG_PATTERN = (
+    fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
+# New hint pattern for provision logs
+_SKYPILOT_PROVISION_LOG_CMD_PATTERN = r'.*sky logs --provision\s+(\S+)'
 _SKYPILOT_LOG_PATTERN = fr'{_SKYPILOT_LOG_HINT} (.*\.log)'
 
 # TODO(tian): Find all existing replica id and print here.
@@ -524,6 +519,8 @@ def generate_remote_tls_certfile_name(service_name: str) -> str:
 
 
 def generate_replica_cluster_name(service_name: str, replica_id: int) -> str:
+    # NOTE(dev): This format is used in sky/serve/service.py::_cleanup, for
+    # checking replica cluster existence. Be careful when changing it.
     return f'{service_name}-{replica_id}'
 
 
@@ -796,9 +793,13 @@ def load_version_string(payload: str) -> str:
     return message_utils.decode_payload(payload)
 
 
-def num_replicas(service_name: str) -> int:
+def get_ready_replicas(
+        service_name: str) -> List['replica_managers.ReplicaInfo']:
     logger.info(f'Get number of replicas for pool {service_name!r}')
-    return len(serve_state.get_replica_infos(service_name))
+    return [
+        info for info in serve_state.get_replica_infos(service_name)
+        if info.status == serve_state.ReplicaStatus.READY
+    ]
 
 
 def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
@@ -823,12 +824,8 @@ def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
         logger.error(f'Service {service_name!r} is not a cluster pool.')
         return None
     with filelock.FileLock(get_service_filelock_path(service_name)):
-
         logger.debug(f'Get next cluster name for pool {service_name!r}')
-        ready_replicas = [
-            info for info in serve_state.get_replica_infos(service_name)
-            if info.status == serve_state.ReplicaStatus.READY
-        ]
+        ready_replicas = get_ready_replicas(service_name)
         idle_replicas: List['replica_managers.ReplicaInfo'] = []
         for replica_info in ready_replicas:
             jobs_on_replica = managed_job_state.get_nonterminal_job_ids_by_pool(
@@ -1044,11 +1041,18 @@ def wait_service_registration(service_name: str, job_id: int,
             lb_port = record['load_balancer_port']
             if lb_port is not None:
                 return message_utils.encode_payload(lb_port)
-        elif len(serve_state.get_services()) >= get_num_service_threshold():
-            with ux_utils.print_exception_no_traceback():
-                raise RuntimeError('Max number of services reached. '
-                                   'To spin up more services, please '
-                                   'tear down some existing services.')
+        else:
+            controller_log_path = os.path.expanduser(
+                generate_remote_controller_log_file_name(service_name))
+            if os.path.exists(controller_log_path):
+                with open(controller_log_path, 'r', encoding='utf-8') as f:
+                    log_content = f.read()
+                if (constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR
+                        in log_content):
+                    with ux_utils.print_exception_no_traceback():
+                        raise RuntimeError('Max number of services reached. '
+                                           'To spin up more services, please '
+                                           'tear down some existing services.')
         elapsed = time.time() - start_time
         if elapsed > constants.SERVICE_REGISTER_TIMEOUT_SECONDS:
             # Print the controller log to help user debug.
@@ -1113,31 +1117,49 @@ def _process_line(line: str,
             return False
         return cluster_record['status'] == status_lib.ClusterStatus.UP
 
-    provision_log_prompt = re.match(_SKYPILOT_PROVISION_LOG_PATTERN, line)
+    provision_api_log_prompt = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN,
+                                        line)
+    provision_log_cmd_prompt = re.match(_SKYPILOT_PROVISION_LOG_CMD_PATTERN,
+                                        line)
     log_prompt = re.match(_SKYPILOT_LOG_PATTERN, line)
 
-    if provision_log_prompt is not None:
-        log_path = provision_log_prompt.group(1)
-        nested_log_path = pathlib.Path(
-            skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
-                log_path).resolve()
-
+    def _stream_provision_path(p: pathlib.Path) -> Iterator[str]:
         try:
-            with open(nested_log_path, 'r', newline='', encoding='utf-8') as f:
-                # We still exit if more than 10 seconds without new content
-                # to avoid any internal bug that causes the launch to fail
-                # while cluster status remains INIT.
+            with open(p, 'r', newline='', encoding='utf-8') as f:
+                # Exit if >10s without new content to avoid hanging when INIT
                 yield from log_utils.follow_logs(f,
                                                  should_stop=cluster_is_up,
                                                  stop_on_eof=stop_on_eof,
                                                  idle_timeout_seconds=10)
         except FileNotFoundError:
+            # Fall back cleanly if the hinted path doesn't exist
            yield line
-
             yield (f'{colorama.Fore.YELLOW}{colorama.Style.BRIGHT}'
-                   f'Try to expand log file {nested_log_path} but not '
-                   f'found. Skipping...{colorama.Style.RESET_ALL}')
-            pass
+                   f'Try to expand log file {p} but not found. Skipping...'
+                   f'{colorama.Style.RESET_ALL}')
+        return
+
+    if provision_api_log_prompt is not None:
+        rel_path = provision_api_log_prompt.group(1)
+        nested_log_path = pathlib.Path(
+            skylet_constants.SKY_LOGS_DIRECTORY).expanduser().joinpath(
+                rel_path).resolve()
+        yield from _stream_provision_path(nested_log_path)
+        return
+
+    if provision_log_cmd_prompt is not None:
+        # Resolve provision log via cluster table first, then history.
+        log_path_str = global_user_state.get_cluster_provision_log_path(
+            cluster_name)
+        if not log_path_str:
+            log_path_str = (
+                global_user_state.get_cluster_history_provision_log_path(
+                    cluster_name))
+        if not log_path_str:
+            yield line
+            return
+        yield from _stream_provision_path(
+            pathlib.Path(log_path_str).expanduser().resolve())
         return
 
     if log_prompt is not None:
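
The serve_utils.py change above adds a second provision-log hint (`sky logs --provision <cluster>`) alongside the existing API-server hint. A minimal, self-contained sketch of how the two patterns route a hint line; the patterns are copied from the diff, while the `classify` helper and the sample lines are hypothetical illustrations:

import re

_SKYPILOT_LOG_HINT = r'.*sky api logs -l'
_SKYPILOT_PROVISION_API_LOG_PATTERN = (
    fr'{_SKYPILOT_LOG_HINT} (.*/provision\.log)')
_SKYPILOT_PROVISION_LOG_CMD_PATTERN = r'.*sky logs --provision\s+(\S+)'


def classify(line: str):
    """Return which provision-log hint (if any) a log line carries."""
    m = re.match(_SKYPILOT_PROVISION_API_LOG_PATTERN, line)
    if m is not None:
        return ('api-log-path', m.group(1))
    m = re.match(_SKYPILOT_PROVISION_LOG_CMD_PATTERN, line)
    if m is not None:
        return ('cluster-name', m.group(1))
    return (None, None)


# Hypothetical hint lines, for illustration only.
print(classify('To stream logs: sky api logs -l sky-2025-08/head/provision.log'))
# -> ('api-log-path', 'sky-2025-08/head/provision.log')
print(classify('To stream provisioning logs: sky logs --provision my-pool-1'))
# -> ('cluster-name', 'my-pool-1')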
sky/serve/server/impl.py CHANGED
@@ -11,7 +11,6 @@ import uuid
 import colorama
 import filelock
 
-import sky
 from sky import backends
 from sky import exceptions
 from sky import execution
@@ -25,6 +24,7 @@ from sky.serve import constants as serve_constants
 from sky.serve import serve_state
 from sky.serve import serve_utils
 from sky.skylet import constants
+from sky.skylet import job_lib
 from sky.utils import admin_policy_utils
 from sky.utils import command_runner
 from sky.utils import common
@@ -39,7 +39,7 @@ logger = sky_logging.init_logger(__name__)
 
 
 def _rewrite_tls_credential_paths_and_get_tls_env_vars(
-        service_name: str, task: 'sky.Task') -> Dict[str, Any]:
+        service_name: str, task: 'task_lib.Task') -> Dict[str, Any]:
     """Rewrite the paths of TLS credentials in the task.
 
     Args:
@@ -103,15 +103,11 @@ def _get_service_record(
 
 
 def up(
-    task: 'sky.Task',
+    task: 'task_lib.Task',
     service_name: Optional[str] = None,
     pool: bool = False,
 ) -> Tuple[str, str]:
     """Spins up a service or a pool."""
-    if pool and not serve_utils.is_consolidation_mode(pool):
-        raise ValueError(
-            'Pool is only supported in consolidation mode. To fix, set '
-            '`jobs.controller.consolidation_mode: true` in SkyPilot config.')
     task.validate()
     serve_utils.validate_service_task(task, pool=pool)
     assert task.service is not None
@@ -191,8 +187,7 @@ def up(
     controller_log_file = (
         serve_utils.generate_remote_controller_log_file_name(service_name))
     controller_resources = controller_utils.get_controller_resources(
-        controller=controller_utils.Controllers.SKY_SERVE_CONTROLLER,
-        task_resources=task.resources)
+        controller=controller, task_resources=task.resources)
     controller_job_id = None
     if serve_utils.is_consolidation_mode(pool):
         # We need a unique integer per sky.serve.up call to avoid name
@@ -228,10 +223,11 @@ def up(
     # balancer port from the controller? So we don't need to open so many
     # ports here. Or, we should have a nginx traffic control to refuse
     # any connection to the unregistered ports.
-    controller_resources = {
-        r.copy(ports=[serve_constants.LOAD_BALANCER_PORT_RANGE])
-        for r in controller_resources
-    }
+    if not pool:
+        controller_resources = {
+            r.copy(ports=[serve_constants.LOAD_BALANCER_PORT_RANGE])
+            for r in controller_resources
+        }
     controller_task.set_resources(controller_resources)
 
     # # Set service_name so the backend will know to modify default ray
@@ -325,7 +321,7 @@ def up(
             [controller_job_id],
             stream_logs=False)
         controller_job_status = list(statuses.values())[0]
-        if controller_job_status == sky.JobStatus.PENDING:
+        if controller_job_status == job_lib.JobStatus.PENDING:
             # Max number of services reached due to vCPU constraint.
             # The controller job is pending due to ray job scheduling.
             # We manually cancel the job here.
@@ -350,7 +346,7 @@ def up(
     else:
         lb_port = serve_utils.load_service_initialization_result(
            lb_port_payload)
-        if not serve_utils.is_consolidation_mode(pool):
+        if not serve_utils.is_consolidation_mode(pool) and not pool:
            socket_endpoint = backend_utils.get_endpoints(
                controller_handle.cluster_name,
                lb_port,
@@ -374,10 +370,10 @@ def up(
            f'\n📋 Useful Commands'
            f'\n{ux_utils.INDENT_SYMBOL}To submit jobs to the pool:\t'
            f'{ux_utils.BOLD}sky jobs launch --pool {service_name} '
-            f'<run-command>{ux_utils.RESET_BOLD}'
+            f'<yaml_file>{ux_utils.RESET_BOLD}'
            f'\n{ux_utils.INDENT_SYMBOL}To submit multiple jobs:\t'
            f'{ux_utils.BOLD}sky jobs launch --pool {service_name} '
-            f'--num-jobs 10 <run-command>{ux_utils.RESET_BOLD}'
+            f'--num-jobs 10 <yaml_file>{ux_utils.RESET_BOLD}'
            f'\n{ux_utils.INDENT_SYMBOL}To check the pool status:\t'
            f'{ux_utils.BOLD}sky jobs pool status {service_name}'
            f'{ux_utils.RESET_BOLD}'
@@ -421,7 +417,7 @@ def up(
 
 
 def update(
-    task: 'sky.Task',
+    task: 'task_lib.Task',
     service_name: str,
     mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
     pool: bool = False,
@@ -576,7 +572,7 @@ def update(
 
 
 def apply(
-    task: 'sky.Task',
+    task: 'task_lib.Task',
     service_name: str,
     mode: serve_utils.UpdateMode = serve_utils.DEFAULT_UPDATE_MODE,
     pool: bool = False,
sky/serve/service.py CHANGED
@@ -15,11 +15,13 @@ import filelock
 
 from sky import authentication
 from sky import exceptions
+from sky import global_user_state
 from sky import sky_logging
 from sky import task as task_lib
 from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
 from sky.data import data_utils
+from sky.jobs import scheduler as jobs_scheduler
 from sky.serve import constants
 from sky.serve import controller
 from sky.serve import load_balancer
@@ -28,6 +30,7 @@ from sky.serve import serve_state
 from sky.serve import serve_utils
 from sky.skylet import constants as skylet_constants
 from sky.utils import common_utils
+from sky.utils import controller_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 
@@ -110,6 +113,9 @@ def cleanup_storage(task_yaml: str) -> bool:
     return not failed
 
 
+# NOTE(dev): We don't need to acquire the `with_lock` in replica manager here
+# because we killed all the processes (controller & replica manager) before
+# calling this function.
 def _cleanup(service_name: str) -> bool:
     """Clean up all service related resources, i.e. replicas and storage."""
     # Cleanup the HA recovery script first as it is possible that some error
@@ -120,31 +126,71 @@ def _cleanup(service_name: str) -> bool:
     replica_infos = serve_state.get_replica_infos(service_name)
     info2proc: Dict[replica_managers.ReplicaInfo,
                     multiprocessing.Process] = dict()
+    # NOTE(dev): This relies on `sky/serve/serve_utils.py::
+    # generate_replica_cluster_name`. Change it if you change the function.
+    existing_cluster_names = global_user_state.get_cluster_names_start_with(
+        service_name)
     for info in replica_infos:
+        if info.cluster_name not in existing_cluster_names:
+            logger.info(f'Cluster {info.cluster_name} for replica '
+                        f'{info.replica_id} not found. Might be a failed '
+                        'cluster. Skipping.')
+            continue
         p = multiprocessing.Process(target=replica_managers.terminate_cluster,
                                     args=(info.cluster_name,))
-        p.start()
         info2proc[info] = p
         # Set replica status to `SHUTTING_DOWN`
         info.status_property.sky_launch_status = (
-            replica_managers.ProcessStatus.SUCCEEDED)
+            replica_managers.common_utils.ProcessStatus.SUCCEEDED)
         info.status_property.sky_down_status = (
-            replica_managers.ProcessStatus.RUNNING)
+            replica_managers.common_utils.ProcessStatus.SCHEDULED)
         serve_state.add_or_update_replica(service_name, info.replica_id, info)
-        logger.info(f'Terminating replica {info.replica_id} ...')
-    for info, p in info2proc.items():
-        p.join()
-        if p.exitcode == 0:
-            serve_state.remove_replica(service_name, info.replica_id)
-            logger.info(f'Replica {info.replica_id} terminated successfully.')
-        else:
-            # Set replica status to `FAILED_CLEANUP`
-            info.status_property.sky_down_status = (
-                replica_managers.ProcessStatus.FAILED)
-            serve_state.add_or_update_replica(service_name, info.replica_id,
-                                              info)
-            failed = True
-            logger.error(f'Replica {info.replica_id} failed to terminate.')
+        logger.info(f'Scheduling to terminate replica {info.replica_id} ...')
+
+    def _set_to_failed_cleanup(info: replica_managers.ReplicaInfo) -> None:
+        nonlocal failed
+        # Set replica status to `FAILED_CLEANUP`
+        info.status_property.sky_down_status = (
+            replica_managers.common_utils.ProcessStatus.FAILED)
+        serve_state.add_or_update_replica(service_name, info.replica_id, info)
+        failed = True
+        logger.error(f'Replica {info.replica_id} failed to terminate.')
+
+    # Please reference to sky/serve/replica_managers.py::_refresh_process_pool.
+    # TODO(tian): Refactor to use the same logic and code.
+    while info2proc:
+        snapshot = list(info2proc.items())
+        for info, p in snapshot:
+            if p.is_alive():
+                continue
+            if (info.status_property.sky_down_status ==
+                    replica_managers.common_utils.ProcessStatus.SCHEDULED):
+                if controller_utils.can_terminate():
+                    try:
+                        p.start()
+                    except Exception as e:  # pylint: disable=broad-except
+                        _set_to_failed_cleanup(info)
+                        logger.error(f'Failed to start process for replica '
+                                     f'{info.replica_id}: {e}')
+                        del info2proc[info]
+                    else:
+                        info.status_property.sky_down_status = (
+                            common_utils.ProcessStatus.RUNNING)
+                        serve_state.add_or_update_replica(
+                            service_name, info.replica_id, info)
+            else:
+                logger.info('Terminate process for replica '
+                            f'{info.replica_id} finished.')
+                p.join()
+                del info2proc[info]
+                if p.exitcode == 0:
+                    serve_state.remove_replica(service_name, info.replica_id)
+                    logger.info(
+                        f'Replica {info.replica_id} terminated successfully.')
+                else:
+                    _set_to_failed_cleanup(info)
        time.sleep(3)
+
     versions = serve_state.get_service_versions(service_name)
     serve_state.remove_service_versions(service_name)
 
@@ -214,22 +260,25 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int, entrypoint: str):
         service_name, version)
 
     if not is_recovery:
-        if (len(serve_state.get_services()) >=
-                serve_utils.get_num_service_threshold()):
-            cleanup_storage(tmp_task_yaml)
-            with ux_utils.print_exception_no_traceback():
-                raise RuntimeError('Max number of services reached.')
-        success = serve_state.add_service(
-            service_name,
-            controller_job_id=job_id,
-            policy=service_spec.autoscaling_policy_str(),
-            requested_resources_str=backend_utils.get_task_resources_str(task),
-            load_balancing_policy=service_spec.load_balancing_policy,
-            status=serve_state.ServiceStatus.CONTROLLER_INIT,
-            tls_encrypted=service_spec.tls_credential is not None,
-            pool=service_spec.pool,
-            controller_pid=os.getpid(),
-            entrypoint=entrypoint)
+        with filelock.FileLock(controller_utils.get_resources_lock_path()):
+            if not controller_utils.can_start_new_process():
+                cleanup_storage(tmp_task_yaml)
+                with ux_utils.print_exception_no_traceback():
+                    raise RuntimeError(
+                        constants.MAX_NUMBER_OF_SERVICES_REACHED_ERROR)
+            success = serve_state.add_service(
+                service_name,
+                controller_job_id=job_id,
+                policy=service_spec.autoscaling_policy_str(),
+                requested_resources_str=backend_utils.get_task_resources_str(
+                    task),
+                load_balancing_policy=service_spec.load_balancing_policy,
+                status=serve_state.ServiceStatus.CONTROLLER_INIT,
+                tls_encrypted=service_spec.tls_credential is not None,
+                pool=service_spec.pool,
+                controller_pid=os.getpid(),
+                entrypoint=entrypoint)
+        jobs_scheduler.maybe_schedule_next_jobs()
     # Directly throw an error here. See sky/serve/api.py::up
     # for more details.
     if not success:
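
The reworked `_cleanup` above no longer starts every termination process immediately; replicas are marked SCHEDULED, started only when the controller has capacity, and reaped as they finish. A minimal sketch of that schedule-then-start pattern with plain `multiprocessing`; the `MAX_CONCURRENT` limit, `terminate_cluster` stand-in, and `cleanup` helper are hypothetical, whereas the real code consults `controller_utils.can_terminate()` and updates replica state:

import multiprocessing
import time

MAX_CONCURRENT = 2  # Hypothetical capacity limit.


def terminate_cluster(name: str) -> None:
    """Stand-in for replica_managers.terminate_cluster."""
    time.sleep(1)


def cleanup(cluster_names):
    # Create (but do not start) one process per cluster, mirroring the diff.
    pending = {
        name: multiprocessing.Process(target=terminate_cluster, args=(name,))
        for name in cluster_names
    }
    running = {}
    while pending or running:
        # Reap finished terminations.
        for name, proc in list(running.items()):
            if not proc.is_alive():
                proc.join()
                print(f'{name}: exit code {proc.exitcode}')
                del running[name]
        # Start more work only while under the capacity limit.
        while pending and len(running) < MAX_CONCURRENT:
            name, proc = pending.popitem()
            proc.start()
            running[name] = proc
        time.sleep(0.5)


if __name__ == '__main__':
    cleanup([f'pool-{i}' for i in range(5)])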
sky/server/constants.py CHANGED
@@ -10,7 +10,7 @@ from sky.skylet import constants
 # based on version info is needed.
 # For more details and code guidelines, refer to:
 # https://docs.skypilot.co/en/latest/developers/CONTRIBUTING.html#backward-compatibility-guidelines
-API_VERSION = 16
+API_VERSION = 17
 
 # The minimum peer API version that the code should still work with.
 # Notes (dev):
sky/server/requests/payloads.py CHANGED
@@ -497,6 +497,12 @@ class JobsQueueBody(RequestBody):
     skip_finished: bool = False
     all_users: bool = False
     job_ids: Optional[List[int]] = None
+    user_match: Optional[str] = None
+    workspace_match: Optional[str] = None
+    name_match: Optional[str] = None
+    pool_match: Optional[str] = None
+    page: Optional[int] = None
+    limit: Optional[int] = None
 
 
 class JobsCancelBody(RequestBody):
sky/server/requests/serializers/decoders.py CHANGED
@@ -102,8 +102,18 @@ def decode_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
 
 
 @register_decoders('jobs.queue')
-def decode_jobs_queue(return_value: List[dict],) -> List[Dict[str, Any]]:
-    jobs = return_value
+def decode_jobs_queue(return_value):
+    """Decode jobs queue response.
+
+    Supports legacy list, or a dict {jobs, total}.
+    - Returns list[job]
+    """
+    # Case 1: dict shape {jobs, total}
+    if isinstance(return_value, dict) and 'jobs' in return_value:
+        jobs = return_value.get('jobs', [])
+    else:
+        # Case 2: legacy list
+        jobs = return_value
     for job in jobs:
         job['status'] = managed_jobs.ManagedJobStatus(job['status'])
     return jobs
sky/server/requests/serializers/encoders.py CHANGED
@@ -106,10 +106,18 @@ def encode_status_kubernetes(
 
 
 @register_encoder('jobs.queue')
-def encode_jobs_queue(jobs: List[dict],) -> List[Dict[str, Any]]:
+def encode_jobs_queue(jobs_or_tuple):
+    # Support returning either a plain jobs list or a (jobs, total) tuple
+    if isinstance(jobs_or_tuple, tuple) and len(jobs_or_tuple) == 2:
+        jobs, total = jobs_or_tuple
+    else:
+        jobs = jobs_or_tuple
+        total = None
     for job in jobs:
         job['status'] = job['status'].value
-    return jobs
+    if total is None:
+        return jobs
+    return {'jobs': jobs, 'total': total}
 
 
 def _encode_serve_status(
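
The encoder/decoder changes above let `jobs.queue` return either the legacy list or a paginated `{jobs, total}` dict. A self-contained sketch of that round trip, mirroring the logic in the diff but with the `ManagedJobStatus` enum conversion omitted for brevity:

from typing import Any, Dict, List


def encode_jobs_queue(jobs_or_tuple):
    """Mirror of the encoder above, minus the status-enum step."""
    if isinstance(jobs_or_tuple, tuple) and len(jobs_or_tuple) == 2:
        jobs, total = jobs_or_tuple
    else:
        jobs, total = jobs_or_tuple, None
    if total is None:
        return jobs                         # legacy shape
    return {'jobs': jobs, 'total': total}   # paginated shape


def decode_jobs_queue(return_value) -> List[Dict[str, Any]]:
    """Mirror of the decoder above: always hands back a plain list of jobs."""
    if isinstance(return_value, dict) and 'jobs' in return_value:
        return return_value.get('jobs', [])
    return return_value


# Old servers return a bare list; new servers may return {jobs, total}.
legacy = encode_jobs_queue([{'id': 1}, {'id': 2}])
paginated = encode_jobs_queue(([{'id': 1}], 42))
assert decode_jobs_queue(legacy) == [{'id': 1}, {'id': 2}]
assert decode_jobs_queue(paginated) == [{'id': 1}]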
sky/server/server.py CHANGED
@@ -17,7 +17,7 @@ import resource
 import shutil
 import sys
 import threading
-from typing import Any, Dict, List, Literal, Optional, Set, Tuple
+from typing import Dict, List, Literal, Optional, Set, Tuple
 import uuid
 import zipfile
 
@@ -42,6 +42,7 @@ from sky.data import storage_utils
 from sky.jobs.server import server as jobs_rest
 from sky.metrics import utils as metrics_utils
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.schemas.api import responses
 from sky.serve.server import server as serve_rest
 from sky.server import common
 from sky.server import config as server_config
@@ -791,8 +792,6 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
     ctx.override_envs(validate_body.env_vars)
 
     def validate_dag(dag: dag_utils.dag_lib.Dag):
-        # Resolve the volumes before admin policy and validation.
-        dag.resolve_and_validate_volumes()
         # TODO: Admin policy may contain arbitrary code, which may be expensive
         # to run and may block the server thread. However, moving it into the
         # executor adds a ~150ms penalty on the local API server because of
@@ -801,6 +800,7 @@ async def validate(validate_body: payloads.ValidateBody) -> None:
         with admin_policy_utils.apply_and_use_config_in_current_request(
                 dag,
                 request_options=validate_body.get_request_options()) as dag:
+            dag.resolve_and_validate_volumes()
             # Skip validating workdir and file_mounts, as those need to be
             # validated after the files are uploaded to the SkyPilot API server
             # with `upload_mounts_to_api_server`.
@@ -1283,6 +1283,46 @@ async def download(download_body: payloads.DownloadBody) -> None:
             detail=f'Error creating zip file: {str(e)}')
 
 
+@app.post('/provision_logs')
+async def provision_logs(cluster_body: payloads.ClusterNameBody,
+                         follow: bool = True,
+                         tail: int = 0) -> fastapi.responses.StreamingResponse:
+    """Streams the provision.log for the latest launch request of a cluster."""
+    # Prefer clusters table first, then cluster_history as fallback.
+    log_path_str = global_user_state.get_cluster_provision_log_path(
+        cluster_body.cluster_name)
+    if not log_path_str:
+        log_path_str = global_user_state.get_cluster_history_provision_log_path(
+            cluster_body.cluster_name)
+    if not log_path_str:
+        raise fastapi.HTTPException(
+            status_code=404,
+            detail=('Provision log path is not recorded for this cluster. '
+                    'Please relaunch to generate provisioning logs.'))
+
+    log_path = pathlib.Path(log_path_str).expanduser().resolve()
+    if not log_path.exists():
+        raise fastapi.HTTPException(
+            status_code=404,
+            detail=f'Provision log path does not exist: {str(log_path)}')
+
+    # Tail semantics: 0 means print all lines. Convert 0 -> None for streamer.
+    effective_tail = None if tail is None or tail <= 0 else tail
+
+    return fastapi.responses.StreamingResponse(
+        content=stream_utils.log_streamer(None,
+                                          log_path,
+                                          tail=effective_tail,
+                                          follow=follow),
+        media_type='text/plain',
+        headers={
+            'Cache-Control': 'no-cache, no-transform',
+            'X-Accel-Buffering': 'no',
+            'Transfer-Encoding': 'chunked',
+        },
+    )
+
+
 @app.post('/cost_report')
 async def cost_report(request: fastapi.Request,
                       cost_report_body: payloads.CostReportBody) -> None:
@@ -1531,8 +1571,12 @@ async def api_status(
     return encoded_request_tasks
 
 
-@app.get('/api/health')
-async def health(request: fastapi.Request) -> Dict[str, Any]:
+@app.get(
+    '/api/health',
+    # response_model_exclude_unset omits unset fields
+    # in the response JSON.
+    response_model_exclude_unset=True)
+async def health(request: fastapi.Request) -> responses.APIHealthResponse:
     """Checks the health of the API server.
 
     Returns:
@@ -1570,7 +1614,8 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
         # - There is no harm when an malicious client calls /api/health
         #   without authentication since no sensitive information is
         #   returned.
-        return {'status': common.ApiServerStatus.HEALTHY}
+        return responses.APIHealthResponse(
+            status=common.ApiServerStatus.HEALTHY,)
     # TODO(aylei): remove this after min_compatible_api_version >= 14.
     if client_version < 14:
         # For Client with API version < 14, the NEEDS_AUTH status is not
@@ -1579,19 +1624,19 @@ async def health(request: fastapi.Request) -> Dict[str, Any]:
                 detail='Authentication required')
 
     logger.debug(f'Health endpoint: request.state.auth_user = {user}')
-    return {
-        'status': server_status,
+    return responses.APIHealthResponse(
+        status=server_status,
         # Kept for backward compatibility, clients before 0.11.0 will read this
         # field to check compatibility and hint the user to upgrade the CLI.
         # TODO(aylei): remove this field after 0.13.0
-        'api_version': str(server_constants.API_VERSION),
-        'version': sky.__version__,
-        'version_on_disk': common.get_skypilot_version_on_disk(),
-        'commit': sky.__commit__,
-        'user': user.to_dict() if user is not None else None,
-        'basic_auth_enabled': os.environ.get(
-            constants.ENV_VAR_ENABLE_BASIC_AUTH, 'false').lower() == 'true',
-    }
+        api_version=str(server_constants.API_VERSION),
+        version=sky.__version__,
+        version_on_disk=common.get_skypilot_version_on_disk(),
+        commit=sky.__commit__,
+        basic_auth_enabled=os.environ.get(constants.ENV_VAR_ENABLE_BASIC_AUTH,
+                                          'false').lower() == 'true',
+        user=user if user is not None else None,
+    )
 
 
 @app.websocket('/kubernetes-pod-ssh-proxy')
@@ -1809,6 +1854,9 @@ if __name__ == '__main__':
         global_tasks.append(background.create_task(metrics_server.serve()))
         global_tasks.append(
            background.create_task(requests_lib.requests_gc_daemon()))
+        global_tasks.append(
+            background.create_task(
+                global_user_state.cluster_event_retention_daemon()))
         threading.Thread(target=background.run_forever, daemon=True).start()
 
         queue_server, workers = executor.start(config)
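
The new `/provision_logs` endpoint above streams a cluster's provision.log as plain text. A hedged client-side sketch using `requests`; the `stream_provision_logs` helper, the API server URL, and the assumption that `ClusterNameBody` serializes as a JSON body with a `cluster_name` field are illustrative only, and authentication is omitted:

import requests

API_SERVER = 'http://localhost:46580'  # hypothetical local API server address


def stream_provision_logs(cluster_name: str, tail: int = 0, follow: bool = False):
    """Stream provision.log for a cluster from the /provision_logs endpoint."""
    resp = requests.post(
        f'{API_SERVER}/provision_logs',
        params={'follow': str(follow).lower(), 'tail': tail},
        json={'cluster_name': cluster_name},  # assumed ClusterNameBody field
        stream=True,
    )
    resp.raise_for_status()  # 404 if no provision log is recorded for the cluster
    for line in resp.iter_lines(decode_unicode=True):
        print(line)


# stream_provision_logs('my-cluster', tail=100)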