skypilot-nightly 1.0.0.dev20250729__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of skypilot-nightly has been flagged as potentially problematic.

Files changed (186)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +448 -60
  9. sky/client/common.py +12 -9
  10. sky/clouds/nebius.py +1 -1
  11. sky/clouds/utils/gcp_utils.py +1 -1
  12. sky/clouds/vast.py +1 -2
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/_next/static/chunks/1043-928582d4860fef92.js +1 -0
  15. sky/dashboard/out/_next/static/chunks/1141-3f10a5a9f697c630.js +11 -0
  16. sky/dashboard/out/_next/static/chunks/1559-6c00e20454194859.js +30 -0
  17. sky/dashboard/out/_next/static/chunks/1664-22b00e32c9ff96a4.js +1 -0
  18. sky/dashboard/out/_next/static/chunks/1871-1df8b686a51f3e3a.js +6 -0
  19. sky/dashboard/out/_next/static/chunks/2003.f90b06bb1f914295.js +1 -0
  20. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  21. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  22. sky/dashboard/out/_next/static/chunks/2641.142718b6b78a6f9b.js +1 -0
  23. sky/dashboard/out/_next/static/chunks/3698-7874720877646365.js +1 -0
  24. sky/dashboard/out/_next/static/chunks/3785.95524bc443db8260.js +1 -0
  25. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  26. sky/dashboard/out/_next/static/chunks/4725.42f21f250f91f65b.js +1 -0
  27. sky/dashboard/out/_next/static/chunks/4869.18e6a4361a380763.js +16 -0
  28. sky/dashboard/out/_next/static/chunks/4937.d6bf67771e353356.js +15 -0
  29. sky/dashboard/out/_next/static/chunks/5230-f3bb2663e442e86c.js +1 -0
  30. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  31. sky/dashboard/out/_next/static/chunks/6135-d0e285ac5f3f2485.js +1 -0
  32. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  33. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  34. sky/dashboard/out/_next/static/chunks/6601-234b1cf963c7280b.js +1 -0
  35. sky/dashboard/out/_next/static/chunks/691.6d99cbfba347cebf.js +55 -0
  36. sky/dashboard/out/_next/static/chunks/6989-983d3ae7a874de98.js +1 -0
  37. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +1 -0
  38. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  39. sky/dashboard/out/_next/static/chunks/8969-9a8cca241b30db83.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/9025.7937c16bc8623516.js +6 -0
  41. sky/dashboard/out/_next/static/chunks/938-40d15b6261ec8dc1.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/9847.4c46c5e229c78704.js +30 -0
  43. sky/dashboard/out/_next/static/chunks/9984.78ee6d2c6fa4b0e8.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  45. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  46. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/_app-a67ae198457b9886.js +34 -0
  49. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-fa63e8b1d203f298.js +11 -0
  51. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-665fa5d96dd41d67.js +1 -0
  52. sky/dashboard/out/_next/static/chunks/pages/clusters-956ad430075efee8.js +1 -0
  53. sky/dashboard/out/_next/static/chunks/pages/config-8620d099cbef8608.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-9cfd875eecb6eaf5.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/pages/infra-0fbdc9072f19fbe2.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-b25c109d6e41bcf4.js +11 -0
  58. sky/dashboard/out/_next/static/chunks/pages/jobs-6393a9edc7322b54.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/pages/users-34d6bb10c3b3ee3d.js +1 -0
  60. sky/dashboard/out/_next/static/chunks/pages/volumes-225c8dae0634eb7f.js +1 -0
  61. sky/dashboard/out/_next/static/chunks/pages/workspace/new-92f741084a89e27b.js +1 -0
  62. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-4d41c9023287f59a.js +1 -0
  63. sky/dashboard/out/_next/static/chunks/pages/workspaces-e4cb7e97d37e93ad.js +1 -0
  64. sky/dashboard/out/_next/static/chunks/webpack-5adfc4d4b3db6f71.js +1 -0
  65. sky/dashboard/out/_next/static/oKqDxFQ88cquF4nQGE_0w/_buildManifest.js +1 -0
  66. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  67. sky/dashboard/out/clusters/[cluster].html +1 -1
  68. sky/dashboard/out/clusters.html +1 -1
  69. sky/dashboard/out/config.html +1 -1
  70. sky/dashboard/out/index.html +1 -1
  71. sky/dashboard/out/infra/[context].html +1 -1
  72. sky/dashboard/out/infra.html +1 -1
  73. sky/dashboard/out/jobs/[job].html +1 -1
  74. sky/dashboard/out/jobs.html +1 -1
  75. sky/dashboard/out/users.html +1 -1
  76. sky/dashboard/out/volumes.html +1 -1
  77. sky/dashboard/out/workspace/new.html +1 -1
  78. sky/dashboard/out/workspaces/[name].html +1 -1
  79. sky/dashboard/out/workspaces.html +1 -1
  80. sky/data/data_utils.py +25 -0
  81. sky/data/storage.py +1219 -1775
  82. sky/global_user_state.py +18 -8
  83. sky/jobs/__init__.py +3 -0
  84. sky/jobs/client/sdk.py +80 -3
  85. sky/jobs/controller.py +76 -25
  86. sky/jobs/recovery_strategy.py +80 -34
  87. sky/jobs/scheduler.py +68 -20
  88. sky/jobs/server/core.py +228 -136
  89. sky/jobs/server/server.py +40 -0
  90. sky/jobs/state.py +164 -31
  91. sky/jobs/utils.py +144 -68
  92. sky/logs/aws.py +4 -2
  93. sky/provision/kubernetes/utils.py +6 -4
  94. sky/provision/nebius/constants.py +3 -0
  95. sky/provision/vast/instance.py +2 -1
  96. sky/provision/vast/utils.py +9 -6
  97. sky/py.typed +0 -0
  98. sky/resources.py +24 -14
  99. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  100. sky/serve/autoscalers.py +8 -0
  101. sky/serve/client/impl.py +188 -0
  102. sky/serve/client/sdk.py +12 -82
  103. sky/serve/constants.py +5 -1
  104. sky/serve/controller.py +5 -0
  105. sky/serve/replica_managers.py +112 -37
  106. sky/serve/serve_state.py +16 -6
  107. sky/serve/serve_utils.py +274 -77
  108. sky/serve/server/core.py +8 -525
  109. sky/serve/server/impl.py +709 -0
  110. sky/serve/service.py +13 -9
  111. sky/serve/service_spec.py +74 -4
  112. sky/server/constants.py +1 -1
  113. sky/server/requests/payloads.py +33 -0
  114. sky/server/requests/requests.py +18 -1
  115. sky/server/requests/serializers/decoders.py +12 -3
  116. sky/server/requests/serializers/encoders.py +13 -2
  117. sky/server/server.py +6 -1
  118. sky/skylet/events.py +9 -0
  119. sky/skypilot_config.py +24 -21
  120. sky/task.py +41 -11
  121. sky/templates/jobs-controller.yaml.j2 +3 -0
  122. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  123. sky/users/server.py +1 -1
  124. sky/utils/command_runner.py +4 -2
  125. sky/utils/controller_utils.py +14 -10
  126. sky/utils/dag_utils.py +4 -2
  127. sky/utils/db/migration_utils.py +2 -4
  128. sky/utils/schemas.py +24 -19
  129. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
  130. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +135 -130
  131. sky/dashboard/out/_next/static/Q2sVXboB_t7cgvntL-6nD/_buildManifest.js +0 -1
  132. sky/dashboard/out/_next/static/chunks/1043-869d9c78bf5dd3df.js +0 -1
  133. sky/dashboard/out/_next/static/chunks/1141-e49a159c30a6c4a7.js +0 -11
  134. sky/dashboard/out/_next/static/chunks/1559-18717d96ef2fcbe9.js +0 -30
  135. sky/dashboard/out/_next/static/chunks/1664-d65361e92b85e786.js +0 -1
  136. sky/dashboard/out/_next/static/chunks/1871-ea0e7283886407ca.js +0 -6
  137. sky/dashboard/out/_next/static/chunks/2003.b82e6db40ec4c463.js +0 -1
  138. sky/dashboard/out/_next/static/chunks/2350.23778a2b19aabd33.js +0 -1
  139. sky/dashboard/out/_next/static/chunks/2369.2d6e4757f8dfc2b7.js +0 -15
  140. sky/dashboard/out/_next/static/chunks/2641.74c19c4d45a2c034.js +0 -1
  141. sky/dashboard/out/_next/static/chunks/3698-9fa11dafb5cad4a6.js +0 -1
  142. sky/dashboard/out/_next/static/chunks/3785.59705416215ff08b.js +0 -1
  143. sky/dashboard/out/_next/static/chunks/3937.d7f1c55d1916c7f2.js +0 -1
  144. sky/dashboard/out/_next/static/chunks/4725.66125dcd9832aa5d.js +0 -1
  145. sky/dashboard/out/_next/static/chunks/4869.da729a7db3a31f43.js +0 -16
  146. sky/dashboard/out/_next/static/chunks/4937.d75809403fc264ac.js +0 -15
  147. sky/dashboard/out/_next/static/chunks/5230-df791914b54d91d9.js +0 -1
  148. sky/dashboard/out/_next/static/chunks/5739-5ea3ffa10fc884f2.js +0 -8
  149. sky/dashboard/out/_next/static/chunks/6135-2abbd0352f8ee061.js +0 -1
  150. sky/dashboard/out/_next/static/chunks/616-162f3033ffcd3d31.js +0 -39
  151. sky/dashboard/out/_next/static/chunks/6601-d4a381403a8bae91.js +0 -1
  152. sky/dashboard/out/_next/static/chunks/691.488b4aef97c28727.js +0 -55
  153. sky/dashboard/out/_next/static/chunks/6989-eab0e9c16b64fd9f.js +0 -1
  154. sky/dashboard/out/_next/static/chunks/6990-f64e03df359e04f7.js +0 -1
  155. sky/dashboard/out/_next/static/chunks/7411-2cc31dc0fdf2a9ad.js +0 -41
  156. sky/dashboard/out/_next/static/chunks/8969-8e0b2055bf5dd499.js +0 -1
  157. sky/dashboard/out/_next/static/chunks/9025.4a9099bdf3ed4875.js +0 -6
  158. sky/dashboard/out/_next/static/chunks/938-7ee806653aef0609.js +0 -1
  159. sky/dashboard/out/_next/static/chunks/9847.387abf8a14d722db.js +0 -30
  160. sky/dashboard/out/_next/static/chunks/9984.0460de9d3adf5582.js +0 -1
  161. sky/dashboard/out/_next/static/chunks/fd9d1056-61f2257a9cd8b32b.js +0 -1
  162. sky/dashboard/out/_next/static/chunks/framework-efc06c2733009cd3.js +0 -33
  163. sky/dashboard/out/_next/static/chunks/main-app-68c028b1bc5e1b72.js +0 -1
  164. sky/dashboard/out/_next/static/chunks/main-c0a4f1ea606d48d2.js +0 -1
  165. sky/dashboard/out/_next/static/chunks/pages/_app-da491665d4289aae.js +0 -34
  166. sky/dashboard/out/_next/static/chunks/pages/_error-c72a1f77a3c0be1b.js +0 -1
  167. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2186770cc2de1623.js +0 -11
  168. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-95afb019ab85801c.js +0 -6
  169. sky/dashboard/out/_next/static/chunks/pages/clusters-3d4be4961e1c94eb.js +0 -1
  170. sky/dashboard/out/_next/static/chunks/pages/config-a2673b256b6d416f.js +0 -1
  171. sky/dashboard/out/_next/static/chunks/pages/index-89e7daf7b7df02e0.js +0 -1
  172. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-a90b4fe4616dc501.js +0 -1
  173. sky/dashboard/out/_next/static/chunks/pages/infra-0d3d1f890c5d188a.js +0 -1
  174. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dc0299ffefebcdbe.js +0 -16
  175. sky/dashboard/out/_next/static/chunks/pages/jobs-49f790d12a85027c.js +0 -1
  176. sky/dashboard/out/_next/static/chunks/pages/users-6790fcefd5487b13.js +0 -1
  177. sky/dashboard/out/_next/static/chunks/pages/volumes-61ea7ba7e56f8d06.js +0 -1
  178. sky/dashboard/out/_next/static/chunks/pages/workspace/new-5629d4e551dba1ee.js +0 -1
  179. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-6bcd4b20914d76c9.js +0 -1
  180. sky/dashboard/out/_next/static/chunks/pages/workspaces-5f7fe4b7d55b8612.js +0 -1
  181. sky/dashboard/out/_next/static/chunks/webpack-a305898dc479711e.js +0 -1
  182. /sky/dashboard/out/_next/static/{Q2sVXboB_t7cgvntL-6nD → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
  183. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
  184. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
  185. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
  186. {skypilot_nightly-1.0.0.dev20250729.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/serve/serve_utils.py CHANGED
@@ -11,6 +11,7 @@ import shlex
 import shutil
 import threading
 import time
+import traceback
 import typing
 from typing import (Any, Callable, DefaultDict, Deque, Dict, Generic, Iterator,
                     List, Optional, TextIO, Type, TypeVar, Union)
@@ -22,7 +23,10 @@ import filelock
 from sky import backends
 from sky import exceptions
 from sky import global_user_state
+from sky import sky_logging
+from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
+from sky.jobs import state as managed_job_state
 from sky.serve import constants
 from sky.serve import serve_state
 from sky.serve import spot_placer
@@ -47,6 +51,8 @@ else:
     psutil = adaptors_common.LazyImport('psutil')
     requests = adaptors_common.LazyImport('requests')

+logger = sky_logging.init_logger(__name__)
+

 @annotations.lru_cache(scope='request')
 def get_num_service_threshold():
@@ -244,7 +250,22 @@ class RequestTimestamp(RequestsAggregator):
         return f'RequestTimestamp(timestamps={self.timestamps})'


-def validate_service_task(task: 'sky.Task') -> None:
+def get_service_filelock_path(pool: str) -> str:
+    path = (pathlib.Path(constants.SKYSERVE_METADATA_DIR) / pool /
+            'pool.lock').expanduser().absolute()
+    path.parents[0].mkdir(parents=True, exist_ok=True)
+    return str(path)
+
+
+@annotations.lru_cache(scope='request', maxsize=1)
+def is_consolidation_mode() -> bool:
+    consolidation_mode = skypilot_config.get_nested(
+        ('serve', 'controller', 'consolidation_mode'), default_value=False)
+    # _check_consolidation_mode_consistency(consolidation_mode)
+    return consolidation_mode
+
+
+def validate_service_task(task: 'sky.Task', pool: bool) -> None:
     """Validate the task for Sky Serve.

     Args:
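The two helpers added above are small but load-bearing: a per-pool lock file path and a cached lookup of the consolidation-mode flag. Below is a minimal standalone sketch of the same pattern, with a hypothetical metadata directory and an environment-variable stand-in for `skypilot_config` and the request-scoped `annotations.lru_cache` (those stand-ins are assumptions, not SkyPilot's API).

# Sketch only: _METADATA_DIR and the env-var flag are illustrative assumptions.
import functools
import os
import pathlib

_METADATA_DIR = '~/.sky/serve'  # hypothetical; mirrors SKYSERVE_METADATA_DIR


def pool_lock_path(pool: str) -> str:
    """One lock file per pool; parent directories are created on first use."""
    path = (pathlib.Path(_METADATA_DIR) / pool / 'pool.lock').expanduser()
    path.parent.mkdir(parents=True, exist_ok=True)
    return str(path)


@functools.lru_cache(maxsize=1)
def is_consolidation_mode() -> bool:
    """Cached once per process, analogous to the request-scoped cache above."""
    return os.environ.get('SKY_SERVE_CONSOLIDATION_MODE', '0') == '1'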
@@ -267,17 +288,25 @@ def validate_service_task(task: 'sky.Task') -> None:
                     'use `dynamic_ondemand_fallback` or set '
                     'base_ondemand_fallback_replicas.')

+    field_name = 'service' if not pool else 'pool'
     if task.service is None:
         with ux_utils.print_exception_no_traceback():
-            raise RuntimeError('Service section not found.')
+            raise RuntimeError(f'{field_name.capitalize()} section not found.')
+
+    if pool != task.service.pool:
+        with ux_utils.print_exception_no_traceback():
+            raise ValueError(f'{field_name.capitalize()} section in the YAML '
+                             f'file does not match the pool argument. '
+                             f'To fix, add a valid `{field_name}` field.')

     policy_description = ('on-demand'
                           if task.service.dynamic_ondemand_fallback else 'spot')
     for resource in list(task.resources):
         if resource.job_recovery is not None:
+            sys_name = 'SkyServe' if not pool else 'Cluster Pool'
             with ux_utils.print_exception_no_traceback():
-                raise ValueError('job_recovery is disabled for SkyServe. '
-                                 'SkyServe will replenish preempted spot '
+                raise ValueError(f'job_recovery is disabled for {sys_name}. '
+                                 f'{sys_name} will replenish preempted spot '
                                  f'with {policy_description} instances.')

     # Try to create a spot placer from the task yaml. Check if the task yaml
@@ -300,7 +329,7 @@ def validate_service_task(task: 'sky.Task') -> None:
                 raise ValueError(
                     '`spot_placer` is only supported for spot resources. '
                     'Please explicitly specify `use_spot: true` in resources.')
-    if task.service.ports is None:
+    if not pool and task.service.ports is None:
         requested_ports = list(
             resources_utils.port_ranges_to_set(requested_resources.ports))
         if len(requested_ports) != 1:
@@ -320,10 +349,16 @@ def validate_service_task(task: 'sky.Task') -> None:
                         f'Got multiple ports: {service_port} and '
                         f'{replica_ingress_port} in different resources. '
                         'Please specify the same port instead.')
+    if pool:
+        if (task.service.ports is not None or
+                requested_resources.ports is not None):
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError('Cannot specify ports in a cluster pool.')


-def generate_service_name():
-    return f'sky-service-{uuid.uuid4().hex[:4]}'
+def generate_service_name(pool: bool = False):
+    noun = 'pool' if pool else 'service'
+    return f'sky-{noun}-{uuid.uuid4().hex[:4]}'


 def generate_remote_service_dir_name(service_name: str) -> str:
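The renamed generator above only changes the name prefix; a quick illustrative check of the expected output shape (the 4-hex-character suffix is random, so the printed values are examples only):

import uuid


def generate_service_name(pool: bool = False):
    noun = 'pool' if pool else 'service'
    return f'sky-{noun}-{uuid.uuid4().hex[:4]}'


print(generate_service_name())           # e.g. 'sky-service-1a2b'
print(generate_service_name(pool=True))  # e.g. 'sky-pool-9f3c'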
@@ -426,6 +461,9 @@ def set_service_status_and_active_versions_from_replica(


 def update_service_status() -> None:
+    if is_consolidation_mode():
+        # TODO(tian): PID-based tracking.
+        return
     services = serve_state.get_services()
     for record in services:
         if record['status'] == serve_state.ServiceStatus.SHUTTING_DOWN:
@@ -440,11 +478,14 @@ def update_service_status() -> None:
             record['name'], serve_state.ServiceStatus.CONTROLLER_FAILED)


-def update_service_encoded(service_name: str, version: int, mode: str) -> str:
-    service_status = _get_service_status(service_name)
+def update_service_encoded(service_name: str, version: int, mode: str,
+                           pool: bool) -> str:
+    noun = 'pool' if pool else 'service'
+    capnoun = noun.capitalize()
+    service_status = _get_service_status(service_name, pool=pool)
     if service_status is None:
         with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Service {service_name!r} does not exist.')
+            raise ValueError(f'{capnoun} {service_name!r} does not exist.')
     controller_port = service_status['controller_port']
     resp = requests.post(
         _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
@@ -455,27 +496,30 @@ def update_service_encoded(service_name: str, version: int, mode: str) -> str:
         })
     if resp.status_code == 404:
         with ux_utils.print_exception_no_traceback():
+            # This only happens for services since pool is added after the
+            # update feature is introduced.
             raise ValueError(
                 'The service is up-ed in an old version and does not '
                 'support update. Please `sky serve down` '
                 'it first and relaunch the service. ')
     elif resp.status_code == 400:
         with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Client error during service update: {resp.text}')
+            raise ValueError(f'Client error during {noun} update: {resp.text}')
     elif resp.status_code == 500:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(
-                f'Server error during service update: {resp.text}')
+                f'Server error during {noun} update: {resp.text}')
     elif resp.status_code != 200:
         with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Failed to update service: {resp.text}')
+            raise ValueError(f'Failed to update {noun}: {resp.text}')

     service_msg = resp.json()['message']
     return message_utils.encode_payload(service_msg)


 def terminate_replica(service_name: str, replica_id: int, purge: bool) -> str:
-    service_status = _get_service_status(service_name)
+    # TODO(tian): Currently pool does not support terminating replica.
+    service_status = _get_service_status(service_name, pool=False)
     if service_status is None:
         with ux_utils.print_exception_no_traceback():
             raise ValueError(f'Service {service_name!r} does not exist.')
@@ -506,6 +550,7 @@ def terminate_replica(service_name: str, replica_id: int, purge: bool) -> str:

 def _get_service_status(
         service_name: str,
+        pool: bool,
         with_replica_info: bool = True) -> Optional[Dict[str, Any]]:
     """Get the status dict of the service.

@@ -520,27 +565,63 @@ def _get_service_status(
     record = serve_state.get_service_from_name(service_name)
     if record is None:
         return None
+    if record['pool'] != pool:
+        return None
+
+    record['pool_yaml'] = ''
+    if record['pool']:
+        latest_yaml_path = generate_task_yaml_file_name(service_name,
+                                                        record['version'])
+        original_config = common_utils.read_yaml(latest_yaml_path)
+        original_config.pop('run', None)
+        svc: Dict[str, Any] = original_config.pop('service')
+        if svc is not None:
+            svc.pop('pool', None)
+            original_config['pool'] = svc
+        record['pool_yaml'] = common_utils.dump_yaml_str(original_config)
+
+    record['target_num_replicas'] = 0
+    try:
+        controller_port = record['controller_port']
+        resp = requests.get(
+            _CONTROLLER_URL.format(CONTROLLER_PORT=controller_port) +
+            '/autoscaler/info')
+        record['target_num_replicas'] = resp.json()['target_num_replicas']
+    except requests.exceptions.RequestException:
+        record['target_num_replicas'] = None
+    except Exception as e:  # pylint: disable=broad-except
+        logger.error(f'Failed to get autoscaler info for {service_name}: '
+                     f'{common_utils.format_exception(e)}\n'
+                     f'Traceback: {traceback.format_exc()}')
+
     if with_replica_info:
         record['replica_info'] = [
-            info.to_info_dict(with_handle=True)
+            info.to_info_dict(with_handle=True, with_url=not pool)
            for info in serve_state.get_replica_infos(service_name)
         ]
+        if pool:
+            for replica_info in record['replica_info']:
+                job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(
+                    service_name, replica_info['name'])
+                replica_info['used_by'] = job_ids[0] if job_ids else None
     return record


-def get_service_status_encoded(service_names: Optional[List[str]]) -> str:
+def get_service_status_encoded(service_names: Optional[List[str]],
+                               pool: bool) -> str:
     service_statuses: List[Dict[str, str]] = []
     if service_names is None:
         # Get all service names
         service_names = serve_state.get_glob_service_names(None)
     for service_name in service_names:
-        service_status = _get_service_status(service_name)
+        service_status = _get_service_status(service_name, pool=pool)
         if service_status is None:
             continue
         service_statuses.append({
             k: base64.b64encode(pickle.dumps(v)).decode('utf-8')
             for k, v in service_status.items()
         })
+    service_statuses = sorted(service_statuses, key=lambda x: x['name'])
     # We have to use payload_type here to avoid the issue of
     # message_utils.decode_payload() not being able to correctly decode the
     # message with <sky-payload> tags.
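The `pool_yaml` field added above is built by reading the latest task YAML, dropping the `run` section, and re-exposing the `service` section under a `pool` key. A self-contained sketch of that rewrite follows; it uses PyYAML directly instead of SkyPilot's `common_utils.read_yaml`/`dump_yaml_str` and `generate_task_yaml_file_name`, which is an assumption made purely to keep the example runnable.

# Standalone sketch of the service -> pool YAML rewrite performed above.
import yaml


def to_pool_yaml(task_yaml_text: str) -> str:
    config = yaml.safe_load(task_yaml_text) or {}
    config.pop('run', None)        # pool workers do not carry the job's run section
    svc = config.pop('service', None)
    if svc is not None:
        svc.pop('pool', None)      # drop the internal pool marker
        config['pool'] = svc       # expose the section under `pool`
    return yaml.safe_dump(config)


example = """
service:
  pool: true
  replicas: 2
resources:
  cpus: 4
run: python train.py
"""
print(to_pool_yaml(example))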
@@ -579,6 +660,71 @@ def load_version_string(payload: str) -> str:
     return message_utils.decode_payload(payload)


+def num_replicas(service_name: str) -> int:
+    logger.info(f'Get number of replicas for pool {service_name!r}')
+    return len(serve_state.get_replica_infos(service_name))
+
+
+def get_next_cluster_name(service_name: str, job_id: int) -> Optional[str]:
+    """Get the next available cluster name from idle replicas.
+
+    Args:
+        service_name: The name of the service.
+        job_id: Optional job ID to associate with the acquired cluster.
+            If None, a placeholder will be used.
+
+    Returns:
+        The cluster name if an idle replica is found, None otherwise.
+    """
+    # Check if service exists
+    service_status = _get_service_status(service_name,
+                                          pool=True,
+                                          with_replica_info=False)
+    if service_status is None:
+        logger.error(f'Service {service_name!r} does not exist.')
+        return None
+    if not service_status['pool']:
+        logger.error(f'Service {service_name!r} is not a cluster pool.')
+        return None
+    with filelock.FileLock(get_service_filelock_path(service_name)):
+
+        logger.debug(f'Get next cluster name for pool {service_name!r}')
+        ready_replicas = [
+            info for info in serve_state.get_replica_infos(service_name)
+            if info.status == serve_state.ReplicaStatus.READY
+        ]
+        idle_replicas: List['replica_managers.ReplicaInfo'] = []
+        for replica_info in ready_replicas:
+            jobs_on_replica = managed_job_state.get_nonterminal_job_ids_by_pool(
+                service_name, replica_info.cluster_name)
+            # TODO(tian): Make it resources aware. Currently we allow and only
+            # allow one job per replica. In the following PR, we should:
+            # i) When the replica is launched with `any_of` resources (
+            #    replicas can have different resources), we should check if
+            #    the resources that jobs require are available on the replica.
+            #    e.g., if a job requires A100:1 on a {L4:1, A100:1} pool, it
+            #    should only goes to replica with A100.
+            # ii) When a job only requires a subset of the resources on the
+            #    replica, each replica should be able to handle multiple jobs
+            #    at the same time. e.g., if a job requires A100:1 on a A100:8
+            #    pool, it should be able to run 4 jobs at the same time.
+            if not jobs_on_replica:
+                idle_replicas.append(replica_info)
+        if not idle_replicas:
+            logger.info(f'No idle replicas found for pool {service_name!r}')
+            return None
+
+        # Select the first idle replica.
+        # TODO(tian): "Load balancing" policy.
+        replica_info = idle_replicas[0]
+        logger.info(f'Selected replica {replica_info.replica_id} with cluster '
+                    f'{replica_info.cluster_name!r} for job {job_id!r} in pool '
+                    f'{service_name!r}')
+        managed_job_state.set_current_cluster_name(job_id,
+                                                   replica_info.cluster_name)
+        return replica_info.cluster_name
+
+
 def _terminate_failed_services(
     service_name: str,
     service_status: Optional[serve_state.ServiceStatus]) -> Optional[str]:
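`get_next_cluster_name` acquires the pool's file lock, picks an idle READY worker, and records the job-to-cluster assignment before returning. A hedged sketch of how a caller (for example, the managed-jobs scheduler touched elsewhere in this release) might consume it is shown below; the retry interval and the `launch_on_cluster` callback are illustrative assumptions, not SkyPilot's API.

# Hypothetical call site for get_next_cluster_name(); the polling loop and
# launch_on_cluster() callback are assumptions for illustration only.
import time

from sky.serve import serve_utils


def run_job_on_pool(pool_name: str, job_id: int, launch_on_cluster) -> None:
    while True:
        cluster_name = serve_utils.get_next_cluster_name(pool_name, job_id)
        if cluster_name is not None:
            # The pool has already recorded job_id -> cluster_name under its
            # file lock, so the job can now be submitted to that cluster.
            launch_on_cluster(cluster_name, job_id)
            return
        # No idle worker yet: wait and poll again.
        time.sleep(10)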
@@ -618,17 +764,38 @@ def _terminate_failed_services(
         f'controller: {remaining_identity}{colorama.Style.RESET_ALL}')


-def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
+def terminate_services(service_names: Optional[List[str]], purge: bool,
+                       pool: bool) -> str:
+    noun = 'pool' if pool else 'service'
+    capnoun = noun.capitalize()
     service_names = serve_state.get_glob_service_names(service_names)
     terminated_service_names: List[str] = []
     messages: List[str] = []
     for service_name in service_names:
         service_status = _get_service_status(service_name,
+                                              pool=pool,
                                               with_replica_info=False)
+        if service_status is None:
+            continue
         if (service_status is not None and service_status['status']
                 == serve_state.ServiceStatus.SHUTTING_DOWN):
             # Already scheduled to be terminated.
             continue
+        if pool:
+            nonterminal_job_ids = (
+                managed_job_state.get_nonterminal_job_ids_by_pool(service_name))
+            if nonterminal_job_ids:
+                nonterminal_job_ids_str = ','.join(
+                    str(job_id) for job_id in nonterminal_job_ids)
+                num_nonterminal_jobs = len(nonterminal_job_ids)
+                messages.append(
+                    f'{colorama.Fore.YELLOW}{capnoun} {service_name!r} has '
+                    f'{num_nonterminal_jobs} nonterminal jobs: '
+                    f'{nonterminal_job_ids_str}. To terminate the {noun}, '
+                    f'please run `sky jobs cancel --pool {service_name}` to '
+                    'cancel all jobs in the pool first.'
+                    f'{colorama.Style.RESET_ALL}')
+                continue
         # If the `services` and `version_specs` table are not aligned, it might
         # result in a None service status. In this case, the controller process
         # is not functioning as well and we should also use the
@@ -636,10 +803,11 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
         # This is a safeguard for a rare case, that is accidentally abort
         # between `serve_state.add_service` and
         # `serve_state.add_or_update_version` in service.py.
-        if (service_status is None or service_status['status']
+        purge_cmd = (f'sky jobs pool down {service_name} --purge'
+                     if pool else f'sky serve down {service_name} --purge')
+        if (service_status['status']
                 in serve_state.ServiceStatus.failed_statuses()):
-            failed_status = (service_status['status']
-                             if service_status is not None else None)
+            failed_status = service_status['status']
             if purge:
                 message = _terminate_failed_services(service_name,
                                                      failed_status)
@@ -647,11 +815,10 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
                     messages.append(message)
             else:
                 messages.append(
-                    f'{colorama.Fore.YELLOW}Service {service_name!r} is in '
+                    f'{colorama.Fore.YELLOW}{capnoun} {service_name!r} is in '
                     f'failed status ({failed_status}). Skipping '
                     'its termination as it could lead to a resource leak. '
-                    f'(Use `sky serve down {service_name} --purge` to '
-                    'forcefully terminate the service.)'
+                    f'(Use `{purge_cmd}` to forcefully terminate the {noun}.)'
                     f'{colorama.Style.RESET_ALL}')
                 # Don't add to terminated_service_names since it's not
                 # actually terminated.
@@ -668,12 +835,12 @@ def terminate_services(service_names: Optional[List[str]], purge: bool) -> str:
             f.flush()
         terminated_service_names.append(f'{service_name!r}')
     if not terminated_service_names:
-        messages.append('No service to terminate.')
+        messages.append(f'No {noun} to terminate.')
     else:
-        identity_str = f'Service {terminated_service_names[0]} is'
+        identity_str = f'{capnoun} {terminated_service_names[0]} is'
         if len(terminated_service_names) > 1:
             terminated_service_names_str = ', '.join(terminated_service_names)
-            identity_str = f'Services {terminated_service_names_str} are'
+            identity_str = f'{capnoun}s {terminated_service_names_str} are'
         messages.append(f'{identity_str} scheduled to be terminated.')
     return '\n'.join(messages)

@@ -694,32 +861,35 @@ def wait_service_registration(service_name: str, job_id: int) -> str:
     start_time = time.time()
     setup_completed = False
     while True:
-        job_status = job_lib.get_status(job_id)
-        if job_status is None or job_status < job_lib.JobStatus.RUNNING:
-            # Wait for the controller process to finish setting up. It can be
-            # slow if a lot cloud dependencies are being installed.
-            if (time.time() - start_time >
-                    constants.CONTROLLER_SETUP_TIMEOUT_SECONDS):
-                with ux_utils.print_exception_no_traceback():
-                    raise RuntimeError(
-                        f'Failed to start the controller '
-                        f'process for the service {service_name!r} '
-                        f'within '
-                        f'{constants.CONTROLLER_SETUP_TIMEOUT_SECONDS} seconds.'
-                    )
-            # No need to check the service status as the controller process
-            # is still setting up.
-            time.sleep(1)
-            continue
+        # TODO(tian): PID-based tracking.
+        if not is_consolidation_mode():
+            job_status = job_lib.get_status(job_id)
+            if job_status is None or job_status < job_lib.JobStatus.RUNNING:
+                # Wait for the controller process to finish setting up. It
+                # can be slow if a lot cloud dependencies are being installed.
+                if (time.time() - start_time >
+                        constants.CONTROLLER_SETUP_TIMEOUT_SECONDS):
+                    with ux_utils.print_exception_no_traceback():
+                        raise RuntimeError(
+                            f'Failed to start the controller process for '
+                            f'the service {service_name!r} within '
+                            f'{constants.CONTROLLER_SETUP_TIMEOUT_SECONDS}'
+                            f' seconds.')
+                # No need to check the service status as the controller process
+                # is still setting up.
+                time.sleep(1)
+                continue

-        if not setup_completed:
-            setup_completed = True
-            # Reset the start time to wait for the service to be registered.
-            start_time = time.time()
+            if not setup_completed:
+                setup_completed = True
+                # Reset the start time to wait for the service to be registered.
+                start_time = time.time()

         record = serve_state.get_service_from_name(service_name)
         if record is not None:
-            if job_id != record['controller_job_id']:
+            # TODO(tian): PID-based tracking.
+            if (not is_consolidation_mode() and
+                    job_id != record['controller_job_id']):
                 with ux_utils.print_exception_no_traceback():
                     raise ValueError(
                         f'The service {service_name!r} is already running. '
@@ -1059,18 +1229,25 @@ def _get_replicas(service_record: Dict[str, Any]) -> str:
     return f'{ready_replica_num}/{total_replica_num}'


-def format_service_table(service_records: List[Dict[str, Any]],
-                         show_all: bool) -> str:
+def format_service_table(service_records: List[Dict[str, Any]], show_all: bool,
+                         pool: bool) -> str:
+    noun = 'pool' if pool else 'service'
     if not service_records:
-        return 'No existing services.'
+        return f'No existing {noun}s.'

     service_columns = [
-        'NAME', 'VERSION', 'UPTIME', 'STATUS', 'REPLICAS', 'ENDPOINT'
+        'NAME', 'VERSION', 'UPTIME', 'STATUS',
+        'REPLICAS' if not pool else 'WORKERS'
     ]
+    if not pool:
+        service_columns.append('ENDPOINT')
     if show_all:
         service_columns.extend([
             'AUTOSCALING_POLICY', 'LOAD_BALANCING_POLICY', 'REQUESTED_RESOURCES'
         ])
+        if pool:
+            # Remove the load balancing policy column for pools.
+            service_columns.pop(-2)
     service_table = log_utils.create_table(service_columns)

     replica_infos: List[Dict[str, Any]] = []
@@ -1101,35 +1278,44 @@ def format_service_table(service_records: List[Dict[str, Any]],
             uptime,
             status_str,
             replicas,
-            endpoint,
         ]
+        if not pool:
+            service_values.append(endpoint)
         if show_all:
             service_values.extend(
                 [policy, load_balancing_policy, requested_resources_str])
+            if pool:
+                service_values.pop(-2)
         service_table.add_row(service_values)

-    replica_table = _format_replica_table(replica_infos, show_all)
+    replica_table = _format_replica_table(replica_infos, show_all, pool)
+    replica_noun = 'Pool Workers' if pool else 'Service Replicas'
     return (f'{service_table}\n'
             f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-            f'Service Replicas{colorama.Style.RESET_ALL}\n'
+            f'{replica_noun}{colorama.Style.RESET_ALL}\n'
             f'{replica_table}')


-def _format_replica_table(replica_records: List[Dict[str, Any]],
-                          show_all: bool) -> str:
+def _format_replica_table(replica_records: List[Dict[str, Any]], show_all: bool,
+                          pool: bool) -> str:
+    noun = 'worker' if pool else 'replica'
     if not replica_records:
-        return 'No existing replicas.'
+        return f'No existing {noun}s.'

     replica_columns = [
-        'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT', 'LAUNCHED', 'INFRA',
-        'RESOURCES', 'STATUS'
+        'POOL_NAME' if pool else 'SERVICE_NAME', 'ID', 'VERSION', 'ENDPOINT',
+        'LAUNCHED', 'INFRA', 'RESOURCES', 'STATUS'
     ]
+    if pool:
+        replica_columns.append('USED_BY')
+        # Remove the endpoint column for pool workers.
+        replica_columns.pop(3)
     replica_table = log_utils.create_table(replica_columns)

     truncate_hint = ''
     if not show_all:
         if len(replica_records) > _REPLICA_TRUNC_NUM:
-            truncate_hint = '\n... (use --all to show all replicas)'
+            truncate_hint = f'\n... (use --all to show all {noun}s)'
         replica_records = replica_records[:_REPLICA_TRUNC_NUM]

     for record in replica_records:
@@ -1143,6 +1329,8 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
             resources_str = '-'
         replica_status = record['status']
         status_str = replica_status.colored_str()
+        used_by = record.get('used_by', None)
+        used_by_str = str(used_by) if used_by is not None else '-'

         replica_handle: Optional['backends.CloudVmRayResourceHandle'] = record[
             'handle']
@@ -1161,6 +1349,9 @@ def _format_replica_table(replica_records: List[Dict[str, Any]],
             resources_str,
             status_str,
         ]
+        if pool:
+            replica_values.append(used_by_str)
+            replica_values.pop(3)
         replica_table.add_row(replica_values)

     return f'{replica_table}{truncate_hint}'
@@ -1185,13 +1376,16 @@ class ServeCodeGen:
         'from sky.serve import serve_state',
         'from sky.serve import serve_utils',
         'from sky.serve import constants',
+        'serve_version = constants.SERVE_VERSION',
     ]

     @classmethod
-    def get_service_status(cls, service_names: Optional[List[str]]) -> str:
+    def get_service_status(cls, service_names: Optional[List[str]],
+                           pool: bool) -> str:
         code = [
-            f'msg = serve_utils.get_service_status_encoded({service_names!r})',
-            'print(msg, end="", flush=True)'
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
+            f'msg = serve_utils.get_service_status_encoded({service_names!r}, '
+            '**kwargs)', 'print(msg, end="", flush=True)'
         ]
         return cls._build(code)

@@ -1204,11 +1398,12 @@ class ServeCodeGen:
         return cls._build(code)

     @classmethod
-    def terminate_services(cls, service_names: Optional[List[str]],
-                           purge: bool) -> str:
+    def terminate_services(cls, service_names: Optional[List[str]], purge: bool,
+                           pool: bool) -> str:
         code = [
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
             f'msg = serve_utils.terminate_services({service_names!r}, '
-            f'purge={purge})', 'print(msg, end="", flush=True)'
+            f'purge={purge}, **kwargs)', 'print(msg, end="", flush=True)'
         ]
         return cls._build(code)

@@ -1253,6 +1448,17 @@ class ServeCodeGen:
         ]
         return cls._build(code)

+    @classmethod
+    def update_service(cls, service_name: str, version: int, mode: str,
+                       pool: bool) -> str:
+        code = [
+            f'kwargs={{}} if serve_version < 3 else {{"pool": {pool}}}',
+            f'msg = serve_utils.update_service_encoded({service_name!r}, '
+            f'{version}, mode={mode!r}, **kwargs)',
+            'print(msg, end="", flush=True)',
+        ]
+        return cls._build(code)
+
     @classmethod
     def _build(cls, code: List[str]) -> str:
         code = cls._PREFIX + code
@@ -1263,12 +1469,3 @@ class ServeCodeGen:
                 f'"{common_utils.get_user_hash()}"; '
                 f'{skylet_constants.SKY_PYTHON_CMD} '
                 f'-u -c {shlex.quote(generated_code)}')
-
-    @classmethod
-    def update_service(cls, service_name: str, version: int, mode: str) -> str:
-        code = [
-            f'msg = serve_utils.update_service_encoded({service_name!r}, '
-            f'{version}, mode={mode!r})',
-            'print(msg, end="", flush=True)',
-        ]
-        return cls._build(code)
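The `serve_version < 3` guard in the generated snippets above is a backward-compatibility pattern: the generated code runs on the (possibly older) controller, so new keyword arguments such as `pool` are passed only when the remote `constants.SERVE_VERSION` is new enough to accept them. A minimal sketch of the same idea, with made-up function and version names standing in for whatever actually runs on the controller:

# Sketch of the version-gated kwargs pattern used by ServeCodeGen above.
# remote_api() and REMOTE_VERSION are placeholders, not SkyPilot identifiers;
# only the gating logic mirrors the generated code.
REMOTE_VERSION = 2  # pretend the controller is one release behind


def remote_api(name, pool=False):
    return f'name={name}, pool={pool}'


def call_with_compat(name: str, pool: bool) -> str:
    # Older controllers do not know about `pool`, so omit it entirely.
    kwargs = {} if REMOTE_VERSION < 3 else {'pool': pool}
    return remote_api(name, **kwargs)


print(call_with_compat('my-pool', pool=True))  # old remote: pool arg dropped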