skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250731__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. See the package registry's advisory page for more details.

Files changed (72)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +448 -53
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  11. sky/dashboard/out/clusters/[cluster].html +1 -1
  12. sky/dashboard/out/clusters.html +1 -1
  13. sky/dashboard/out/config.html +1 -1
  14. sky/dashboard/out/index.html +1 -1
  15. sky/dashboard/out/infra/[context].html +1 -1
  16. sky/dashboard/out/infra.html +1 -1
  17. sky/dashboard/out/jobs/[job].html +1 -1
  18. sky/dashboard/out/jobs.html +1 -1
  19. sky/dashboard/out/users.html +1 -1
  20. sky/dashboard/out/volumes.html +1 -1
  21. sky/dashboard/out/workspace/new.html +1 -1
  22. sky/dashboard/out/workspaces/[name].html +1 -1
  23. sky/dashboard/out/workspaces.html +1 -1
  24. sky/jobs/__init__.py +3 -0
  25. sky/jobs/client/sdk.py +80 -3
  26. sky/jobs/controller.py +76 -25
  27. sky/jobs/recovery_strategy.py +80 -34
  28. sky/jobs/scheduler.py +68 -20
  29. sky/jobs/server/core.py +228 -136
  30. sky/jobs/server/server.py +40 -0
  31. sky/jobs/state.py +129 -24
  32. sky/jobs/utils.py +109 -51
  33. sky/provision/nebius/constants.py +3 -0
  34. sky/py.typed +0 -0
  35. sky/resources.py +16 -12
  36. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  37. sky/serve/autoscalers.py +8 -0
  38. sky/serve/client/impl.py +188 -0
  39. sky/serve/client/sdk.py +12 -82
  40. sky/serve/constants.py +5 -1
  41. sky/serve/controller.py +5 -0
  42. sky/serve/replica_managers.py +112 -37
  43. sky/serve/serve_state.py +16 -6
  44. sky/serve/serve_utils.py +274 -77
  45. sky/serve/server/core.py +8 -525
  46. sky/serve/server/impl.py +709 -0
  47. sky/serve/service.py +13 -9
  48. sky/serve/service_spec.py +74 -4
  49. sky/server/constants.py +1 -1
  50. sky/server/requests/payloads.py +33 -0
  51. sky/server/requests/requests.py +18 -1
  52. sky/server/requests/serializers/decoders.py +12 -3
  53. sky/server/requests/serializers/encoders.py +13 -2
  54. sky/skylet/events.py +9 -0
  55. sky/skypilot_config.py +24 -21
  56. sky/task.py +41 -11
  57. sky/templates/jobs-controller.yaml.j2 +3 -0
  58. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  59. sky/users/server.py +1 -1
  60. sky/utils/command_runner.py +4 -2
  61. sky/utils/controller_utils.py +14 -10
  62. sky/utils/dag_utils.py +4 -2
  63. sky/utils/db/migration_utils.py +2 -4
  64. sky/utils/schemas.py +24 -19
  65. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/METADATA +1 -1
  66. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/RECORD +72 -68
  67. /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → oKqDxFQ88cquF4nQGE_0w}/_buildManifest.js +0 -0
  68. /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → oKqDxFQ88cquF4nQGE_0w}/_ssgManifest.js +0 -0
  69. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/WHEEL +0 -0
  70. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/entry_points.txt +0 -0
  71. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/licenses/LICENSE +0 -0
  72. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250731.dist-info}/top_level.txt +0 -0
sky/client/cli/command.py CHANGED
@@ -272,6 +272,65 @@ def _merge_env_vars(env_dict: Optional[Dict[str, str]],
272
272
  return list(env_dict.items())
273
273
 
274
274
 
275
+ def _format_job_ids_str(job_ids: List[int], max_length: int = 30) -> str:
276
+ """Format job IDs string with ellipsis if too long.
277
+
278
+ Args:
279
+ job_ids: List of job IDs to format.
280
+ max_length: Maximum length of the output string.
281
+
282
+ Returns:
283
+ Formatted string like "11,12,...,2017,2018" if truncated,
284
+ or the full string if it fits within max_length.
285
+ """
286
+ if not job_ids:
287
+ return ''
288
+
289
+ # Convert all to strings
290
+ job_strs = [str(job_id) for job_id in job_ids]
291
+ full_str = ','.join(job_strs)
292
+
293
+ # If it fits, return as is
294
+ if len(full_str) <= max_length:
295
+ return full_str
296
+
297
+ if len(job_strs) <= 2:
298
+ return full_str # Can't truncate further
299
+
300
+ # Need to truncate with ellipsis
301
+ ellipsis = '...'
302
+
303
+ # Start with minimum: first and last
304
+ start_count = 1
305
+ end_count = 1
306
+
307
+ while start_count + end_count < len(job_strs):
308
+ # Try adding one more to start
309
+ if start_count + 1 + end_count < len(job_strs):
310
+ start_part = ','.join(job_strs[:start_count + 1])
311
+ end_part = ','.join(job_strs[-end_count:])
312
+ candidate = f'{start_part},{ellipsis},{end_part}'
313
+ if len(candidate) <= max_length:
314
+ start_count += 1
315
+ continue
316
+
317
+ # Try adding one more to end
318
+ if start_count + end_count + 1 < len(job_strs):
319
+ start_part = ','.join(job_strs[:start_count])
320
+ end_part = ','.join(job_strs[-(end_count + 1):])
321
+ candidate = f'{start_part},{ellipsis},{end_part}'
322
+ if len(candidate) <= max_length:
323
+ end_count += 1
324
+ continue
325
+
326
+ # Can't add more
327
+ break
328
+
329
+ start_part = ','.join(job_strs[:start_count])
330
+ end_part = ','.join(job_strs[-end_count:])
331
+ return f'{start_part},{ellipsis},{end_part}'
332
+
333
+
275
334
  def _complete_cluster_name(ctx: click.Context, param: click.Parameter,
276
335
  incomplete: str) -> List[str]:
277
336
  """Handle shell completion for cluster names."""
@@ -1428,17 +1487,20 @@ def _handle_jobs_queue_request(
1428
1487
 
1429
1488
 
1430
1489
  def _handle_services_request(
1431
- request_id: str,
1432
- service_names: Optional[List[str]],
1433
- show_all: bool,
1434
- show_endpoint: bool,
1435
- is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
1490
+ request_id: str,
1491
+ service_names: Optional[List[str]],
1492
+ show_all: bool,
1493
+ show_endpoint: bool,
1494
+ pool: bool = False, # pylint: disable=redefined-outer-name
1495
+ is_called_by_user: bool = False
1496
+ ) -> Tuple[Optional[int], str]:
1436
1497
  """Get service statuses.
1437
1498
 
1438
1499
  Args:
1439
1500
  service_names: If not None, only show the statuses of these services.
1440
1501
  show_all: Show all information of each service.
1441
1502
  show_endpoint: If True, only show the endpoint of the service.
1503
+ pool: If True, the request is for a pool. Otherwise for a service.
1442
1504
  is_called_by_user: If this function is called by user directly, or an
1443
1505
  internal call.
1444
1506
 
@@ -1447,6 +1509,7 @@ def _handle_services_request(
1447
1509
  is an error when querying the services. In this case, msg contains the
1448
1510
  error message. Otherwise, msg contains the formatted service table.
1449
1511
  """
1512
+ noun = 'pool' if pool else 'service'
1450
1513
  num_services = None
1451
1514
  try:
1452
1515
  if not is_called_by_user:
@@ -1483,11 +1546,11 @@ def _handle_services_request(
1483
1546
  # print the original error.
1484
1547
  pass
1485
1548
  if not msg:
1486
- msg = ('Failed to fetch service statuses due to connection issues. '
1549
+ msg = (f'Failed to fetch {noun} statuses due to connection issues. '
1487
1550
  'Please try again later. Details: '
1488
1551
  f'{common_utils.format_exception(e, use_bracket=True)}')
1489
1552
  except Exception as e: # pylint: disable=broad-except
1490
- msg = ('Failed to fetch service statuses: '
1553
+ msg = (f'Failed to fetch {noun} statuses: '
1491
1554
  f'{common_utils.format_exception(e, use_bracket=True)}')
1492
1555
  else:
1493
1556
  if show_endpoint:
@@ -1502,14 +1565,16 @@ def _handle_services_request(
1502
1565
  endpoint = service_records[0]['endpoint']
1503
1566
  msg = '-' if endpoint is None else endpoint
1504
1567
  else:
1505
- msg = serve_lib.format_service_table(service_records, show_all)
1568
+ msg = serve_lib.format_service_table(service_records, show_all,
1569
+ pool)
1506
1570
  service_not_found_msg = ''
1507
1571
  if service_names is not None:
1508
1572
  for service_name in service_names:
1509
1573
  if not any(service_name == record['name']
1510
1574
  for record in service_records):
1511
1575
  service_not_found_msg += (
1512
- f'\nService {service_name!r} not found.')
1576
+ f'\n{noun.capitalize()} '
1577
+ f'{service_name!r} not found.')
1513
1578
  if service_not_found_msg:
1514
1579
  msg += f'\n{service_not_found_msg}'
1515
1580
  return num_services, msg
@@ -1665,6 +1730,11 @@ def _show_enabled_infra(active_workspace: str, show_workspace: bool):
1665
1730
  is_flag=True,
1666
1731
  required=False,
1667
1732
  help='Also show sky serve services, if any.')
1733
+ @click.option('--show-pools/--no-show-pools',
1734
+ default=True,
1735
+ is_flag=True,
1736
+ required=False,
1737
+ help='Also show cluster pools, if any.')
1668
1738
  @click.option(
1669
1739
  '--kubernetes',
1670
1740
  '--k8s',
@@ -1684,8 +1754,8 @@ def _show_enabled_infra(active_workspace: str, show_workspace: bool):
1684
1754
  # pylint: disable=redefined-builtin
1685
1755
  def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1686
1756
  endpoint: Optional[int], show_managed_jobs: bool,
1687
- show_services: bool, kubernetes: bool, clusters: List[str],
1688
- all_users: bool):
1757
+ show_services: bool, show_pools: bool, kubernetes: bool,
1758
+ clusters: List[str], all_users: bool):
1689
1759
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
1690
1760
  """Show clusters.
1691
1761
 
@@ -1807,6 +1877,9 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1807
1877
  def submit_services() -> Optional[str]:
1808
1878
  return serve_lib.status(service_names=None)
1809
1879
 
1880
+ def submit_pools() -> Optional[str]:
1881
+ return managed_jobs.pool_status(pool_names=None)
1882
+
1810
1883
  def submit_workspace() -> Optional[str]:
1811
1884
  try:
1812
1885
  return sdk.workspaces()
@@ -1823,6 +1896,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1823
1896
  managed_jobs_queue_request_id = None
1824
1897
  service_status_request_id = None
1825
1898
  workspace_request_id = None
1899
+ pool_status_request_id = None
1826
1900
 
1827
1901
  # Submit all requests in parallel
1828
1902
  with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
@@ -1830,6 +1904,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1830
1904
  managed_jobs_request_future = executor.submit(submit_managed_jobs)
1831
1905
  if show_services:
1832
1906
  services_request_future = executor.submit(submit_services)
1907
+ if show_pools:
1908
+ pools_request_future = executor.submit(submit_pools)
1833
1909
  if not (ip or show_endpoints):
1834
1910
  workspace_request_future = executor.submit(submit_workspace)
1835
1911
 
@@ -1838,13 +1914,17 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1838
1914
  managed_jobs_queue_request_id = managed_jobs_request_future.result()
1839
1915
  if show_services:
1840
1916
  service_status_request_id = services_request_future.result()
1917
+ if show_pools:
1918
+ pool_status_request_id = pools_request_future.result()
1841
1919
  if not (ip or show_endpoints):
1842
1920
  workspace_request_id = workspace_request_future.result()
1843
1921
 
1844
- managed_jobs_queue_request_id = '' if not managed_jobs_queue_request_id \
1845
- else managed_jobs_queue_request_id
1846
- service_status_request_id = '' if not service_status_request_id \
1847
- else service_status_request_id
1922
+ managed_jobs_queue_request_id = ('' if not managed_jobs_queue_request_id
1923
+ else managed_jobs_queue_request_id)
1924
+ service_status_request_id = ('' if not service_status_request_id else
1925
+ service_status_request_id)
1926
+ pool_status_request_id = ('' if not pool_status_request_id else
1927
+ pool_status_request_id)
1848
1928
 
1849
1929
  # Phase 3: Get cluster records and handle special cases
1850
1930
  cluster_records = _get_cluster_records_and_set_ssh_config(
@@ -1919,7 +1999,34 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1919
1999
  job_info += '. '
1920
2000
  hints.append(
1921
2001
  controller_utils.Controllers.JOBS_CONTROLLER.value.
1922
- in_progress_hint.format(job_info=job_info))
2002
+ in_progress_hint(False).format(job_info=job_info))
2003
+
2004
+ if show_pools:
2005
+ num_pools = None
2006
+ if managed_jobs_query_interrupted:
2007
+ msg = 'KeyboardInterrupt'
2008
+ else:
2009
+ with rich_utils.client_status('[cyan]Checking pools[/]'):
2010
+ try:
2011
+ num_pools, msg = _handle_services_request(
2012
+ pool_status_request_id,
2013
+ service_names=None,
2014
+ show_all=False,
2015
+ show_endpoint=False,
2016
+ pool=True,
2017
+ is_called_by_user=False)
2018
+ except KeyboardInterrupt:
2019
+ sdk.api_cancel(pool_status_request_id, silent=True)
2020
+ num_pools = -1
2021
+ msg = 'KeyboardInterrupt'
2022
+ if num_pools is not None:
2023
+ if num_pools > 0:
2024
+ click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
2025
+ f'Pools{colorama.Style.RESET_ALL}')
2026
+ click.echo(msg)
2027
+ hints.append(
2028
+ controller_utils.Controllers.SKY_SERVE_CONTROLLER.value.
2029
+ in_progress_hint(True))
1923
2030
 
1924
2031
  if show_services:
1925
2032
  click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
@@ -1942,8 +2049,9 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1942
2049
  msg = 'KeyboardInterrupt'
1943
2050
  click.echo(msg)
1944
2051
  if num_services is not None:
1945
- hints.append(controller_utils.Controllers.SKY_SERVE_CONTROLLER.
1946
- value.in_progress_hint)
2052
+ hints.append(
2053
+ controller_utils.Controllers.SKY_SERVE_CONTROLLER.value.
2054
+ in_progress_hint(False))
1947
2055
 
1948
2056
  if num_pending_autostop > 0 and not refresh:
1949
2057
  # Don't print this hint if there's no pending autostop or user has
@@ -4193,6 +4301,17 @@ def jobs():
4193
4301
  is_flag=True,
4194
4302
  help=('If True, as soon as a job is submitted, return from this call '
4195
4303
  'and do not stream execution logs.'))
4304
+ @click.option('--pool',
4305
+ '-p',
4306
+ default=None,
4307
+ type=str,
4308
+ required=False,
4309
+ help='(Experimental; optional) Pool to use for jobs submission.')
4310
+ @click.option('--num-jobs',
4311
+ default=None,
4312
+ type=int,
4313
+ required=False,
4314
+ help='Number of jobs to submit.')
4196
4315
  @click.option('--git-url', type=str, help='Git repository URL.')
4197
4316
  @click.option('--git-ref',
4198
4317
  type=str,
@@ -4226,6 +4345,8 @@ def jobs_launch(
4226
4345
  ports: Tuple[str],
4227
4346
  detach_run: bool,
4228
4347
  yes: bool,
4348
+ pool: Optional[str], # pylint: disable=redefined-outer-name
4349
+ num_jobs: Optional[int],
4229
4350
  async_call: bool,
4230
4351
  config_override: Optional[Dict[str, Any]] = None,
4231
4352
  git_url: Optional[str] = None,
@@ -4245,6 +4366,9 @@ def jobs_launch(
4245
4366
 
4246
4367
  sky jobs launch 'echo hello!'
4247
4368
  """
4369
+ if pool is None and num_jobs is not None:
4370
+ raise click.UsageError('Cannot specify --num-jobs without --pool.')
4371
+
4248
4372
  if cluster is not None:
4249
4373
  if name is not None and name != cluster:
4250
4374
  raise click.UsageError('Cannot specify both --name and --cluster. '
@@ -4295,22 +4419,63 @@ def jobs_launch(
4295
4419
 
4296
4420
  common_utils.check_cluster_name_is_valid(name)
4297
4421
 
4422
+ if pool is not None:
4423
+ num_job_int = num_jobs if num_jobs is not None else 1
4424
+ plural = '' if num_job_int == 1 else 's'
4425
+ click.secho(f'Submitting to pool {colorama.Fore.CYAN}{pool!r}'
4426
+ f'{colorama.Style.RESET_ALL} with {colorama.Fore.CYAN}'
4427
+ f'{num_job_int}{colorama.Style.RESET_ALL} job{plural}.')
4428
+ print_setup_fm_warning = False
4429
+ for task_ in dag.tasks:
4430
+ if (task_.setup is not None or task_.file_mounts or
4431
+ task_.storage_mounts):
4432
+ print_setup_fm_warning = True
4433
+ break
4434
+ if print_setup_fm_warning:
4435
+ click.secho(
4436
+ f'{colorama.Fore.YELLOW}setup/file_mounts/storage_mounts'
4437
+ ' will be ignored in pool. To update a pool, please '
4438
+ f'use `sky pool apply {pool} pool.yaml`. '
4439
+ f'{colorama.Style.RESET_ALL}')
4440
+
4298
4441
  # Optimize info is only show if _need_confirmation.
4299
4442
  if not yes:
4300
4443
  click.secho(
4301
4444
  f'Managed job {dag.name!r} will be launched on (estimated):',
4302
4445
  fg='yellow')
4303
4446
 
4304
- request_id = managed_jobs.launch(dag, name, _need_confirmation=not yes)
4447
+ request_id = managed_jobs.launch(dag,
4448
+ name,
4449
+ pool,
4450
+ num_jobs,
4451
+ _need_confirmation=not yes)
4305
4452
  job_id_handle = _async_call_or_wait(request_id, async_call,
4306
4453
  'sky.jobs.launch')
4454
+
4307
4455
  if not async_call and not detach_run:
4308
- job_id = job_id_handle[0]
4309
- returncode = managed_jobs.tail_logs(name=None,
4310
- job_id=job_id,
4311
- follow=True,
4312
- controller=False)
4313
- sys.exit(returncode)
4456
+ job_ids = job_id_handle[0]
4457
+ if isinstance(job_ids, int) or len(job_ids) == 1:
4458
+ job_id = job_ids if isinstance(job_ids, int) else job_ids[0]
4459
+ returncode = managed_jobs.tail_logs(name=None,
4460
+ job_id=job_id,
4461
+ follow=True,
4462
+ controller=False)
4463
+ sys.exit(returncode)
4464
+ else:
4465
+ job_ids_str = _format_job_ids_str(job_ids)
4466
+ click.secho(
4467
+ f'Jobs submitted with IDs: {colorama.Fore.CYAN}'
4468
+ f'{job_ids_str}{colorama.Style.RESET_ALL}.'
4469
+ f'\n📋 Useful Commands'
4470
+ f'\n{ux_utils.INDENT_SYMBOL}To stream job logs:\t\t\t'
4471
+ f'{ux_utils.BOLD}sky jobs logs <job-id>'
4472
+ f'{ux_utils.RESET_BOLD}'
4473
+ f'\n{ux_utils.INDENT_SYMBOL}To stream controller logs:\t\t'
4474
+ f'{ux_utils.BOLD}sky jobs logs --controller <job-id>'
4475
+ f'{ux_utils.RESET_BOLD}'
4476
+ f'\n{ux_utils.INDENT_LAST_SYMBOL}To cancel all jobs on the '
4477
+ f'pool:\t{ux_utils.BOLD}sky jobs cancel --pool {pool}'
4478
+ f'{ux_utils.RESET_BOLD}')
4314
4479
 
4315
4480
 
4316
4481
  @jobs.command('queue', cls=_DocumentedCodeCommand)
@@ -4420,14 +4585,25 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
4420
4585
  required=False,
4421
4586
  type=str,
4422
4587
  help='Managed job name to cancel.')
4588
+ @click.option('--pool',
4589
+ '-p',
4590
+ required=False,
4591
+ type=str,
4592
+ help='Pool name to cancel.')
4423
4593
  @click.argument('job_ids', default=None, type=int, required=False, nargs=-1)
4424
4594
  @flags.all_option('Cancel all managed jobs for the current user.')
4425
4595
  @flags.yes_option()
4426
4596
  @flags.all_users_option('Cancel all managed jobs from all users.')
4427
4597
  @usage_lib.entrypoint
4428
4598
  # pylint: disable=redefined-builtin
4429
- def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
4430
- all_users: bool):
4599
+ def jobs_cancel(
4600
+ name: Optional[str],
4601
+ pool: Optional[str], # pylint: disable=redefined-outer-name
4602
+ job_ids: Tuple[int],
4603
+ all: bool,
4604
+ yes: bool,
4605
+ all_users: bool,
4606
+ ):
4431
4607
  """Cancel managed jobs.
4432
4608
 
4433
4609
  You can provide either a job name or a list of job IDs to be cancelled.
@@ -4442,22 +4618,29 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
4442
4618
  \b
4443
4619
  # Cancel managed jobs with IDs 1, 2, 3
4444
4620
  $ sky jobs cancel 1 2 3
4621
+ \b
4622
+ # Cancel all managed jobs in pool 'my-pool'
4623
+ $ sky jobs cancel -p my-pool
4445
4624
  """
4446
4625
  job_id_str = ','.join(map(str, job_ids))
4447
- if sum([bool(job_ids), name is not None, all or all_users]) != 1:
4626
+ if sum([
4627
+ bool(job_ids), name is not None, pool is not None, all or all_users
4628
+ ]) != 1:
4448
4629
  arguments = []
4449
4630
  arguments += [f'--job-ids {job_id_str}'] if job_ids else []
4450
4631
  arguments += [f'--name {name}'] if name is not None else []
4632
+ arguments += [f'--pool {pool}'] if pool is not None else []
4451
4633
  arguments += ['--all'] if all else []
4452
4634
  arguments += ['--all-users'] if all_users else []
4453
4635
  raise click.UsageError(
4454
- 'Can only specify one of JOB_IDS, --name, or --all/--all-users. '
4455
- f'Provided {" ".join(arguments)!r}.')
4636
+ 'Can only specify one of JOB_IDS, --name, --pool, or '
4637
+ f'--all/--all-users. Provided {" ".join(arguments)!r}.')
4456
4638
 
4457
4639
  if not yes:
4458
4640
  plural = 's' if len(job_ids) > 1 else ''
4459
4641
  job_identity_str = (f'managed job{plural} with ID{plural} {job_id_str}'
4460
- if job_ids else repr(name))
4642
+ if job_ids else f'{name!r}' if name is not None else
4643
+ f'managed jobs in pool {pool!r}')
4461
4644
  if all_users:
4462
4645
  job_identity_str = 'all managed jobs FOR ALL USERS'
4463
4646
  elif all:
@@ -4470,6 +4653,7 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
4470
4653
  sdk.stream_and_get(
4471
4654
  managed_jobs.cancel(job_ids=job_ids,
4472
4655
  name=name,
4656
+ pool=pool,
4473
4657
  all=all,
4474
4658
  all_users=all_users))
4475
4659
 
@@ -4547,24 +4731,47 @@ def jobs_dashboard():
4547
4731
  sdk.dashboard(starting_page='jobs')
4548
4732
 
4549
4733
 
4550
- @cli.command(cls=_DocumentedCodeCommand)
4551
- @flags.config_option(expose_value=False)
4552
- @usage_lib.entrypoint
4553
- def dashboard() -> None:
4554
- """Starts the dashboard for skypilot."""
4555
- sdk.dashboard()
4556
-
4557
-
4558
- @cli.group(cls=_NaturalOrderGroup)
4559
- def serve():
4560
- """SkyServe CLI (multi-region, multi-cloud serving)."""
4734
+ @jobs.group(cls=_NaturalOrderGroup)
4735
+ def pool():
4736
+ """(Experimental) Pool management commands."""
4561
4737
  pass
4562
4738
 
4563
4739
 
4564
- def _generate_task_with_service(
4565
- service_name: str,
4566
- service_yaml_args: Tuple[str, ...],
4740
+ # TODO(MaoZiming): Update Doc.
4741
+ # TODO(MaoZiming): Expose mix replica traffic option to user.
4742
+ # Currently, we do not mix traffic from old and new replicas.
4743
+ @pool.command('apply', cls=_DocumentedCodeCommand)
4744
+ @flags.config_option(expose_value=False)
4745
+ @click.argument('pool_yaml',
4746
+ required=True,
4747
+ type=str,
4748
+ nargs=-1,
4749
+ **_get_shell_complete_args(_complete_file_name))
4750
+ @click.option('--pool-name',
4751
+ '-p',
4752
+ default=None,
4753
+ type=str,
4754
+ help='A pool name. Unique for each pool. If not provided, '
4755
+ 'a unique name is autogenerated.')
4756
+ @click.option('--mode',
4757
+ default=serve_lib.DEFAULT_UPDATE_MODE.value,
4758
+ type=click.Choice([m.value for m in serve_lib.UpdateMode],
4759
+ case_sensitive=False),
4760
+ required=False,
4761
+ help=('Update mode. If "rolling", cluster pool will be updated '
4762
+ 'with rolling update. If "blue_green", cluster pool will '
4763
+ 'be updated with blue-green update. This option is only '
4764
+ 'valid when the pool is already running.'))
4765
+ @_add_click_options(flags.TASK_OPTIONS + flags.EXTRA_RESOURCES_OPTIONS +
4766
+ flags.COMMON_OPTIONS)
4767
+ @flags.yes_option()
4768
+ @timeline.event
4769
+ @usage_lib.entrypoint
4770
+ def jobs_pool_apply(
4771
+ pool_yaml: Tuple[str, ...],
4772
+ pool_name: Optional[str],
4567
4773
  workdir: Optional[str],
4774
+ infra: Optional[str],
4568
4775
  cloud: Optional[str],
4569
4776
  region: Optional[str],
4570
4777
  zone: Optional[str],
@@ -4573,21 +4780,196 @@ def _generate_task_with_service(
4573
4780
  image_id: Optional[str],
4574
4781
  env_file: Optional[Dict[str, str]],
4575
4782
  env: List[Tuple[str, str]],
4576
- secret: Optional[List[Tuple[str, str]]],
4783
+ secret: List[Tuple[str, str]],
4577
4784
  gpus: Optional[str],
4578
4785
  instance_type: Optional[str],
4579
- ports: Optional[Tuple[str]],
4786
+ ports: Tuple[str],
4580
4787
  cpus: Optional[str],
4581
4788
  memory: Optional[str],
4582
4789
  disk_size: Optional[int],
4583
4790
  disk_tier: Optional[str],
4584
4791
  network_tier: Optional[str],
4585
- not_supported_cmd: str,
4792
+ mode: str,
4793
+ yes: bool,
4794
+ async_call: bool,
4795
+ ):
4796
+ """Apply a config to a cluster pool for managed jobs submission.
4797
+
4798
+ If the pool is already running, the config will be applied to the pool.
4799
+ Otherwise, a new pool will be created.
4800
+
4801
+ POOL_YAML must point to a valid YAML file.
4802
+ """
4803
+ cloud, region, zone = _handle_infra_cloud_region_zone_options(
4804
+ infra, cloud, region, zone)
4805
+ if pool_name is None:
4806
+ pool_name = serve_lib.generate_service_name(pool=True)
4807
+
4808
+ task = _generate_task_with_service(
4809
+ service_name=pool_name,
4810
+ service_yaml_args=pool_yaml,
4811
+ workdir=workdir,
4812
+ cloud=cloud,
4813
+ region=region,
4814
+ zone=zone,
4815
+ gpus=gpus,
4816
+ cpus=cpus,
4817
+ memory=memory,
4818
+ instance_type=instance_type,
4819
+ num_nodes=num_nodes,
4820
+ use_spot=use_spot,
4821
+ image_id=image_id,
4822
+ env_file=env_file,
4823
+ env=env,
4824
+ secret=secret,
4825
+ disk_size=disk_size,
4826
+ disk_tier=disk_tier,
4827
+ network_tier=network_tier,
4828
+ ports=ports,
4829
+ not_supported_cmd='sky jobs pool up',
4830
+ pool=True,
4831
+ )
4832
+ assert task.service is not None
4833
+ if not task.service.pool:
4834
+ raise click.UsageError('The YAML file needs a `pool` section.')
4835
+ click.secho('Pool spec:', fg='cyan')
4836
+ click.echo(task.service)
4837
+ serve_lib.validate_service_task(task, pool=True)
4838
+
4839
+ click.secho(
4840
+ 'Each pool worker will use the following resources (estimated):',
4841
+ fg='cyan')
4842
+ with sky.Dag() as dag:
4843
+ dag.add(task)
4844
+
4845
+ request_id = managed_jobs.pool_apply(task,
4846
+ pool_name,
4847
+ mode=serve_lib.UpdateMode(mode),
4848
+ _need_confirmation=not yes)
4849
+ _async_call_or_wait(request_id, async_call, 'sky.jobs.pool_apply')
4850
+
4851
+
4852
+ @pool.command('status', cls=_DocumentedCodeCommand)
4853
+ @flags.config_option(expose_value=False)
4854
+ @flags.verbose_option()
4855
+ @click.argument('pool_names', required=False, type=str, nargs=-1)
4856
+ @usage_lib.entrypoint
4857
+ # pylint: disable=redefined-builtin
4858
+ def jobs_pool_status(verbose: bool, pool_names: List[str]):
4859
+ """Show statuses of cluster pools.
4860
+
4861
+ Show detailed statuses of one or more pools. If POOL_NAME is not
4862
+ provided, show all pools' status.
4863
+ """
4864
+ pool_names_to_query: Optional[List[str]] = pool_names
4865
+ if not pool_names:
4866
+ pool_names_to_query = None
4867
+ with rich_utils.client_status('[cyan]Checking pools[/]'):
4868
+ pool_status_request_id = managed_jobs.pool_status(pool_names_to_query)
4869
+ _, msg = _handle_services_request(pool_status_request_id,
4870
+ service_names=pool_names_to_query,
4871
+ show_all=verbose,
4872
+ show_endpoint=False,
4873
+ pool=True,
4874
+ is_called_by_user=True)
4875
+
4876
+ click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
4877
+ f'Pools{colorama.Style.RESET_ALL}')
4878
+ click.echo(msg)
4879
+
4880
+
4881
+ @pool.command('down', cls=_DocumentedCodeCommand)
4882
+ @flags.config_option(expose_value=False)
4883
+ @click.argument('pool_names', required=False, type=str, nargs=-1)
4884
+ @flags.all_option('Delete all pools.')
4885
+ @click.option('--purge',
4886
+ '-p',
4887
+ default=False,
4888
+ is_flag=True,
4889
+ help='Tear down pools in failed status.')
4890
+ @flags.yes_option()
4891
+ @_add_click_options(flags.COMMON_OPTIONS)
4892
+ @usage_lib.entrypoint
4893
+ # pylint: disable=redefined-builtin
4894
+ def jobs_pool_down(
4895
+ pool_names: List[str],
4896
+ all: bool,
4897
+ purge: bool,
4898
+ yes: bool,
4899
+ async_call: bool,
4900
+ ) -> None:
4901
+ """Delete pool(s).
4902
+
4903
+ POOL_NAMES is the name of the pool (or glob pattern) to delete. If
4904
+ both POOL_NAMES and ``--all`` are supplied, the latter takes precedence.
4905
+
4906
+ Deleting a pool will delete all of its workers and associated resources.
4907
+ """
4908
+ if sum([bool(pool_names), all]) != 1:
4909
+ argument_str = (f'POOL_NAMES={",".join(pool_names)}'
4910
+ if pool_names else '')
4911
+ argument_str += ' --all' if all else ''
4912
+ raise click.UsageError('Can only specify one of POOL_NAMES or --all. '
4913
+ f'Provided {argument_str!r}.')
4914
+
4915
+ if not yes:
4916
+ quoted_pool_names = [f'{name!r}' for name in pool_names]
4917
+ list_pool_str = ', '.join(quoted_pool_names)
4918
+ pool_identity_str = f'pool(s) {list_pool_str}'
4919
+ if all:
4920
+ pool_identity_str = 'all pools'
4921
+ click.confirm(f'Terminating {pool_identity_str}. Proceed?',
4922
+ default=True,
4923
+ abort=True,
4924
+ show_default=True)
4925
+
4926
+ request_id = managed_jobs.pool_down(pool_names, all=all, purge=purge)
4927
+ _async_call_or_wait(request_id, async_call, 'sky.jobs.pool_down')
4928
+
4929
+
4930
+ @cli.command(cls=_DocumentedCodeCommand)
4931
+ @flags.config_option(expose_value=False)
4932
+ @usage_lib.entrypoint
4933
+ def dashboard() -> None:
4934
+ """Starts the dashboard for skypilot."""
4935
+ sdk.dashboard()
4936
+
4937
+
4938
+ @cli.group(cls=_NaturalOrderGroup)
4939
+ def serve():
4940
+ """SkyServe CLI (multi-region, multi-cloud serving)."""
4941
+ pass
4942
+
4943
+
4944
+ def _generate_task_with_service(
4945
+ service_name: str,
4946
+ service_yaml_args: Tuple[str, ...],
4947
+ workdir: Optional[str],
4948
+ cloud: Optional[str],
4949
+ region: Optional[str],
4950
+ zone: Optional[str],
4951
+ num_nodes: Optional[int],
4952
+ use_spot: Optional[bool],
4953
+ image_id: Optional[str],
4954
+ env_file: Optional[Dict[str, str]],
4955
+ env: List[Tuple[str, str]],
4956
+ secret: Optional[List[Tuple[str, str]]],
4957
+ gpus: Optional[str],
4958
+ instance_type: Optional[str],
4959
+ ports: Optional[Tuple[str]],
4960
+ cpus: Optional[str],
4961
+ memory: Optional[str],
4962
+ disk_size: Optional[int],
4963
+ disk_tier: Optional[str],
4964
+ network_tier: Optional[str],
4965
+ not_supported_cmd: str,
4966
+ pool: bool, # pylint: disable=redefined-outer-name
4586
4967
  ) -> sky.Task:
4587
4968
  """Generate a task with service section from a service YAML file."""
4588
4969
  is_yaml, _ = _check_yaml(''.join(service_yaml_args))
4970
+ yaml_name = 'SERVICE_YAML' if not pool else 'POOL_YAML'
4589
4971
  if not is_yaml:
4590
- raise click.UsageError('SERVICE_YAML must be a valid YAML file.')
4972
+ raise click.UsageError(f'{yaml_name} must be a valid YAML file.')
4591
4973
  env = _merge_env_vars(env_file, env)
4592
4974
  # We keep nargs=-1 in service_yaml argument to reuse this function.
4593
4975
  task = _make_task_or_dag_from_entrypoint_with_overrides(
@@ -4617,9 +4999,17 @@ def _generate_task_with_service(
4617
4999
  _DAG_NOT_SUPPORTED_MESSAGE.format(command=not_supported_cmd))
4618
5000
 
4619
5001
  if task.service is None:
5002
+ field_name = 'service' if not pool else 'pool'
4620
5003
  with ux_utils.print_exception_no_traceback():
4621
- raise ValueError('Service section not found in the YAML file. '
4622
- 'To fix, add a valid `service` field.')
5004
+ raise ValueError(f'{field_name.capitalize()} section not found '
5005
+ 'in the YAML file. To fix, add a valid '
5006
+ f'`{field_name}` field.')
5007
+
5008
+ if task.service.pool:
5009
+ if task.service.ports is not None or ports:
5010
+ with ux_utils.print_exception_no_traceback():
5011
+ raise ValueError('Cannot specify ports in a cluster pool.')
5012
+ return task
4623
5013
 
4624
5014
  # NOTE(yi): we only allow one service port now.
4625
5015
  service_port: Optional[int] = int(
@@ -4779,10 +5169,14 @@ def serve_up(
4779
5169
  network_tier=network_tier,
4780
5170
  ports=ports,
4781
5171
  not_supported_cmd='sky serve up',
5172
+ pool=False,
4782
5173
  )
5174
+ assert task.service is not None
5175
+ if task.service.pool:
5176
+ raise click.UsageError('The YAML file needs a `service` section.')
4783
5177
  click.secho('Service spec:', fg='cyan')
4784
5178
  click.echo(task.service)
4785
- serve_lib.validate_service_task(task)
5179
+ serve_lib.validate_service_task(task, pool=False)
4786
5180
 
4787
5181
  click.secho('Each replica will use the following resources (estimated):',
4788
5182
  fg='cyan')
@@ -4881,10 +5275,11 @@ def serve_update(
4881
5275
  network_tier=network_tier,
4882
5276
  ports=ports,
4883
5277
  not_supported_cmd='sky serve update',
5278
+ pool=False,
4884
5279
  )
4885
5280
  click.secho('Service spec:', fg='cyan')
4886
5281
  click.echo(task.service)
4887
- serve_lib.validate_service_task(task)
5282
+ serve_lib.validate_service_task(task, pool=False)
4888
5283
 
4889
5284
  click.secho('New replica will use the following resources (estimated):',
4890
5285
  fg='cyan')