skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250801__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (81)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +452 -53
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-42cd1b19a6b01078.js} +1 -1
  11. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  12. sky/dashboard/out/clusters/[cluster].html +1 -1
  13. sky/dashboard/out/clusters.html +1 -1
  14. sky/dashboard/out/config.html +1 -1
  15. sky/dashboard/out/index.html +1 -1
  16. sky/dashboard/out/infra/[context].html +1 -1
  17. sky/dashboard/out/infra.html +1 -1
  18. sky/dashboard/out/jobs/[job].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/dashboard/out/users.html +1 -1
  21. sky/dashboard/out/volumes.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/data/data_utils.py +21 -1
  26. sky/data/storage.py +12 -0
  27. sky/jobs/__init__.py +3 -0
  28. sky/jobs/client/sdk.py +80 -3
  29. sky/jobs/controller.py +76 -25
  30. sky/jobs/recovery_strategy.py +80 -34
  31. sky/jobs/scheduler.py +68 -20
  32. sky/jobs/server/core.py +228 -136
  33. sky/jobs/server/server.py +40 -0
  34. sky/jobs/state.py +129 -24
  35. sky/jobs/utils.py +109 -51
  36. sky/provision/nebius/constants.py +3 -0
  37. sky/provision/runpod/utils.py +27 -12
  38. sky/py.typed +0 -0
  39. sky/resources.py +16 -12
  40. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  41. sky/serve/autoscalers.py +8 -0
  42. sky/serve/client/impl.py +188 -0
  43. sky/serve/client/sdk.py +12 -82
  44. sky/serve/constants.py +5 -1
  45. sky/serve/controller.py +5 -0
  46. sky/serve/replica_managers.py +112 -37
  47. sky/serve/serve_state.py +16 -6
  48. sky/serve/serve_utils.py +274 -77
  49. sky/serve/server/core.py +8 -525
  50. sky/serve/server/impl.py +709 -0
  51. sky/serve/service.py +13 -9
  52. sky/serve/service_spec.py +74 -4
  53. sky/server/constants.py +1 -1
  54. sky/server/daemons.py +164 -0
  55. sky/server/requests/payloads.py +33 -0
  56. sky/server/requests/requests.py +2 -107
  57. sky/server/requests/serializers/decoders.py +12 -3
  58. sky/server/requests/serializers/encoders.py +13 -2
  59. sky/server/server.py +2 -1
  60. sky/server/uvicorn.py +2 -1
  61. sky/sky_logging.py +30 -0
  62. sky/skylet/constants.py +2 -1
  63. sky/skylet/events.py +9 -0
  64. sky/skypilot_config.py +24 -21
  65. sky/task.py +41 -11
  66. sky/templates/jobs-controller.yaml.j2 +3 -0
  67. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  68. sky/users/server.py +1 -1
  69. sky/utils/command_runner.py +4 -2
  70. sky/utils/controller_utils.py +14 -10
  71. sky/utils/dag_utils.py +4 -2
  72. sky/utils/db/migration_utils.py +2 -4
  73. sky/utils/schemas.py +47 -19
  74. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/METADATA +1 -1
  75. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/RECORD +81 -76
  76. /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_buildManifest.js +0 -0
  77. /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_ssgManifest.js +0 -0
  78. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/WHEEL +0 -0
  79. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/entry_points.txt +0 -0
  80. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/licenses/LICENSE +0 -0
  81. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/top_level.txt +0 -0
sky/client/cli/command.py CHANGED
@@ -62,6 +62,7 @@ from sky.provision.kubernetes import constants as kubernetes_constants
62
62
  from sky.provision.kubernetes import utils as kubernetes_utils
63
63
  from sky.server import common as server_common
64
64
  from sky.server import constants as server_constants
65
+ from sky.server import versions
65
66
  from sky.server.requests import requests
66
67
  from sky.skylet import constants
67
68
  from sky.skylet import job_lib
@@ -272,6 +273,65 @@ def _merge_env_vars(env_dict: Optional[Dict[str, str]],
272
273
  return list(env_dict.items())
273
274
 
274
275
 
276
+ def _format_job_ids_str(job_ids: List[int], max_length: int = 30) -> str:
277
+ """Format job IDs string with ellipsis if too long.
278
+
279
+ Args:
280
+ job_ids: List of job IDs to format.
281
+ max_length: Maximum length of the output string.
282
+
283
+ Returns:
284
+ Formatted string like "11,12,...,2017,2018" if truncated,
285
+ or the full string if it fits within max_length.
286
+ """
287
+ if not job_ids:
288
+ return ''
289
+
290
+ # Convert all to strings
291
+ job_strs = [str(job_id) for job_id in job_ids]
292
+ full_str = ','.join(job_strs)
293
+
294
+ # If it fits, return as is
295
+ if len(full_str) <= max_length:
296
+ return full_str
297
+
298
+ if len(job_strs) <= 2:
299
+ return full_str # Can't truncate further
300
+
301
+ # Need to truncate with ellipsis
302
+ ellipsis = '...'
303
+
304
+ # Start with minimum: first and last
305
+ start_count = 1
306
+ end_count = 1
307
+
308
+ while start_count + end_count < len(job_strs):
309
+ # Try adding one more to start
310
+ if start_count + 1 + end_count < len(job_strs):
311
+ start_part = ','.join(job_strs[:start_count + 1])
312
+ end_part = ','.join(job_strs[-end_count:])
313
+ candidate = f'{start_part},{ellipsis},{end_part}'
314
+ if len(candidate) <= max_length:
315
+ start_count += 1
316
+ continue
317
+
318
+ # Try adding one more to end
319
+ if start_count + end_count + 1 < len(job_strs):
320
+ start_part = ','.join(job_strs[:start_count])
321
+ end_part = ','.join(job_strs[-(end_count + 1):])
322
+ candidate = f'{start_part},{ellipsis},{end_part}'
323
+ if len(candidate) <= max_length:
324
+ end_count += 1
325
+ continue
326
+
327
+ # Can't add more
328
+ break
329
+
330
+ start_part = ','.join(job_strs[:start_count])
331
+ end_part = ','.join(job_strs[-end_count:])
332
+ return f'{start_part},{ellipsis},{end_part}'
333
+
334
+
275
335
  def _complete_cluster_name(ctx: click.Context, param: click.Parameter,
276
336
  incomplete: str) -> List[str]:
277
337
  """Handle shell completion for cluster names."""
@@ -1428,17 +1488,20 @@ def _handle_jobs_queue_request(
1428
1488
 
1429
1489
 
1430
1490
  def _handle_services_request(
1431
- request_id: str,
1432
- service_names: Optional[List[str]],
1433
- show_all: bool,
1434
- show_endpoint: bool,
1435
- is_called_by_user: bool = False) -> Tuple[Optional[int], str]:
1491
+ request_id: str,
1492
+ service_names: Optional[List[str]],
1493
+ show_all: bool,
1494
+ show_endpoint: bool,
1495
+ pool: bool = False, # pylint: disable=redefined-outer-name
1496
+ is_called_by_user: bool = False
1497
+ ) -> Tuple[Optional[int], str]:
1436
1498
  """Get service statuses.
1437
1499
 
1438
1500
  Args:
1439
1501
  service_names: If not None, only show the statuses of these services.
1440
1502
  show_all: Show all information of each service.
1441
1503
  show_endpoint: If True, only show the endpoint of the service.
1504
+ pool: If True, the request is for a pool. Otherwise for a service.
1442
1505
  is_called_by_user: If this function is called by user directly, or an
1443
1506
  internal call.
1444
1507
 
@@ -1447,6 +1510,7 @@ def _handle_services_request(
1447
1510
  is an error when querying the services. In this case, msg contains the
1448
1511
  error message. Otherwise, msg contains the formatted service table.
1449
1512
  """
1513
+ noun = 'pool' if pool else 'service'
1450
1514
  num_services = None
1451
1515
  try:
1452
1516
  if not is_called_by_user:
@@ -1483,11 +1547,11 @@ def _handle_services_request(
1483
1547
  # print the original error.
1484
1548
  pass
1485
1549
  if not msg:
1486
- msg = ('Failed to fetch service statuses due to connection issues. '
1550
+ msg = (f'Failed to fetch {noun} statuses due to connection issues. '
1487
1551
  'Please try again later. Details: '
1488
1552
  f'{common_utils.format_exception(e, use_bracket=True)}')
1489
1553
  except Exception as e: # pylint: disable=broad-except
1490
- msg = ('Failed to fetch service statuses: '
1554
+ msg = (f'Failed to fetch {noun} statuses: '
1491
1555
  f'{common_utils.format_exception(e, use_bracket=True)}')
1492
1556
  else:
1493
1557
  if show_endpoint:
@@ -1502,14 +1566,16 @@ def _handle_services_request(
1502
1566
  endpoint = service_records[0]['endpoint']
1503
1567
  msg = '-' if endpoint is None else endpoint
1504
1568
  else:
1505
- msg = serve_lib.format_service_table(service_records, show_all)
1569
+ msg = serve_lib.format_service_table(service_records, show_all,
1570
+ pool)
1506
1571
  service_not_found_msg = ''
1507
1572
  if service_names is not None:
1508
1573
  for service_name in service_names:
1509
1574
  if not any(service_name == record['name']
1510
1575
  for record in service_records):
1511
1576
  service_not_found_msg += (
1512
- f'\nService {service_name!r} not found.')
1577
+ f'\n{noun.capitalize()} '
1578
+ f'{service_name!r} not found.')
1513
1579
  if service_not_found_msg:
1514
1580
  msg += f'\n{service_not_found_msg}'
1515
1581
  return num_services, msg
@@ -1665,6 +1731,11 @@ def _show_enabled_infra(active_workspace: str, show_workspace: bool):
1665
1731
  is_flag=True,
1666
1732
  required=False,
1667
1733
  help='Also show sky serve services, if any.')
1734
+ @click.option('--show-pools/--no-show-pools',
1735
+ default=True,
1736
+ is_flag=True,
1737
+ required=False,
1738
+ help='Also show cluster pools, if any.')
1668
1739
  @click.option(
1669
1740
  '--kubernetes',
1670
1741
  '--k8s',
@@ -1684,8 +1755,8 @@ def _show_enabled_infra(active_workspace: str, show_workspace: bool):
1684
1755
  # pylint: disable=redefined-builtin
1685
1756
  def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1686
1757
  endpoint: Optional[int], show_managed_jobs: bool,
1687
- show_services: bool, kubernetes: bool, clusters: List[str],
1688
- all_users: bool):
1758
+ show_services: bool, show_pools: bool, kubernetes: bool,
1759
+ clusters: List[str], all_users: bool):
1689
1760
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
1690
1761
  """Show clusters.
1691
1762
 
@@ -1757,6 +1828,9 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1757
1828
  show_endpoints = endpoints or endpoint is not None
1758
1829
  show_single_endpoint = endpoint is not None
1759
1830
  show_services = show_services and not any([clusters, ip, endpoints])
1831
+ remote_api_version = versions.get_remote_api_version()
1832
+ if remote_api_version is None or remote_api_version < 12:
1833
+ show_pools = False
1760
1834
 
1761
1835
  query_clusters: Optional[List[str]] = None if not clusters else clusters
1762
1836
  refresh_mode = common.StatusRefreshMode.NONE
@@ -1807,6 +1881,9 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1807
1881
  def submit_services() -> Optional[str]:
1808
1882
  return serve_lib.status(service_names=None)
1809
1883
 
1884
+ def submit_pools() -> Optional[str]:
1885
+ return managed_jobs.pool_status(pool_names=None)
1886
+
1810
1887
  def submit_workspace() -> Optional[str]:
1811
1888
  try:
1812
1889
  return sdk.workspaces()
@@ -1823,6 +1900,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1823
1900
  managed_jobs_queue_request_id = None
1824
1901
  service_status_request_id = None
1825
1902
  workspace_request_id = None
1903
+ pool_status_request_id = None
1826
1904
 
1827
1905
  # Submit all requests in parallel
1828
1906
  with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
@@ -1830,6 +1908,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1830
1908
  managed_jobs_request_future = executor.submit(submit_managed_jobs)
1831
1909
  if show_services:
1832
1910
  services_request_future = executor.submit(submit_services)
1911
+ if show_pools:
1912
+ pools_request_future = executor.submit(submit_pools)
1833
1913
  if not (ip or show_endpoints):
1834
1914
  workspace_request_future = executor.submit(submit_workspace)
1835
1915
 
@@ -1838,13 +1918,17 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1838
1918
  managed_jobs_queue_request_id = managed_jobs_request_future.result()
1839
1919
  if show_services:
1840
1920
  service_status_request_id = services_request_future.result()
1921
+ if show_pools:
1922
+ pool_status_request_id = pools_request_future.result()
1841
1923
  if not (ip or show_endpoints):
1842
1924
  workspace_request_id = workspace_request_future.result()
1843
1925
 
1844
- managed_jobs_queue_request_id = '' if not managed_jobs_queue_request_id \
1845
- else managed_jobs_queue_request_id
1846
- service_status_request_id = '' if not service_status_request_id \
1847
- else service_status_request_id
1926
+ managed_jobs_queue_request_id = ('' if not managed_jobs_queue_request_id
1927
+ else managed_jobs_queue_request_id)
1928
+ service_status_request_id = ('' if not service_status_request_id else
1929
+ service_status_request_id)
1930
+ pool_status_request_id = ('' if not pool_status_request_id else
1931
+ pool_status_request_id)
1848
1932
 
1849
1933
  # Phase 3: Get cluster records and handle special cases
1850
1934
  cluster_records = _get_cluster_records_and_set_ssh_config(
@@ -1919,7 +2003,34 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1919
2003
  job_info += '. '
1920
2004
  hints.append(
1921
2005
  controller_utils.Controllers.JOBS_CONTROLLER.value.
1922
- in_progress_hint.format(job_info=job_info))
2006
+ in_progress_hint(False).format(job_info=job_info))
2007
+
2008
+ if show_pools:
2009
+ num_pools = None
2010
+ if managed_jobs_query_interrupted:
2011
+ msg = 'KeyboardInterrupt'
2012
+ else:
2013
+ with rich_utils.client_status('[cyan]Checking pools[/]'):
2014
+ try:
2015
+ num_pools, msg = _handle_services_request(
2016
+ pool_status_request_id,
2017
+ service_names=None,
2018
+ show_all=False,
2019
+ show_endpoint=False,
2020
+ pool=True,
2021
+ is_called_by_user=False)
2022
+ except KeyboardInterrupt:
2023
+ sdk.api_cancel(pool_status_request_id, silent=True)
2024
+ num_pools = -1
2025
+ msg = 'KeyboardInterrupt'
2026
+ if num_pools is not None:
2027
+ if num_pools > 0:
2028
+ click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
2029
+ f'Pools{colorama.Style.RESET_ALL}')
2030
+ click.echo(msg)
2031
+ hints.append(
2032
+ controller_utils.Controllers.SKY_SERVE_CONTROLLER.value.
2033
+ in_progress_hint(True))
1923
2034
 
1924
2035
  if show_services:
1925
2036
  click.echo(f'\n{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
@@ -1942,8 +2053,9 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
1942
2053
  msg = 'KeyboardInterrupt'
1943
2054
  click.echo(msg)
1944
2055
  if num_services is not None:
1945
- hints.append(controller_utils.Controllers.SKY_SERVE_CONTROLLER.
1946
- value.in_progress_hint)
2056
+ hints.append(
2057
+ controller_utils.Controllers.SKY_SERVE_CONTROLLER.value.
2058
+ in_progress_hint(False))
1947
2059
 
1948
2060
  if num_pending_autostop > 0 and not refresh:
1949
2061
  # Don't print this hint if there's no pending autostop or user has
@@ -4193,6 +4305,17 @@ def jobs():
4193
4305
  is_flag=True,
4194
4306
  help=('If True, as soon as a job is submitted, return from this call '
4195
4307
  'and do not stream execution logs.'))
4308
+ @click.option('--pool',
4309
+ '-p',
4310
+ default=None,
4311
+ type=str,
4312
+ required=False,
4313
+ help='(Experimental; optional) Pool to use for jobs submission.')
4314
+ @click.option('--num-jobs',
4315
+ default=None,
4316
+ type=int,
4317
+ required=False,
4318
+ help='Number of jobs to submit.')
4196
4319
  @click.option('--git-url', type=str, help='Git repository URL.')
4197
4320
  @click.option('--git-ref',
4198
4321
  type=str,
@@ -4226,6 +4349,8 @@ def jobs_launch(
4226
4349
  ports: Tuple[str],
4227
4350
  detach_run: bool,
4228
4351
  yes: bool,
4352
+ pool: Optional[str], # pylint: disable=redefined-outer-name
4353
+ num_jobs: Optional[int],
4229
4354
  async_call: bool,
4230
4355
  config_override: Optional[Dict[str, Any]] = None,
4231
4356
  git_url: Optional[str] = None,
@@ -4245,6 +4370,9 @@ def jobs_launch(
4245
4370
 
4246
4371
  sky jobs launch 'echo hello!'
4247
4372
  """
4373
+ if pool is None and num_jobs is not None:
4374
+ raise click.UsageError('Cannot specify --num-jobs without --pool.')
4375
+
4248
4376
  if cluster is not None:
4249
4377
  if name is not None and name != cluster:
4250
4378
  raise click.UsageError('Cannot specify both --name and --cluster. '
@@ -4295,22 +4423,63 @@ def jobs_launch(
4295
4423
 
4296
4424
  common_utils.check_cluster_name_is_valid(name)
4297
4425
 
4426
+ if pool is not None:
4427
+ num_job_int = num_jobs if num_jobs is not None else 1
4428
+ plural = '' if num_job_int == 1 else 's'
4429
+ click.secho(f'Submitting to pool {colorama.Fore.CYAN}{pool!r}'
4430
+ f'{colorama.Style.RESET_ALL} with {colorama.Fore.CYAN}'
4431
+ f'{num_job_int}{colorama.Style.RESET_ALL} job{plural}.')
4432
+ print_setup_fm_warning = False
4433
+ for task_ in dag.tasks:
4434
+ if (task_.setup is not None or task_.file_mounts or
4435
+ task_.storage_mounts):
4436
+ print_setup_fm_warning = True
4437
+ break
4438
+ if print_setup_fm_warning:
4439
+ click.secho(
4440
+ f'{colorama.Fore.YELLOW}setup/file_mounts/storage_mounts'
4441
+ ' will be ignored in pool. To update a pool, please '
4442
+ f'use `sky pool apply {pool} pool.yaml`. '
4443
+ f'{colorama.Style.RESET_ALL}')
4444
+
4298
4445
  # Optimize info is only show if _need_confirmation.
4299
4446
  if not yes:
4300
4447
  click.secho(
4301
4448
  f'Managed job {dag.name!r} will be launched on (estimated):',
4302
4449
  fg='yellow')
4303
4450
 
4304
- request_id = managed_jobs.launch(dag, name, _need_confirmation=not yes)
4451
+ request_id = managed_jobs.launch(dag,
4452
+ name,
4453
+ pool,
4454
+ num_jobs,
4455
+ _need_confirmation=not yes)
4305
4456
  job_id_handle = _async_call_or_wait(request_id, async_call,
4306
4457
  'sky.jobs.launch')
4458
+
4307
4459
  if not async_call and not detach_run:
4308
- job_id = job_id_handle[0]
4309
- returncode = managed_jobs.tail_logs(name=None,
4310
- job_id=job_id,
4311
- follow=True,
4312
- controller=False)
4313
- sys.exit(returncode)
4460
+ job_ids = job_id_handle[0]
4461
+ if isinstance(job_ids, int) or len(job_ids) == 1:
4462
+ job_id = job_ids if isinstance(job_ids, int) else job_ids[0]
4463
+ returncode = managed_jobs.tail_logs(name=None,
4464
+ job_id=job_id,
4465
+ follow=True,
4466
+ controller=False)
4467
+ sys.exit(returncode)
4468
+ else:
4469
+ job_ids_str = _format_job_ids_str(job_ids)
4470
+ click.secho(
4471
+ f'Jobs submitted with IDs: {colorama.Fore.CYAN}'
4472
+ f'{job_ids_str}{colorama.Style.RESET_ALL}.'
4473
+ f'\n📋 Useful Commands'
4474
+ f'\n{ux_utils.INDENT_SYMBOL}To stream job logs:\t\t\t'
4475
+ f'{ux_utils.BOLD}sky jobs logs <job-id>'
4476
+ f'{ux_utils.RESET_BOLD}'
4477
+ f'\n{ux_utils.INDENT_SYMBOL}To stream controller logs:\t\t'
4478
+ f'{ux_utils.BOLD}sky jobs logs --controller <job-id>'
4479
+ f'{ux_utils.RESET_BOLD}'
4480
+ f'\n{ux_utils.INDENT_LAST_SYMBOL}To cancel all jobs on the '
4481
+ f'pool:\t{ux_utils.BOLD}sky jobs cancel --pool {pool}'
4482
+ f'{ux_utils.RESET_BOLD}')
4314
4483
 
4315
4484
 
4316
4485
  @jobs.command('queue', cls=_DocumentedCodeCommand)
@@ -4420,14 +4589,25 @@ def jobs_queue(verbose: bool, refresh: bool, skip_finished: bool,
4420
4589
  required=False,
4421
4590
  type=str,
4422
4591
  help='Managed job name to cancel.')
4592
+ @click.option('--pool',
4593
+ '-p',
4594
+ required=False,
4595
+ type=str,
4596
+ help='Pool name to cancel.')
4423
4597
  @click.argument('job_ids', default=None, type=int, required=False, nargs=-1)
4424
4598
  @flags.all_option('Cancel all managed jobs for the current user.')
4425
4599
  @flags.yes_option()
4426
4600
  @flags.all_users_option('Cancel all managed jobs from all users.')
4427
4601
  @usage_lib.entrypoint
4428
4602
  # pylint: disable=redefined-builtin
4429
- def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
4430
- all_users: bool):
4603
+ def jobs_cancel(
4604
+ name: Optional[str],
4605
+ pool: Optional[str], # pylint: disable=redefined-outer-name
4606
+ job_ids: Tuple[int],
4607
+ all: bool,
4608
+ yes: bool,
4609
+ all_users: bool,
4610
+ ):
4431
4611
  """Cancel managed jobs.
4432
4612
 
4433
4613
  You can provide either a job name or a list of job IDs to be cancelled.
@@ -4442,22 +4622,29 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
4442
4622
  \b
4443
4623
  # Cancel managed jobs with IDs 1, 2, 3
4444
4624
  $ sky jobs cancel 1 2 3
4625
+ \b
4626
+ # Cancel all managed jobs in pool 'my-pool'
4627
+ $ sky jobs cancel -p my-pool
4445
4628
  """
4446
4629
  job_id_str = ','.join(map(str, job_ids))
4447
- if sum([bool(job_ids), name is not None, all or all_users]) != 1:
4630
+ if sum([
4631
+ bool(job_ids), name is not None, pool is not None, all or all_users
4632
+ ]) != 1:
4448
4633
  arguments = []
4449
4634
  arguments += [f'--job-ids {job_id_str}'] if job_ids else []
4450
4635
  arguments += [f'--name {name}'] if name is not None else []
4636
+ arguments += [f'--pool {pool}'] if pool is not None else []
4451
4637
  arguments += ['--all'] if all else []
4452
4638
  arguments += ['--all-users'] if all_users else []
4453
4639
  raise click.UsageError(
4454
- 'Can only specify one of JOB_IDS, --name, or --all/--all-users. '
4455
- f'Provided {" ".join(arguments)!r}.')
4640
+ 'Can only specify one of JOB_IDS, --name, --pool, or '
4641
+ f'--all/--all-users. Provided {" ".join(arguments)!r}.')
4456
4642
 
4457
4643
  if not yes:
4458
4644
  plural = 's' if len(job_ids) > 1 else ''
4459
4645
  job_identity_str = (f'managed job{plural} with ID{plural} {job_id_str}'
4460
- if job_ids else repr(name))
4646
+ if job_ids else f'{name!r}' if name is not None else
4647
+ f'managed jobs in pool {pool!r}')
4461
4648
  if all_users:
4462
4649
  job_identity_str = 'all managed jobs FOR ALL USERS'
4463
4650
  elif all:
@@ -4470,6 +4657,7 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
4470
4657
  sdk.stream_and_get(
4471
4658
  managed_jobs.cancel(job_ids=job_ids,
4472
4659
  name=name,
4660
+ pool=pool,
4473
4661
  all=all,
4474
4662
  all_users=all_users))
4475
4663
 
@@ -4547,24 +4735,47 @@ def jobs_dashboard():
4547
4735
  sdk.dashboard(starting_page='jobs')
4548
4736
 
4549
4737
 
4550
- @cli.command(cls=_DocumentedCodeCommand)
4551
- @flags.config_option(expose_value=False)
4552
- @usage_lib.entrypoint
4553
- def dashboard() -> None:
4554
- """Starts the dashboard for skypilot."""
4555
- sdk.dashboard()
4556
-
4557
-
4558
- @cli.group(cls=_NaturalOrderGroup)
4559
- def serve():
4560
- """SkyServe CLI (multi-region, multi-cloud serving)."""
4738
+ @jobs.group(cls=_NaturalOrderGroup)
4739
+ def pool():
4740
+ """(Experimental) Pool management commands."""
4561
4741
  pass
4562
4742
 
4563
4743
 
4564
- def _generate_task_with_service(
4565
- service_name: str,
4566
- service_yaml_args: Tuple[str, ...],
4744
+ # TODO(MaoZiming): Update Doc.
4745
+ # TODO(MaoZiming): Expose mix replica traffic option to user.
4746
+ # Currently, we do not mix traffic from old and new replicas.
4747
+ @pool.command('apply', cls=_DocumentedCodeCommand)
4748
+ @flags.config_option(expose_value=False)
4749
+ @click.argument('pool_yaml',
4750
+ required=True,
4751
+ type=str,
4752
+ nargs=-1,
4753
+ **_get_shell_complete_args(_complete_file_name))
4754
+ @click.option('--pool-name',
4755
+ '-p',
4756
+ default=None,
4757
+ type=str,
4758
+ help='A pool name. Unique for each pool. If not provided, '
4759
+ 'a unique name is autogenerated.')
4760
+ @click.option('--mode',
4761
+ default=serve_lib.DEFAULT_UPDATE_MODE.value,
4762
+ type=click.Choice([m.value for m in serve_lib.UpdateMode],
4763
+ case_sensitive=False),
4764
+ required=False,
4765
+ help=('Update mode. If "rolling", cluster pool will be updated '
4766
+ 'with rolling update. If "blue_green", cluster pool will '
4767
+ 'be updated with blue-green update. This option is only '
4768
+ 'valid when the pool is already running.'))
4769
+ @_add_click_options(flags.TASK_OPTIONS + flags.EXTRA_RESOURCES_OPTIONS +
4770
+ flags.COMMON_OPTIONS)
4771
+ @flags.yes_option()
4772
+ @timeline.event
4773
+ @usage_lib.entrypoint
4774
+ def jobs_pool_apply(
4775
+ pool_yaml: Tuple[str, ...],
4776
+ pool_name: Optional[str],
4567
4777
  workdir: Optional[str],
4778
+ infra: Optional[str],
4568
4779
  cloud: Optional[str],
4569
4780
  region: Optional[str],
4570
4781
  zone: Optional[str],
@@ -4573,21 +4784,196 @@ def _generate_task_with_service(
4573
4784
  image_id: Optional[str],
4574
4785
  env_file: Optional[Dict[str, str]],
4575
4786
  env: List[Tuple[str, str]],
4576
- secret: Optional[List[Tuple[str, str]]],
4787
+ secret: List[Tuple[str, str]],
4577
4788
  gpus: Optional[str],
4578
4789
  instance_type: Optional[str],
4579
- ports: Optional[Tuple[str]],
4790
+ ports: Tuple[str],
4580
4791
  cpus: Optional[str],
4581
4792
  memory: Optional[str],
4582
4793
  disk_size: Optional[int],
4583
4794
  disk_tier: Optional[str],
4584
4795
  network_tier: Optional[str],
4585
- not_supported_cmd: str,
4796
+ mode: str,
4797
+ yes: bool,
4798
+ async_call: bool,
4799
+ ):
4800
+ """Apply a config to a cluster pool for managed jobs submission.
4801
+
4802
+ If the pool is already running, the config will be applied to the pool.
4803
+ Otherwise, a new pool will be created.
4804
+
4805
+ POOL_YAML must point to a valid YAML file.
4806
+ """
4807
+ cloud, region, zone = _handle_infra_cloud_region_zone_options(
4808
+ infra, cloud, region, zone)
4809
+ if pool_name is None:
4810
+ pool_name = serve_lib.generate_service_name(pool=True)
4811
+
4812
+ task = _generate_task_with_service(
4813
+ service_name=pool_name,
4814
+ service_yaml_args=pool_yaml,
4815
+ workdir=workdir,
4816
+ cloud=cloud,
4817
+ region=region,
4818
+ zone=zone,
4819
+ gpus=gpus,
4820
+ cpus=cpus,
4821
+ memory=memory,
4822
+ instance_type=instance_type,
4823
+ num_nodes=num_nodes,
4824
+ use_spot=use_spot,
4825
+ image_id=image_id,
4826
+ env_file=env_file,
4827
+ env=env,
4828
+ secret=secret,
4829
+ disk_size=disk_size,
4830
+ disk_tier=disk_tier,
4831
+ network_tier=network_tier,
4832
+ ports=ports,
4833
+ not_supported_cmd='sky jobs pool up',
4834
+ pool=True,
4835
+ )
4836
+ assert task.service is not None
4837
+ if not task.service.pool:
4838
+ raise click.UsageError('The YAML file needs a `pool` section.')
4839
+ click.secho('Pool spec:', fg='cyan')
4840
+ click.echo(task.service)
4841
+ serve_lib.validate_service_task(task, pool=True)
4842
+
4843
+ click.secho(
4844
+ 'Each pool worker will use the following resources (estimated):',
4845
+ fg='cyan')
4846
+ with sky.Dag() as dag:
4847
+ dag.add(task)
4848
+
4849
+ request_id = managed_jobs.pool_apply(task,
4850
+ pool_name,
4851
+ mode=serve_lib.UpdateMode(mode),
4852
+ _need_confirmation=not yes)
4853
+ _async_call_or_wait(request_id, async_call, 'sky.jobs.pool_apply')
4854
+
4855
+
4856
+ @pool.command('status', cls=_DocumentedCodeCommand)
4857
+ @flags.config_option(expose_value=False)
4858
+ @flags.verbose_option()
4859
+ @click.argument('pool_names', required=False, type=str, nargs=-1)
4860
+ @usage_lib.entrypoint
4861
+ # pylint: disable=redefined-builtin
4862
+ def jobs_pool_status(verbose: bool, pool_names: List[str]):
4863
+ """Show statuses of cluster pools.
4864
+
4865
+ Show detailed statuses of one or more pools. If POOL_NAME is not
4866
+ provided, show all pools' status.
4867
+ """
4868
+ pool_names_to_query: Optional[List[str]] = pool_names
4869
+ if not pool_names:
4870
+ pool_names_to_query = None
4871
+ with rich_utils.client_status('[cyan]Checking pools[/]'):
4872
+ pool_status_request_id = managed_jobs.pool_status(pool_names_to_query)
4873
+ _, msg = _handle_services_request(pool_status_request_id,
4874
+ service_names=pool_names_to_query,
4875
+ show_all=verbose,
4876
+ show_endpoint=False,
4877
+ pool=True,
4878
+ is_called_by_user=True)
4879
+
4880
+ click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
4881
+ f'Pools{colorama.Style.RESET_ALL}')
4882
+ click.echo(msg)
4883
+
4884
+
4885
+ @pool.command('down', cls=_DocumentedCodeCommand)
4886
+ @flags.config_option(expose_value=False)
4887
+ @click.argument('pool_names', required=False, type=str, nargs=-1)
4888
+ @flags.all_option('Delete all pools.')
4889
+ @click.option('--purge',
4890
+ '-p',
4891
+ default=False,
4892
+ is_flag=True,
4893
+ help='Tear down pools in failed status.')
4894
+ @flags.yes_option()
4895
+ @_add_click_options(flags.COMMON_OPTIONS)
4896
+ @usage_lib.entrypoint
4897
+ # pylint: disable=redefined-builtin
4898
+ def jobs_pool_down(
4899
+ pool_names: List[str],
4900
+ all: bool,
4901
+ purge: bool,
4902
+ yes: bool,
4903
+ async_call: bool,
4904
+ ) -> None:
4905
+ """Delete pool(s).
4906
+
4907
+ POOL_NAMES is the name of the pool (or glob pattern) to delete. If
4908
+ both POOL_NAMES and ``--all`` are supplied, the latter takes precedence.
4909
+
4910
+ Deleting a pool will delete all of its workers and associated resources.
4911
+ """
4912
+ if sum([bool(pool_names), all]) != 1:
4913
+ argument_str = (f'POOL_NAMES={",".join(pool_names)}'
4914
+ if pool_names else '')
4915
+ argument_str += ' --all' if all else ''
4916
+ raise click.UsageError('Can only specify one of POOL_NAMES or --all. '
4917
+ f'Provided {argument_str!r}.')
4918
+
4919
+ if not yes:
4920
+ quoted_pool_names = [f'{name!r}' for name in pool_names]
4921
+ list_pool_str = ', '.join(quoted_pool_names)
4922
+ pool_identity_str = f'pool(s) {list_pool_str}'
4923
+ if all:
4924
+ pool_identity_str = 'all pools'
4925
+ click.confirm(f'Terminating {pool_identity_str}. Proceed?',
4926
+ default=True,
4927
+ abort=True,
4928
+ show_default=True)
4929
+
4930
+ request_id = managed_jobs.pool_down(pool_names, all=all, purge=purge)
4931
+ _async_call_or_wait(request_id, async_call, 'sky.jobs.pool_down')
4932
+
4933
+
4934
+ @cli.command(cls=_DocumentedCodeCommand)
4935
+ @flags.config_option(expose_value=False)
4936
+ @usage_lib.entrypoint
4937
+ def dashboard() -> None:
4938
+ """Starts the dashboard for skypilot."""
4939
+ sdk.dashboard()
4940
+
4941
+
4942
+ @cli.group(cls=_NaturalOrderGroup)
4943
+ def serve():
4944
+ """SkyServe CLI (multi-region, multi-cloud serving)."""
4945
+ pass
4946
+
4947
+
4948
+ def _generate_task_with_service(
4949
+ service_name: str,
4950
+ service_yaml_args: Tuple[str, ...],
4951
+ workdir: Optional[str],
4952
+ cloud: Optional[str],
4953
+ region: Optional[str],
4954
+ zone: Optional[str],
4955
+ num_nodes: Optional[int],
4956
+ use_spot: Optional[bool],
4957
+ image_id: Optional[str],
4958
+ env_file: Optional[Dict[str, str]],
4959
+ env: List[Tuple[str, str]],
4960
+ secret: Optional[List[Tuple[str, str]]],
4961
+ gpus: Optional[str],
4962
+ instance_type: Optional[str],
4963
+ ports: Optional[Tuple[str]],
4964
+ cpus: Optional[str],
4965
+ memory: Optional[str],
4966
+ disk_size: Optional[int],
4967
+ disk_tier: Optional[str],
4968
+ network_tier: Optional[str],
4969
+ not_supported_cmd: str,
4970
+ pool: bool, # pylint: disable=redefined-outer-name
4586
4971
  ) -> sky.Task:
4587
4972
  """Generate a task with service section from a service YAML file."""
4588
4973
  is_yaml, _ = _check_yaml(''.join(service_yaml_args))
4974
+ yaml_name = 'SERVICE_YAML' if not pool else 'POOL_YAML'
4589
4975
  if not is_yaml:
4590
- raise click.UsageError('SERVICE_YAML must be a valid YAML file.')
4976
+ raise click.UsageError(f'{yaml_name} must be a valid YAML file.')
4591
4977
  env = _merge_env_vars(env_file, env)
4592
4978
  # We keep nargs=-1 in service_yaml argument to reuse this function.
4593
4979
  task = _make_task_or_dag_from_entrypoint_with_overrides(
@@ -4617,9 +5003,17 @@ def _generate_task_with_service(
4617
5003
  _DAG_NOT_SUPPORTED_MESSAGE.format(command=not_supported_cmd))
4618
5004
 
4619
5005
  if task.service is None:
5006
+ field_name = 'service' if not pool else 'pool'
4620
5007
  with ux_utils.print_exception_no_traceback():
4621
- raise ValueError('Service section not found in the YAML file. '
4622
- 'To fix, add a valid `service` field.')
5008
+ raise ValueError(f'{field_name.capitalize()} section not found '
5009
+ 'in the YAML file. To fix, add a valid '
5010
+ f'`{field_name}` field.')
5011
+
5012
+ if task.service.pool:
5013
+ if task.service.ports is not None or ports:
5014
+ with ux_utils.print_exception_no_traceback():
5015
+ raise ValueError('Cannot specify ports in a cluster pool.')
5016
+ return task
4623
5017
 
4624
5018
  # NOTE(yi): we only allow one service port now.
4625
5019
  service_port: Optional[int] = int(
@@ -4779,10 +5173,14 @@ def serve_up(
4779
5173
  network_tier=network_tier,
4780
5174
  ports=ports,
4781
5175
  not_supported_cmd='sky serve up',
5176
+ pool=False,
4782
5177
  )
5178
+ assert task.service is not None
5179
+ if task.service.pool:
5180
+ raise click.UsageError('The YAML file needs a `service` section.')
4783
5181
  click.secho('Service spec:', fg='cyan')
4784
5182
  click.echo(task.service)
4785
- serve_lib.validate_service_task(task)
5183
+ serve_lib.validate_service_task(task, pool=False)
4786
5184
 
4787
5185
  click.secho('Each replica will use the following resources (estimated):',
4788
5186
  fg='cyan')
@@ -4881,10 +5279,11 @@ def serve_update(
4881
5279
  network_tier=network_tier,
4882
5280
  ports=ports,
4883
5281
  not_supported_cmd='sky serve update',
5282
+ pool=False,
4884
5283
  )
4885
5284
  click.secho('Service spec:', fg='cyan')
4886
5285
  click.echo(task.service)
4887
- serve_lib.validate_service_task(task)
5286
+ serve_lib.validate_service_task(task, pool=False)
4888
5287
 
4889
5288
  click.secho('New replica will use the following resources (estimated):',
4890
5289
  fg='cyan')