skypilot-nightly 1.0.0.dev20250522__py3-none-any.whl → 1.0.0.dev20250524__py3-none-any.whl

This diff compares the contents of two publicly released package versions as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
Files changed (120)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +46 -16
  3. sky/backends/backend_utils.py +62 -45
  4. sky/backends/cloud_vm_ray_backend.py +19 -5
  5. sky/check.py +398 -171
  6. sky/cli.py +302 -98
  7. sky/client/cli.py +302 -98
  8. sky/client/sdk.py +104 -12
  9. sky/clouds/__init__.py +3 -0
  10. sky/clouds/aws.py +4 -2
  11. sky/clouds/azure.py +4 -2
  12. sky/clouds/cloud.py +24 -6
  13. sky/clouds/cudo.py +2 -1
  14. sky/clouds/do.py +2 -1
  15. sky/clouds/fluidstack.py +2 -1
  16. sky/clouds/gcp.py +23 -5
  17. sky/clouds/ibm.py +4 -2
  18. sky/clouds/kubernetes.py +66 -22
  19. sky/clouds/lambda_cloud.py +2 -1
  20. sky/clouds/nebius.py +18 -2
  21. sky/clouds/oci.py +4 -2
  22. sky/clouds/paperspace.py +2 -1
  23. sky/clouds/runpod.py +2 -1
  24. sky/clouds/scp.py +2 -1
  25. sky/clouds/service_catalog/constants.py +1 -1
  26. sky/clouds/service_catalog/ssh_catalog.py +167 -0
  27. sky/clouds/ssh.py +203 -0
  28. sky/clouds/vast.py +2 -1
  29. sky/clouds/vsphere.py +2 -1
  30. sky/core.py +58 -11
  31. sky/dashboard/out/404.html +1 -1
  32. sky/dashboard/out/_next/static/aHej19bZyl4hoHgrzPCn7/_buildManifest.js +1 -0
  33. sky/dashboard/out/_next/static/chunks/480-ee58038f1a4afd5c.js +1 -0
  34. sky/dashboard/out/_next/static/chunks/488-50d843fdb5396d32.js +15 -0
  35. sky/dashboard/out/_next/static/chunks/498-d7722313e5e5b4e6.js +21 -0
  36. sky/dashboard/out/_next/static/chunks/573-f17bd89d9f9118b3.js +66 -0
  37. sky/dashboard/out/_next/static/chunks/578-7a4795009a56430c.js +6 -0
  38. sky/dashboard/out/_next/static/chunks/734-5f5ce8f347b7f417.js +1 -0
  39. sky/dashboard/out/_next/static/chunks/937.f97f83652028e944.js +1 -0
  40. sky/dashboard/out/_next/static/chunks/938-f347f6144075b0c8.js +1 -0
  41. sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +1 -0
  42. sky/dashboard/out/_next/static/chunks/pages/_app-dec800f9ef1b10f4.js +1 -0
  43. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-37c042a356f8e608.js +1 -0
  44. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js +16 -0
  45. sky/dashboard/out/_next/static/chunks/pages/clusters-9e6d1ec6e1ac5b29.js +1 -0
  46. sky/dashboard/out/_next/static/chunks/pages/infra-e690d864aa00e2ea.js +1 -0
  47. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-db6558a5ec687011.js +1 -0
  48. sky/dashboard/out/_next/static/chunks/pages/jobs-73d5e0c369d00346.js +16 -0
  49. sky/dashboard/out/_next/static/chunks/pages/users-2d319455c3f1c3e2.js +1 -0
  50. sky/dashboard/out/_next/static/chunks/pages/workspaces-02a7b60f2ead275f.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +1 -0
  52. sky/dashboard/out/_next/static/css/d2cdba64c9202dd7.css +3 -0
  53. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  54. sky/dashboard/out/clusters/[cluster].html +1 -1
  55. sky/dashboard/out/clusters.html +1 -1
  56. sky/dashboard/out/index.html +1 -1
  57. sky/dashboard/out/infra.html +1 -1
  58. sky/dashboard/out/jobs/[job].html +1 -1
  59. sky/dashboard/out/jobs.html +1 -1
  60. sky/dashboard/out/users.html +1 -0
  61. sky/dashboard/out/workspaces.html +1 -0
  62. sky/data/storage.py +1 -1
  63. sky/global_user_state.py +42 -19
  64. sky/jobs/constants.py +1 -1
  65. sky/jobs/server/core.py +72 -56
  66. sky/jobs/state.py +26 -5
  67. sky/jobs/utils.py +65 -13
  68. sky/optimizer.py +29 -7
  69. sky/provision/__init__.py +1 -0
  70. sky/provision/aws/instance.py +17 -1
  71. sky/provision/fluidstack/instance.py +1 -0
  72. sky/provision/kubernetes/instance.py +16 -5
  73. sky/provision/kubernetes/utils.py +37 -19
  74. sky/provision/nebius/instance.py +3 -1
  75. sky/provision/nebius/utils.py +14 -2
  76. sky/provision/ssh/__init__.py +18 -0
  77. sky/resources.py +4 -1
  78. sky/serve/server/core.py +9 -6
  79. sky/server/html/token_page.html +6 -1
  80. sky/server/requests/executor.py +1 -0
  81. sky/server/requests/payloads.py +18 -0
  82. sky/server/server.py +108 -5
  83. sky/setup_files/dependencies.py +1 -0
  84. sky/skylet/constants.py +4 -1
  85. sky/skypilot_config.py +83 -9
  86. sky/templates/nebius-ray.yml.j2 +12 -0
  87. sky/utils/cli_utils/status_utils.py +18 -8
  88. sky/utils/infra_utils.py +21 -1
  89. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  90. sky/utils/kubernetes/create_cluster.sh +1 -0
  91. sky/utils/kubernetes/deploy_remote_cluster.py +1440 -0
  92. sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
  93. sky/utils/kubernetes/ssh-tunnel.sh +387 -0
  94. sky/utils/log_utils.py +218 -1
  95. sky/utils/schemas.py +75 -0
  96. sky/utils/ux_utils.py +2 -1
  97. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/METADATA +6 -1
  98. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/RECORD +103 -91
  99. sky/dashboard/out/_next/static/CzOVV6JpRQBRt5GhZuhyK/_buildManifest.js +0 -1
  100. sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +0 -6
  101. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  102. sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +0 -6
  103. sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +0 -1
  104. sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +0 -1
  105. sky/dashboard/out/_next/static/chunks/582-683f4f27b81996dc.js +0 -59
  106. sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +0 -1
  107. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +0 -1
  108. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +0 -1
  109. sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +0 -1
  110. sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
  111. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +0 -1
  112. sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +0 -1
  113. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  114. sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +0 -3
  115. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  116. sky/dashboard/out/_next/static/{CzOVV6JpRQBRt5GhZuhyK → aHej19bZyl4hoHgrzPCn7}/_ssgManifest.js +0 -0
  117. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/WHEEL +0 -0
  118. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/entry_points.txt +0 -0
  119. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/licenses/LICENSE +0 -0
  120. {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/top_level.txt +0 -0
sky/cli.py CHANGED
@@ -212,6 +212,7 @@ def _get_glob_storages(storages: List[str]) -> List[str]:
     """Returns a list of storages that match the glob pattern."""
     glob_storages = []
     for storage_object in storages:
+        # TODO(zhwu): client side should not rely on global_user_state.
         glob_storage = global_user_state.get_glob_storage_name(storage_object)
         if not glob_storage:
             click.echo(f'Storage {storage_object} not found.')
@@ -1780,6 +1781,31 @@ def _show_endpoint(query_clusters: Optional[List[str]],
     return
 
 
+def _show_enabled_infra(active_workspace: str, show_workspace: bool):
+    """Show the enabled infrastructure."""
+    workspace_str = ''
+    if show_workspace:
+        workspace_str = f' (workspace: {active_workspace!r})'
+    title = (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Enabled Infra'
+             f'{workspace_str}:'
+             f'{colorama.Style.RESET_ALL} ')
+    enabled_clouds = sdk.get(sdk.enabled_clouds())
+    enabled_ssh_infras = []
+    enabled_k8s_infras = []
+    enabled_cloud_infras = []
+    for cloud in enabled_clouds:
+        cloud_infra = cloud.get_infras()
+        if isinstance(cloud, clouds.SSH):
+            enabled_ssh_infras.extend(cloud_infra)
+        elif isinstance(cloud, clouds.Kubernetes):
+            enabled_k8s_infras.extend(cloud_infra)
+        else:
+            enabled_cloud_infras.extend(cloud_infra)
+    all_infras = sorted(enabled_ssh_infras) + sorted(
+        enabled_k8s_infras) + sorted(enabled_cloud_infras)
+    click.echo(f'{title}{", ".join(all_infras)}\n')
+
+
 @cli.command()
 @config_option(expose_value=False)
 @click.option('--verbose',
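The new helper orders the display by grouping infra: SSH Node Pools first, then Kubernetes contexts, then clouds, with each group sorted. A minimal standalone sketch of that ordering (the infra names below are illustrative placeholders, not taken from this diff):

    # Sketch of the grouping/ordering used by _show_enabled_infra above.
    ssh_infras = ['ssh/my-pool']            # placeholder SSH Node Pool
    k8s_infras = ['k8s/gke-ctx']            # placeholder Kubernetes context
    cloud_infras = ['gcp', 'aws']           # placeholder clouds
    print(', '.join(
        sorted(ssh_infras) + sorted(k8s_infras) + sorted(cloud_infras)))
    # Output: ssh/my-pool, k8s/gke-ctx, aws, gcp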
@@ -1932,6 +1958,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
     # status query.
     service_status_request_id = serve_lib.status(service_names=None)
 
+    workspace_request_id = None
     if ip or show_endpoints:
         if refresh:
             raise click.UsageError(
@@ -1966,8 +1993,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
                                  ('endpoint port'
                                   if show_single_endpoint else 'endpoints')))
     else:
-        click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters'
-                   f'{colorama.Style.RESET_ALL}')
+        workspace_request_id = sdk.workspaces()
+
     query_clusters: Optional[List[str]] = None if not clusters else clusters
     refresh_mode = common.StatusRefreshMode.NONE
     if refresh:
@@ -1990,9 +2017,20 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
         else:
             normal_clusters.append(cluster_record)
 
+    if workspace_request_id is not None:
+        all_workspaces = sdk.get(workspace_request_id)
+    else:
+        all_workspaces = [constants.SKYPILOT_DEFAULT_WORKSPACE]
+    active_workspace = skypilot_config.get_active_workspace()
+    show_workspace = len(all_workspaces) > 1
+    _show_enabled_infra(active_workspace, show_workspace)
+    click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters'
+               f'{colorama.Style.RESET_ALL}')
+
     num_pending_autostop = 0
     num_pending_autostop += status_utils.show_status_table(
-        normal_clusters + controllers, verbose, all_users, query_clusters)
+        normal_clusters + controllers, verbose, all_users, query_clusters,
+        show_workspace)
 
     managed_jobs_query_interrupted = False
     if show_managed_jobs:
@@ -3322,9 +3360,16 @@ def _down_or_stop_clusters(
               is_flag=True,
               default=False,
               help='Show the activated account for each cloud.')
+@click.option(
+    '--workspace',
+    '-w',
+    type=str,
+    help='The workspace to check. If None, all workspaces will be checked.')
 @usage_lib.entrypoint
 # pylint: disable=redefined-outer-name
-def check(infra_list: Tuple[str], verbose: bool):
+def check(infra_list: Tuple[str],
+          verbose: bool,
+          workspace: Optional[str] = None):
     """Check which clouds are available to use.
 
     This checks access credentials for all clouds supported by SkyPilot. If a
@@ -3347,7 +3392,9 @@ def check(infra_list: Tuple[str], verbose: bool):
       sky check aws gcp
     """
     infra_arg = infra_list if len(infra_list) > 0 else None
-    request_id = sdk.check(infra_list=infra_arg, verbose=verbose)
+    request_id = sdk.check(infra_list=infra_arg,
+                           verbose=verbose,
+                           workspace=workspace)
     sdk.stream_and_get(request_id)
     api_server_url = server_common.get_server_url()
     click.echo()
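The new workspace scoping can also be exercised directly through the SDK call the command wraps. A minimal sketch, assuming a workspace named 'team-a' exists (the name and import path are placeholders for illustration):

    from sky.client import sdk

    # Check credentials only for the 'team-a' workspace (placeholder name);
    # workspace=None checks all workspaces, matching the CLI default.
    request_id = sdk.check(infra_list=None, verbose=False, workspace='team-a')
    sdk.stream_and_get(request_id)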
@@ -3462,13 +3509,22 @@ def show_gpus(
 
     # Kubernetes specific bools
     enabled_clouds = sdk.get(sdk.enabled_clouds())
-    cloud_is_kubernetes = isinstance(cloud_obj, clouds.Kubernetes)
+    cloud_is_kubernetes = isinstance(
+        cloud_obj, clouds.Kubernetes) and not isinstance(cloud_obj, clouds.SSH)
+    cloud_is_ssh = isinstance(cloud_obj, clouds.SSH)
     # TODO(romilb): We should move this to the backend.
     kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None
-    kubernetes_is_enabled = clouds.cloud_in_iterable(
-        clouds.Kubernetes(),
-        enabled_clouds,
-    )
+    kubernetes_is_enabled = False
+    ssh_is_enabled = False
+    for cloud in enabled_clouds:
+        if isinstance(cloud, clouds.SSH):
+            ssh_is_enabled = True
+        elif isinstance(cloud, clouds.Kubernetes):
+            kubernetes_is_enabled = True
+    query_k8s_realtime_gpu = (kubernetes_is_enabled and
+                              (cloud_name is None or cloud_is_kubernetes))
+    query_ssh_realtime_gpu = (ssh_is_enabled and
+                              (cloud_name is None or cloud_is_ssh))
 
     def _list_to_str(lst):
         return ', '.join([str(e) for e in lst])
@@ -3478,7 +3534,8 @@ def show_gpus(
     def _get_kubernetes_realtime_gpu_tables(
         context: Optional[str] = None,
         name_filter: Optional[str] = None,
-        quantity_filter: Optional[int] = None
+        quantity_filter: Optional[int] = None,
+        is_ssh: bool = False,
     ) -> Tuple[List[Tuple[str, 'prettytable.PrettyTable']],
                Optional['prettytable.PrettyTable'], List[Tuple[
                    str, 'models.KubernetesNodesInfo']]]:
@@ -3491,19 +3548,26 @@ def show_gpus(
             sdk.realtime_kubernetes_gpu_availability(
                 context=context,
                 name_filter=name_filter,
-                quantity_filter=quantity_filter))
+                quantity_filter=quantity_filter,
+                is_ssh=is_ssh))
         if not realtime_gpu_availability_lists:
-            err_msg = 'No GPUs found in any allowed Kubernetes cluster. '
-            debug_msg = 'To further debug, run: sky check '
+            # Customize message based on context
+            identity = ('SSH Node Pool'
+                        if is_ssh else 'any allowed Kubernetes cluster')
+            cloud_name = 'ssh' if is_ssh else 'kubernetes'
+            err_msg = f'No GPUs found in {identity}. '
+            debug_msg = (f'To further debug, run: sky check {cloud_name}')
             if name_filter is not None:
                 gpu_info_msg = f' {name_filter!r}'
                 if quantity_filter is not None:
                     gpu_info_msg += (' with requested quantity'
                                      f' {quantity_filter}')
                 err_msg = (f'Resources{gpu_info_msg} not found '
-                           'in any allowed Kubernetes cluster. ')
-                debug_msg = ('To show available accelerators on kubernetes,'
-                             ' run: sky show-gpus --cloud kubernetes ')
+                           f'in {identity}. ')
+                identity_short = 'SSH Node Pool' if is_ssh else 'Kubernetes'
+                debug_msg = (
+                    f'To show available accelerators in {identity_short}, '
+                    f'run: sky show-gpus --cloud {cloud_name}')
             full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
                             debug_msg)
             raise ValueError(full_err_msg)
@@ -3513,6 +3577,14 @@ def show_gpus(
             lambda: [0, 0])
         all_nodes_info = []
 
+        # display an aggregated table for all contexts
+        # if there are more than one contexts with GPUs.
+        def _filter_ctx(ctx: str) -> bool:
+            ctx_is_ssh = ctx and ctx.startswith('ssh-')
+            return ctx_is_ssh is is_ssh
+
+        num_filtered_contexts = 0
+
         if realtime_gpu_availability_lists:
             if len(realtime_gpu_availability_lists[0]) != 2:
                 # TODO(kyuds): for backwards compatibility, as we add new
@@ -3522,6 +3594,13 @@ def show_gpus(
                     (context, realtime_gpu_availability_lists)
                 ]
             for (ctx, availability_list) in realtime_gpu_availability_lists:
+                if not _filter_ctx(ctx):
+                    continue
+                if is_ssh:
+                    display_ctx = ctx.lstrip('ssh-')
+                else:
+                    display_ctx = ctx
+                num_filtered_contexts += 1
                 realtime_gpu_table = log_utils.create_table(
                     ['GPU', qty_header, 'UTILIZATION'])
                 for realtime_gpu_availability in sorted(availability_list):
@@ -3542,15 +3621,12 @@ def show_gpus(
                     if capacity > 0:
                         total_gpu_info[gpu][0] += capacity
                         total_gpu_info[gpu][1] += available
-                realtime_gpu_infos.append((ctx, realtime_gpu_table))
+                realtime_gpu_infos.append((display_ctx, realtime_gpu_table))
                 # Collect node info for this context
                 nodes_info = sdk.stream_and_get(
                     sdk.kubernetes_node_info(context=ctx))
-                all_nodes_info.append((ctx, nodes_info))
-
-        # display an aggregated table for all contexts
-        # if there are more than one contexts with GPUs
-        if len(realtime_gpu_infos) > 1:
+                all_nodes_info.append((display_ctx, nodes_info))
+        if num_filtered_contexts > 1:
             total_realtime_gpu_table = log_utils.create_table(
                 ['GPU', 'UTILIZATION'])
             for gpu, stats in total_gpu_info.items():
@@ -3562,10 +3638,11 @@ def show_gpus(
         return realtime_gpu_infos, total_realtime_gpu_table, all_nodes_info
 
     def _format_kubernetes_node_info_combined(
-            contexts_info: List[Tuple[str,
-                                      'models.KubernetesNodesInfo']]) -> str:
+            contexts_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
+            cloud_str: str = 'Kubernetes',
+            context_title_str: str = 'CONTEXT') -> str:
         node_table = log_utils.create_table(
-            ['CONTEXT', 'NODE', 'GPU', 'UTILIZATION'])
+            [context_title_str, 'NODE', 'GPU', 'UTILIZATION'])
 
         no_permissions_str = '<no permissions>'
         hints = []
@@ -3588,7 +3665,7 @@ def show_gpus(
                 'free'
             ])
 
-        k8s_per_node_acc_message = ('Kubernetes per-node GPU availability')
+        k8s_per_node_acc_message = (f'{cloud_str} per-node GPU availability')
         if hints:
             k8s_per_node_acc_message += ' (' + '; '.join(hints) + ')'
 
@@ -3598,26 +3675,30 @@ def show_gpus(
                 f'{node_table.get_string()}')
 
     def _format_kubernetes_realtime_gpu(
-            total_table: 'prettytable.PrettyTable',
+            total_table: Optional['prettytable.PrettyTable'],
             k8s_realtime_infos: List[Tuple[str, 'prettytable.PrettyTable']],
             all_nodes_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
-            show_node_info: bool) -> Generator[str, None, None]:
+            show_node_info: bool, is_ssh: bool) -> Generator[str, None, None]:
+        identity = 'SSH Node Pool' if is_ssh else 'Kubernetes'
         yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
-               'Kubernetes GPUs'
+               f'{identity} GPUs'
                f'{colorama.Style.RESET_ALL}')
         # print total table
         if total_table is not None:
             yield '\n'
             yield from total_table.get_string()
 
+        ctx_name = 'SSH Node Pool' if is_ssh else 'Context'
+        ctx_column_title = 'NODE_POOL' if is_ssh else 'CONTEXT'
+
         # print individual infos.
         for (ctx, k8s_realtime_table) in k8s_realtime_infos:
             yield '\n'
             # Print context header separately
             if ctx:
-                context_str = f'Context: {ctx}'
+                context_str = f'{ctx_name}: {ctx}'
             else:
-                context_str = 'Default Context'
+                context_str = f'Default {ctx_name}'
             yield (
                 f'{colorama.Fore.CYAN}{context_str}{colorama.Style.RESET_ALL}\n'
             )
@@ -3625,7 +3706,102 @@ def show_gpus(
 
         if show_node_info:
             yield '\n'
-            yield _format_kubernetes_node_info_combined(all_nodes_info)
+            yield _format_kubernetes_node_info_combined(all_nodes_info,
+                                                        identity,
+                                                        ctx_column_title)
+
+    def _possibly_show_k8s_like_realtime(
+            is_ssh: bool = False
+    ) -> Generator[str, None, Tuple[bool, bool, str]]:
+        # If cloud is kubernetes, we want to show real-time capacity
+        k8s_messages = ''
+        print_section_titles = False
+        if (is_ssh and query_ssh_realtime_gpu or query_k8s_realtime_gpu):
+            context = region
+
+            try:
+                # If --cloud kubernetes is not specified, we want to catch
+                # the case where no GPUs are available on the cluster and
+                # print the warning at the end.
+                k8s_realtime_infos, total_table, all_nodes_info = (
+                    _get_kubernetes_realtime_gpu_tables(context, is_ssh=is_ssh))
+            except ValueError as e:
+                if not (cloud_is_kubernetes or cloud_is_ssh):
+                    # Make it a note if cloud is not kubernetes
+                    k8s_messages += 'Note: '
+                k8s_messages += str(e)
+            else:
+                print_section_titles = True
+
+                yield from _format_kubernetes_realtime_gpu(total_table,
+                                                           k8s_realtime_infos,
+                                                           all_nodes_info,
+                                                           show_node_info=True,
+                                                           is_ssh=is_ssh)
+
+            if kubernetes_autoscaling:
+                k8s_messages += ('\n' +
+                                 kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
+        if is_ssh:
+            if cloud_is_ssh:
+                if not ssh_is_enabled:
+                    yield ('SSH Node Pools are not enabled. To fix, run: '
+                           'sky check ssh ')
+                yield k8s_messages
+                return True, print_section_titles, ''
+        else:
+            if cloud_is_kubernetes:
+                if not kubernetes_is_enabled:
+                    yield ('Kubernetes is not enabled. To fix, run: '
+                           'sky check kubernetes ')
+                yield k8s_messages
+                return True, print_section_titles, ''
+        return False, print_section_titles, k8s_messages
+
+    def _possibly_show_k8s_like_realtime_for_acc(
+            name: Optional[str],
+            quantity: Optional[int],
+            is_ssh: bool = False) -> Generator[str, None, Tuple[bool, bool]]:
+        k8s_messages = ''
+        print_section_titles = False
+        if (is_ssh and query_ssh_realtime_gpu or
+                query_k8s_realtime_gpu) and not show_all:
+            print_section_titles = True
+            # TODO(romilb): Show filtered per node GPU availability here as well
+            try:
+                (k8s_realtime_infos, total_table,
+                 all_nodes_info) = _get_kubernetes_realtime_gpu_tables(
+                     context=region,
+                     name_filter=name,
+                     quantity_filter=quantity,
+                     is_ssh=is_ssh)
+
+                yield from _format_kubernetes_realtime_gpu(total_table,
+                                                           k8s_realtime_infos,
+                                                           all_nodes_info,
+                                                           show_node_info=False,
+                                                           is_ssh=is_ssh)
+            except ValueError as e:
+                # In the case of a specific accelerator, show the error message
+                # immediately (e.g., "Resources H100 not found ...")
+                yield common_utils.format_exception(e, use_bracket=True)
+            if kubernetes_autoscaling:
+                k8s_messages += ('\n' +
+                                 kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
+            yield k8s_messages
+        if is_ssh:
+            if cloud_is_ssh:
+                if not ssh_is_enabled:
+                    yield ('SSH Node Pools are not enabled. To fix, run: '
+                           'sky check ssh ')
+                return True, print_section_titles
+        else:
+            if cloud_is_kubernetes:
+                if not kubernetes_is_enabled:
+                    yield ('Kubernetes is not enabled. To fix, run: '
+                           'sky check kubernetes ')
+                return True, print_section_titles
+        return False, print_section_titles
 
     def _output() -> Generator[str, None, None]:
         gpu_table = log_utils.create_table(
@@ -3643,46 +3819,28 @@ def show_gpus(
         clouds_to_list: Union[Optional[str], List[str]] = cloud_name
         if cloud_name is None:
             clouds_to_list = [
-                c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes'
+                c for c in service_catalog.ALL_CLOUDS
+                if c != 'kubernetes' and c != 'ssh'
             ]
 
         k8s_messages = ''
         if accelerator_str is None:
             # Collect k8s related messages in k8s_messages and print them at end
             print_section_titles = False
-            # If cloud is kubernetes, we want to show real-time capacity
-            if kubernetes_is_enabled and (cloud_name is None or
-                                          cloud_is_kubernetes):
-                context = region
-
-                try:
-                    # If --cloud kubernetes is not specified, we want to catch
-                    # the case where no GPUs are available on the cluster and
-                    # print the warning at the end.
-                    k8s_realtime_infos, total_table, all_nodes_info = _get_kubernetes_realtime_gpu_tables(context)  # pylint: disable=line-too-long
-                except ValueError as e:
-                    if not cloud_is_kubernetes:
-                        # Make it a note if cloud is not kubernetes
-                        k8s_messages += 'Note: '
-                    k8s_messages += str(e)
-                else:
-                    print_section_titles = True
-
-                    yield from _format_kubernetes_realtime_gpu(
-                        total_table,
-                        k8s_realtime_infos,
-                        all_nodes_info,
-                        show_node_info=True)
-
-                if kubernetes_autoscaling:
-                    k8s_messages += (
-                        '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
-            if cloud_is_kubernetes:
-                # Do not show clouds if --cloud kubernetes is specified
-                if not kubernetes_is_enabled:
-                    yield ('Kubernetes is not enabled. To fix, run: '
-                           'sky check kubernetes ')
-                yield k8s_messages
+            stop_iter = False
+            k8s_messages = ''
+            prev_print_section_titles = False
+            for is_ssh in [False, True]:
+                if prev_print_section_titles:
+                    yield '\n\n'
+                stop_iter_one, print_section_titles_one, k8s_messages_one = (
+                    yield from _possibly_show_k8s_like_realtime(is_ssh))
+                stop_iter = stop_iter or stop_iter_one
+                print_section_titles = (print_section_titles or
+                                        print_section_titles_one)
+                k8s_messages += k8s_messages_one
+                prev_print_section_titles = print_section_titles_one
+            if stop_iter:
                 return
 
         # For show_all, show the k8s message at the start since output is
@@ -3757,34 +3915,19 @@ def show_gpus(
             name, quantity = accelerator_str, None
 
         print_section_titles = False
-        if (kubernetes_is_enabled and
-                (cloud_name is None or cloud_is_kubernetes) and not show_all):
-            # Print section title if not showing all and instead a specific
-            # accelerator is requested
-            print_section_titles = True
-            # TODO(romilb): Show filtered per node GPU availability here as well
-            try:
-                (k8s_realtime_infos, total_table,
-                 all_nodes_info) = _get_kubernetes_realtime_gpu_tables(
-                     context=region, name_filter=name, quantity_filter=quantity)
-
-                yield from _format_kubernetes_realtime_gpu(total_table,
-                                                           k8s_realtime_infos,
-                                                           all_nodes_info,
-                                                           show_node_info=False)
-            except ValueError as e:
-                # In the case of a specific accelerator, show the error message
-                # immediately (e.g., "Resources H100 not found ...")
-                yield common_utils.format_exception(e, use_bracket=True)
-            if kubernetes_autoscaling:
-                k8s_messages += ('\n' +
-                                 kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
-            yield k8s_messages
-            if cloud_is_kubernetes:
-                # Do not show clouds if --cloud kubernetes is specified
-                if not kubernetes_is_enabled:
-                    yield ('Kubernetes is not enabled. To fix, run: '
-                           'sky check kubernetes ')
+        stop_iter = False
+        prev_print_section_titles = False
+        for is_ssh in [False, True]:
+            if prev_print_section_titles:
+                yield '\n\n'
+            stop_iter_one, print_section_titles_one = (
+                yield from _possibly_show_k8s_like_realtime_for_acc(
+                    name, quantity, is_ssh))
+            stop_iter = stop_iter or stop_iter_one
+            print_section_titles = (print_section_titles or
+                                    print_section_titles_one)
+            prev_print_section_titles = print_section_titles_one
+        if stop_iter:
             return
 
         # For clouds other than Kubernetes, get the accelerator details
@@ -4328,7 +4471,8 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
                          f'Provided {" ".join(arguments)!r}.')
 
     if not yes:
-        job_identity_str = (f'managed jobs with IDs {job_id_str}'
+        plural = 's' if len(job_ids) > 1 else ''
+        job_identity_str = (f'managed job{plural} with ID{plural} {job_id_str}'
                             if job_ids else repr(name))
         if all_users:
            job_identity_str = 'all managed jobs FOR ALL USERS'
@@ -6050,10 +6194,14 @@ def api_status(request_ids: Optional[List[str]], all_status: bool,
               '-e',
               required=False,
               help='The SkyPilot API server endpoint.')
+@click.option('--get-token',
+              is_flag=True,
+              default=False,
+              help='Force token-based login.')
 @usage_lib.entrypoint
-def api_login(endpoint: Optional[str]):
+def api_login(endpoint: Optional[str], get_token: bool):
     """Logs into a SkyPilot API server."""
-    sdk.api_login(endpoint)
+    sdk.api_login(endpoint, get_token)
 
 
 @api.command('info', cls=_DocumentedCodeCommand)
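A minimal sketch of the same flow through the SDK, mirroring how the command wires --get-token into sdk.api_login(endpoint, get_token) above (the endpoint URL is a placeholder):

    from sky.client import sdk

    # Force token-based login; the second positional argument corresponds to
    # the new get_token parameter shown in the diff above.
    sdk.api_login('http://localhost:46580', True)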
@@ -6065,6 +6213,10 @@ def api_info():
     api_server_info = sdk.api_info()
     user_name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
     user_hash = common_utils.get_user_hash()
+    api_server_user = api_server_info.get('user')
+    if api_server_user is not None:
+        user_name = api_server_user['name']
+        user_hash = api_server_user['id']
     dashboard_url = server_common.get_dashboard_url(url)
     click.echo(f'Using SkyPilot API server: {url}\n'
                f'{ux_utils.INDENT_SYMBOL}Status: {api_server_info["status"]}, '
@@ -6074,6 +6226,58 @@ def api_info():
                f'{ux_utils.INDENT_LAST_SYMBOL}Dashboard: {dashboard_url}')
 
 
+@cli.group(cls=_NaturalOrderGroup)
+def ssh():
+    """Commands for managing SSH Node Pools."""
+    pass
+
+
+@ssh.command('up', cls=_DocumentedCodeCommand)
+@click.option(
+    '--infra',
+    help='Name of the cluster to set up in ~/.sky/ssh_node_pools.yaml. '
+    'If not specified, all clusters in the file will be set up.')
+@click.option('--async',
+              'async_call',
+              is_flag=True,
+              hidden=True,
+              help='Run the command asynchronously.')
+def ssh_up(infra: Optional[str], async_call: bool):
+    """Set up a cluster using SSH targets from ~/.sky/ssh_node_pools.yaml.
+
+    This command sets up a Kubernetes cluster on the machines specified in
+    ~/.sky/ssh_node_pools.yaml and configures SkyPilot to use it.
+    """
+    request_id = sdk.ssh_up(infra=infra)
+    if async_call:
+        print(f'Request submitted with ID: {request_id}')
+    else:
+        sdk.stream_and_get(request_id)
+
+
+@ssh.command('down', cls=_DocumentedCodeCommand)
+@click.option(
+    '--infra',
+    help='Name of the cluster to clean up in ~/.sky/ssh_node_pools.yaml. '
+    'If not specified, all clusters in the file will be cleaned up.')
+@click.option('--async',
+              'async_call',
+              is_flag=True,
+              hidden=True,
+              help='Run the command asynchronously.')
+def ssh_down(infra, async_call):
+    """Clean up a cluster set up with 'sky ssh up'.
+
+    This command removes the Kubernetes installation from the machines specified
+    in ~/.sky/ssh_node_pools.yaml.
+    """
+    request_id = sdk.ssh_down(infra=infra)
+    if async_call:
+        print(f'Request submitted with ID: {request_id}')
+    else:
+        sdk.stream_and_get(request_id)
+
+
 def main():
     return cli()
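The two commands above are thin wrappers over SDK calls, so the same lifecycle can be scripted. A minimal sketch, assuming ~/.sky/ssh_node_pools.yaml contains an entry named 'my-node-pool' (a placeholder name):

    from sky.client import sdk

    # Deploy Kubernetes on the 'my-node-pool' SSH targets and register it
    # with SkyPilot, streaming the provisioning logs.
    request_id = sdk.ssh_up(infra='my-node-pool')
    sdk.stream_and_get(request_id)

    # Later, remove the Kubernetes installation from those machines again.
    request_id = sdk.ssh_down(infra='my-node-pool')
    sdk.stream_and_get(request_id)

Passing infra=None applies the operation to every cluster defined in the file, matching the CLI behavior when --infra is omitted.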