skypilot-nightly 1.0.0.dev20250521__py3-none-any.whl → 1.0.0.dev20250523__py3-none-any.whl

This diff shows the changes between two publicly released versions of this package, as published to a supported public registry. It is provided for informational purposes only.
Files changed (90)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +46 -16
  3. sky/backends/cloud_vm_ray_backend.py +16 -4
  4. sky/check.py +109 -44
  5. sky/cli.py +261 -90
  6. sky/client/cli.py +261 -90
  7. sky/client/sdk.py +122 -3
  8. sky/clouds/__init__.py +5 -0
  9. sky/clouds/aws.py +4 -2
  10. sky/clouds/azure.py +4 -2
  11. sky/clouds/cloud.py +30 -6
  12. sky/clouds/cudo.py +2 -1
  13. sky/clouds/do.py +2 -1
  14. sky/clouds/fluidstack.py +2 -1
  15. sky/clouds/gcp.py +160 -23
  16. sky/clouds/ibm.py +4 -2
  17. sky/clouds/kubernetes.py +66 -22
  18. sky/clouds/lambda_cloud.py +2 -1
  19. sky/clouds/nebius.py +18 -2
  20. sky/clouds/oci.py +4 -2
  21. sky/clouds/paperspace.py +2 -1
  22. sky/clouds/runpod.py +2 -1
  23. sky/clouds/scp.py +2 -1
  24. sky/clouds/service_catalog/__init__.py +3 -0
  25. sky/clouds/service_catalog/common.py +9 -2
  26. sky/clouds/service_catalog/constants.py +2 -1
  27. sky/clouds/service_catalog/ssh_catalog.py +167 -0
  28. sky/clouds/ssh.py +203 -0
  29. sky/clouds/vast.py +2 -1
  30. sky/clouds/vsphere.py +2 -1
  31. sky/core.py +59 -17
  32. sky/dashboard/out/404.html +1 -1
  33. sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_buildManifest.js +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +1 -0
  35. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  36. sky/dashboard/out/clusters/[cluster].html +1 -1
  37. sky/dashboard/out/clusters.html +1 -1
  38. sky/dashboard/out/index.html +1 -1
  39. sky/dashboard/out/infra.html +1 -1
  40. sky/dashboard/out/jobs/[job].html +1 -1
  41. sky/dashboard/out/jobs.html +1 -1
  42. sky/data/storage.py +1 -0
  43. sky/execution.py +56 -7
  44. sky/jobs/server/core.py +4 -2
  45. sky/optimizer.py +29 -15
  46. sky/provision/__init__.py +1 -0
  47. sky/provision/aws/instance.py +17 -1
  48. sky/provision/gcp/constants.py +147 -4
  49. sky/provision/gcp/instance_utils.py +10 -0
  50. sky/provision/gcp/volume_utils.py +247 -0
  51. sky/provision/kubernetes/instance.py +16 -5
  52. sky/provision/kubernetes/utils.py +37 -19
  53. sky/provision/nebius/instance.py +3 -1
  54. sky/provision/nebius/utils.py +14 -2
  55. sky/provision/ssh/__init__.py +18 -0
  56. sky/resources.py +177 -4
  57. sky/serve/server/core.py +2 -4
  58. sky/server/common.py +46 -9
  59. sky/server/constants.py +2 -0
  60. sky/server/html/token_page.html +154 -0
  61. sky/server/requests/executor.py +3 -6
  62. sky/server/requests/payloads.py +7 -0
  63. sky/server/server.py +80 -8
  64. sky/setup_files/dependencies.py +1 -0
  65. sky/skypilot_config.py +117 -31
  66. sky/task.py +24 -1
  67. sky/templates/gcp-ray.yml.j2 +44 -1
  68. sky/templates/nebius-ray.yml.j2 +12 -2
  69. sky/utils/admin_policy_utils.py +26 -22
  70. sky/utils/context.py +36 -6
  71. sky/utils/context_utils.py +15 -0
  72. sky/utils/infra_utils.py +21 -1
  73. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  74. sky/utils/kubernetes/create_cluster.sh +1 -0
  75. sky/utils/kubernetes/deploy_remote_cluster.py +1437 -0
  76. sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
  77. sky/utils/kubernetes/ssh-tunnel.sh +387 -0
  78. sky/utils/log_utils.py +214 -1
  79. sky/utils/resources_utils.py +14 -0
  80. sky/utils/schemas.py +67 -0
  81. sky/utils/ux_utils.py +2 -1
  82. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/METADATA +6 -1
  83. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/RECORD +88 -81
  84. sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
  85. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  86. /sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_ssgManifest.js +0 -0
  87. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/WHEEL +0 -0
  88. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/entry_points.txt +0 -0
  89. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/licenses/LICENSE +0 -0
  90. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/top_level.txt +0 -0
sky/client/cli.py CHANGED
@@ -212,6 +212,7 @@ def _get_glob_storages(storages: List[str]) -> List[str]:
     """Returns a list of storages that match the glob pattern."""
     glob_storages = []
     for storage_object in storages:
+        # TODO(zhwu): client side should not rely on global_user_state.
         glob_storage = global_user_state.get_glob_storage_name(storage_object)
         if not glob_storage:
             click.echo(f'Storage {storage_object} not found.')
@@ -1780,6 +1781,27 @@ def _show_endpoint(query_clusters: Optional[List[str]],
     return
 
 
+def _show_enabled_infra():
+    """Show the enabled infrastructure."""
+    title = (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Enabled Infra:'
+             f'{colorama.Style.RESET_ALL} ')
+    enabled_clouds = sdk.get(sdk.enabled_clouds())
+    enabled_ssh_infras = []
+    enabled_k8s_infras = []
+    enabled_cloud_infras = []
+    for cloud in enabled_clouds:
+        cloud_infra = cloud.get_infras()
+        if isinstance(cloud, clouds.SSH):
+            enabled_ssh_infras.extend(cloud_infra)
+        elif isinstance(cloud, clouds.Kubernetes):
+            enabled_k8s_infras.extend(cloud_infra)
+        else:
+            enabled_cloud_infras.extend(cloud_infra)
+    all_infras = sorted(enabled_ssh_infras) + sorted(
+        enabled_k8s_infras) + sorted(enabled_cloud_infras)
+    click.echo(f'{title}{", ".join(all_infras)}\n')
+
+
 @cli.command()
 @config_option(expose_value=False)
 @click.option('--verbose',
@@ -1966,6 +1988,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
                             ('endpoint port'
                              if show_single_endpoint else 'endpoints')))
     else:
+        _show_enabled_infra()
         click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters'
                    f'{colorama.Style.RESET_ALL}')
     query_clusters: Optional[List[str]] = None if not clusters else clusters
@@ -3462,13 +3485,22 @@ def show_gpus(
 
     # Kubernetes specific bools
     enabled_clouds = sdk.get(sdk.enabled_clouds())
-    cloud_is_kubernetes = isinstance(cloud_obj, clouds.Kubernetes)
+    cloud_is_kubernetes = isinstance(
+        cloud_obj, clouds.Kubernetes) and not isinstance(cloud_obj, clouds.SSH)
+    cloud_is_ssh = isinstance(cloud_obj, clouds.SSH)
     # TODO(romilb): We should move this to the backend.
     kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None
-    kubernetes_is_enabled = clouds.cloud_in_iterable(
-        clouds.Kubernetes(),
-        enabled_clouds,
-    )
+    kubernetes_is_enabled = False
+    ssh_is_enabled = False
+    for cloud in enabled_clouds:
+        if isinstance(cloud, clouds.SSH):
+            ssh_is_enabled = True
+        elif isinstance(cloud, clouds.Kubernetes):
+            kubernetes_is_enabled = True
+    query_k8s_realtime_gpu = (kubernetes_is_enabled and
+                              (cloud_name is None or cloud_is_kubernetes))
+    query_ssh_realtime_gpu = (ssh_is_enabled and
+                              (cloud_name is None or cloud_is_ssh))
 
     def _list_to_str(lst):
         return ', '.join([str(e) for e in lst])
@@ -3478,7 +3510,8 @@ def show_gpus(
     def _get_kubernetes_realtime_gpu_tables(
         context: Optional[str] = None,
         name_filter: Optional[str] = None,
-        quantity_filter: Optional[int] = None
+        quantity_filter: Optional[int] = None,
+        is_ssh: bool = False,
     ) -> Tuple[List[Tuple[str, 'prettytable.PrettyTable']],
                Optional['prettytable.PrettyTable'], List[Tuple[
                    str, 'models.KubernetesNodesInfo']]]:
@@ -3491,19 +3524,26 @@ def show_gpus(
             sdk.realtime_kubernetes_gpu_availability(
                 context=context,
                 name_filter=name_filter,
-                quantity_filter=quantity_filter))
+                quantity_filter=quantity_filter,
+                is_ssh=is_ssh))
         if not realtime_gpu_availability_lists:
-            err_msg = 'No GPUs found in any allowed Kubernetes cluster. '
-            debug_msg = 'To further debug, run: sky check '
+            # Customize message based on context
+            identity = ('SSH Node Pool'
+                        if is_ssh else 'any allowed Kubernetes cluster')
+            cloud_name = 'ssh' if is_ssh else 'kubernetes'
+            err_msg = f'No GPUs found in {identity}. '
+            debug_msg = (f'To further debug, run: sky check {cloud_name}')
             if name_filter is not None:
                 gpu_info_msg = f' {name_filter!r}'
                 if quantity_filter is not None:
                     gpu_info_msg += (' with requested quantity'
                                      f' {quantity_filter}')
                 err_msg = (f'Resources{gpu_info_msg} not found '
-                           'in any allowed Kubernetes cluster. ')
-                debug_msg = ('To show available accelerators on kubernetes,'
-                             ' run: sky show-gpus --cloud kubernetes ')
+                           f'in {identity}. ')
+                identity_short = 'SSH Node Pool' if is_ssh else 'Kubernetes'
+                debug_msg = (
+                    f'To show available accelerators in {identity_short}, '
+                    f'run: sky show-gpus --cloud {cloud_name}')
             full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
                             debug_msg)
             raise ValueError(full_err_msg)
@@ -3513,6 +3553,14 @@ def show_gpus(
             lambda: [0, 0])
         all_nodes_info = []
 
+        # display an aggregated table for all contexts
+        # if there are more than one contexts with GPUs.
+        def _filter_ctx(ctx: str) -> bool:
+            ctx_is_ssh = ctx and ctx.startswith('ssh-')
+            return ctx_is_ssh is is_ssh
+
+        num_filtered_contexts = 0
+
         if realtime_gpu_availability_lists:
             if len(realtime_gpu_availability_lists[0]) != 2:
                 # TODO(kyuds): for backwards compatibility, as we add new
@@ -3522,6 +3570,13 @@ def show_gpus(
                     (context, realtime_gpu_availability_lists)
                 ]
             for (ctx, availability_list) in realtime_gpu_availability_lists:
+                if not _filter_ctx(ctx):
+                    continue
+                if is_ssh:
+                    display_ctx = ctx.lstrip('ssh-')
+                else:
+                    display_ctx = ctx
+                num_filtered_contexts += 1
                 realtime_gpu_table = log_utils.create_table(
                     ['GPU', qty_header, 'UTILIZATION'])
                 for realtime_gpu_availability in sorted(availability_list):
@@ -3542,15 +3597,12 @@ def show_gpus(
                     if capacity > 0:
                         total_gpu_info[gpu][0] += capacity
                         total_gpu_info[gpu][1] += available
-                realtime_gpu_infos.append((ctx, realtime_gpu_table))
+                realtime_gpu_infos.append((display_ctx, realtime_gpu_table))
                 # Collect node info for this context
                 nodes_info = sdk.stream_and_get(
                     sdk.kubernetes_node_info(context=ctx))
-                all_nodes_info.append((ctx, nodes_info))
-
-        # display an aggregated table for all contexts
-        # if there are more than one contexts with GPUs
-        if len(realtime_gpu_infos) > 1:
+                all_nodes_info.append((display_ctx, nodes_info))
+        if num_filtered_contexts > 1:
             total_realtime_gpu_table = log_utils.create_table(
                 ['GPU', 'UTILIZATION'])
             for gpu, stats in total_gpu_info.items():
@@ -3562,10 +3614,11 @@ def show_gpus(
         return realtime_gpu_infos, total_realtime_gpu_table, all_nodes_info
 
     def _format_kubernetes_node_info_combined(
-            contexts_info: List[Tuple[str,
-                                      'models.KubernetesNodesInfo']]) -> str:
+            contexts_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
+            cloud_str: str = 'Kubernetes',
+            context_title_str: str = 'CONTEXT') -> str:
         node_table = log_utils.create_table(
-            ['CONTEXT', 'NODE', 'GPU', 'UTILIZATION'])
+            [context_title_str, 'NODE', 'GPU', 'UTILIZATION'])
 
         no_permissions_str = '<no permissions>'
         hints = []
@@ -3588,7 +3641,7 @@ def show_gpus(
                 'free'
             ])
 
-        k8s_per_node_acc_message = ('Kubernetes per-node GPU availability')
+        k8s_per_node_acc_message = (f'{cloud_str} per-node GPU availability')
         if hints:
             k8s_per_node_acc_message += ' (' + '; '.join(hints) + ')'
 
@@ -3598,26 +3651,30 @@ def show_gpus(
                 f'{node_table.get_string()}')
 
     def _format_kubernetes_realtime_gpu(
-            total_table: 'prettytable.PrettyTable',
+            total_table: Optional['prettytable.PrettyTable'],
             k8s_realtime_infos: List[Tuple[str, 'prettytable.PrettyTable']],
             all_nodes_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
-            show_node_info: bool) -> Generator[str, None, None]:
+            show_node_info: bool, is_ssh: bool) -> Generator[str, None, None]:
+        identity = 'SSH Node Pool' if is_ssh else 'Kubernetes'
         yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
-               'Kubernetes GPUs'
+               f'{identity} GPUs'
                f'{colorama.Style.RESET_ALL}')
         # print total table
         if total_table is not None:
             yield '\n'
             yield from total_table.get_string()
 
+        ctx_name = 'SSH Node Pool' if is_ssh else 'Context'
+        ctx_column_title = 'NODE_POOL' if is_ssh else 'CONTEXT'
+
         # print individual infos.
         for (ctx, k8s_realtime_table) in k8s_realtime_infos:
             yield '\n'
             # Print context header separately
             if ctx:
-                context_str = f'Context: {ctx}'
+                context_str = f'{ctx_name}: {ctx}'
             else:
-                context_str = 'Default Context'
+                context_str = f'Default {ctx_name}'
             yield (
                 f'{colorama.Fore.CYAN}{context_str}{colorama.Style.RESET_ALL}\n'
             )
@@ -3625,7 +3682,102 @@ def show_gpus(
 
         if show_node_info:
             yield '\n'
-            yield _format_kubernetes_node_info_combined(all_nodes_info)
+            yield _format_kubernetes_node_info_combined(all_nodes_info,
+                                                        identity,
+                                                        ctx_column_title)
+
+    def _possibly_show_k8s_like_realtime(
+        is_ssh: bool = False
+    ) -> Generator[str, None, Tuple[bool, bool, str]]:
+        # If cloud is kubernetes, we want to show real-time capacity
+        k8s_messages = ''
+        print_section_titles = False
+        if (is_ssh and query_ssh_realtime_gpu or query_k8s_realtime_gpu):
+            context = region
+
+            try:
+                # If --cloud kubernetes is not specified, we want to catch
+                # the case where no GPUs are available on the cluster and
+                # print the warning at the end.
+                k8s_realtime_infos, total_table, all_nodes_info = (
+                    _get_kubernetes_realtime_gpu_tables(context, is_ssh=is_ssh))
+            except ValueError as e:
+                if not (cloud_is_kubernetes or cloud_is_ssh):
+                    # Make it a note if cloud is not kubernetes
+                    k8s_messages += 'Note: '
+                k8s_messages += str(e)
+            else:
+                print_section_titles = True
+
+                yield from _format_kubernetes_realtime_gpu(total_table,
+                                                           k8s_realtime_infos,
+                                                           all_nodes_info,
+                                                           show_node_info=True,
+                                                           is_ssh=is_ssh)
+
+            if kubernetes_autoscaling:
+                k8s_messages += ('\n' +
+                                 kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
+        if is_ssh:
+            if cloud_is_ssh:
+                if not ssh_is_enabled:
+                    yield ('SSH Node Pools are not enabled. To fix, run: '
+                           'sky check ssh ')
+                yield k8s_messages
+                return True, print_section_titles, ''
+        else:
+            if cloud_is_kubernetes:
+                if not kubernetes_is_enabled:
+                    yield ('Kubernetes is not enabled. To fix, run: '
+                           'sky check kubernetes ')
+                yield k8s_messages
+                return True, print_section_titles, ''
+        return False, print_section_titles, k8s_messages
+
+    def _possibly_show_k8s_like_realtime_for_acc(
+            name: Optional[str],
+            quantity: Optional[int],
+            is_ssh: bool = False) -> Generator[str, None, Tuple[bool, bool]]:
+        k8s_messages = ''
+        print_section_titles = False
+        if (is_ssh and query_ssh_realtime_gpu or
+                query_k8s_realtime_gpu) and not show_all:
+            print_section_titles = True
+            # TODO(romilb): Show filtered per node GPU availability here as well
+            try:
+                (k8s_realtime_infos, total_table,
+                 all_nodes_info) = _get_kubernetes_realtime_gpu_tables(
+                     context=region,
+                     name_filter=name,
+                     quantity_filter=quantity,
+                     is_ssh=is_ssh)
+
+                yield from _format_kubernetes_realtime_gpu(total_table,
+                                                           k8s_realtime_infos,
+                                                           all_nodes_info,
+                                                           show_node_info=False,
+                                                           is_ssh=is_ssh)
+            except ValueError as e:
+                # In the case of a specific accelerator, show the error message
+                # immediately (e.g., "Resources H100 not found ...")
+                yield common_utils.format_exception(e, use_bracket=True)
+            if kubernetes_autoscaling:
+                k8s_messages += ('\n' +
+                                 kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
+            yield k8s_messages
+        if is_ssh:
+            if cloud_is_ssh:
+                if not ssh_is_enabled:
+                    yield ('SSH Node Pools are not enabled. To fix, run: '
+                           'sky check ssh ')
+                return True, print_section_titles
+        else:
+            if cloud_is_kubernetes:
+                if not kubernetes_is_enabled:
+                    yield ('Kubernetes is not enabled. To fix, run: '
+                           'sky check kubernetes ')
+                return True, print_section_titles
+        return False, print_section_titles
 
     def _output() -> Generator[str, None, None]:
         gpu_table = log_utils.create_table(
@@ -3643,46 +3795,28 @@ def show_gpus(
         clouds_to_list: Union[Optional[str], List[str]] = cloud_name
         if cloud_name is None:
             clouds_to_list = [
-                c for c in service_catalog.ALL_CLOUDS if c != 'kubernetes'
+                c for c in service_catalog.ALL_CLOUDS
+                if c != 'kubernetes' and c != 'ssh'
             ]
 
         k8s_messages = ''
         if accelerator_str is None:
             # Collect k8s related messages in k8s_messages and print them at end
             print_section_titles = False
-            # If cloud is kubernetes, we want to show real-time capacity
-            if kubernetes_is_enabled and (cloud_name is None or
-                                          cloud_is_kubernetes):
-                context = region
-
-                try:
-                    # If --cloud kubernetes is not specified, we want to catch
-                    # the case where no GPUs are available on the cluster and
-                    # print the warning at the end.
-                    k8s_realtime_infos, total_table, all_nodes_info = _get_kubernetes_realtime_gpu_tables(context)  # pylint: disable=line-too-long
-                except ValueError as e:
-                    if not cloud_is_kubernetes:
-                        # Make it a note if cloud is not kubernetes
-                        k8s_messages += 'Note: '
-                    k8s_messages += str(e)
-                else:
-                    print_section_titles = True
-
-                    yield from _format_kubernetes_realtime_gpu(
-                        total_table,
-                        k8s_realtime_infos,
-                        all_nodes_info,
-                        show_node_info=True)
-
-                if kubernetes_autoscaling:
-                    k8s_messages += (
-                        '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
-            if cloud_is_kubernetes:
-                # Do not show clouds if --cloud kubernetes is specified
-                if not kubernetes_is_enabled:
-                    yield ('Kubernetes is not enabled. To fix, run: '
-                           'sky check kubernetes ')
-                yield k8s_messages
+            stop_iter = False
+            k8s_messages = ''
+            prev_print_section_titles = False
+            for is_ssh in [False, True]:
+                if prev_print_section_titles:
+                    yield '\n\n'
+                stop_iter_one, print_section_titles_one, k8s_messages_one = (
+                    yield from _possibly_show_k8s_like_realtime(is_ssh))
+                stop_iter = stop_iter or stop_iter_one
+                print_section_titles = (print_section_titles or
+                                        print_section_titles_one)
+                k8s_messages += k8s_messages_one
+                prev_print_section_titles = print_section_titles_one
+            if stop_iter:
                 return
 
         # For show_all, show the k8s message at the start since output is
@@ -3757,34 +3891,19 @@ def show_gpus(
         name, quantity = accelerator_str, None
 
         print_section_titles = False
-        if (kubernetes_is_enabled and
-                (cloud_name is None or cloud_is_kubernetes) and not show_all):
-            # Print section title if not showing all and instead a specific
-            # accelerator is requested
-            print_section_titles = True
-            # TODO(romilb): Show filtered per node GPU availability here as well
-            try:
-                (k8s_realtime_infos, total_table,
-                 all_nodes_info) = _get_kubernetes_realtime_gpu_tables(
-                     context=region, name_filter=name, quantity_filter=quantity)
-
-                yield from _format_kubernetes_realtime_gpu(total_table,
-                                                           k8s_realtime_infos,
-                                                           all_nodes_info,
-                                                           show_node_info=False)
-            except ValueError as e:
-                # In the case of a specific accelerator, show the error message
-                # immediately (e.g., "Resources H100 not found ...")
-                yield common_utils.format_exception(e, use_bracket=True)
-            if kubernetes_autoscaling:
-                k8s_messages += ('\n' +
-                                 kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
-            yield k8s_messages
-        if cloud_is_kubernetes:
-            # Do not show clouds if --cloud kubernetes is specified
-            if not kubernetes_is_enabled:
-                yield ('Kubernetes is not enabled. To fix, run: '
-                       'sky check kubernetes ')
+        stop_iter = False
+        prev_print_section_titles = False
+        for is_ssh in [False, True]:
+            if prev_print_section_titles:
+                yield '\n\n'
+            stop_iter_one, print_section_titles_one = (
+                yield from _possibly_show_k8s_like_realtime_for_acc(
+                    name, quantity, is_ssh))
+            stop_iter = stop_iter or stop_iter_one
+            print_section_titles = (print_section_titles or
+                                    print_section_titles_one)
+            prev_print_section_titles = print_section_titles_one
+        if stop_iter:
             return
 
         # For clouds other than Kubernetes, get the accelerator details
@@ -6074,6 +6193,58 @@ def api_info():
                f'{ux_utils.INDENT_LAST_SYMBOL}Dashboard: {dashboard_url}')
 
 
+@cli.group(cls=_NaturalOrderGroup)
+def ssh():
+    """Commands for managing SSH Node Pools."""
+    pass
+
+
+@ssh.command('up', cls=_DocumentedCodeCommand)
+@click.option(
+    '--infra',
+    help='Name of the cluster to set up in ~/.sky/ssh_node_pools.yaml. '
+    'If not specified, all clusters in the file will be set up.')
+@click.option('--async',
+              'async_call',
+              is_flag=True,
+              hidden=True,
+              help='Run the command asynchronously.')
+def ssh_up(infra: Optional[str], async_call: bool):
+    """Set up a cluster using SSH targets from ~/.sky/ssh_node_pools.yaml.
+
+    This command sets up a Kubernetes cluster on the machines specified in
+    ~/.sky/ssh_node_pools.yaml and configures SkyPilot to use it.
+    """
+    request_id = sdk.ssh_up(infra=infra)
+    if async_call:
+        print(f'Request submitted with ID: {request_id}')
+    else:
+        sdk.stream_and_get(request_id)
+
+
+@ssh.command('down', cls=_DocumentedCodeCommand)
+@click.option(
+    '--infra',
+    help='Name of the cluster to clean up in ~/.sky/ssh_node_pools.yaml. '
+    'If not specified, all clusters in the file will be cleaned up.')
+@click.option('--async',
+              'async_call',
+              is_flag=True,
+              hidden=True,
+              help='Run the command asynchronously.')
+def ssh_down(infra, async_call):
+    """Clean up a cluster set up with 'sky ssh up'.
+
+    This command removes the Kubernetes installation from the machines
+    specified in ~/.sky/ssh_node_pools.yaml.
+    """
+    request_id = sdk.ssh_down(infra=infra)
+    if async_call:
+        print(f'Request submitted with ID: {request_id}')
+    else:
+        sdk.stream_and_get(request_id)
+
+
 def main():
     return cli()
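
For reference, the new 'sky ssh up' and 'sky ssh down' commands above are thin wrappers over the client SDK. Below is a minimal sketch of the equivalent programmatic flow, assuming a reachable API server; the pool name 'my-nodes' is hypothetical and stands in for an entry the user has defined in ~/.sky/ssh_node_pools.yaml.

    from sky.client import sdk

    # Deploy the SSH Node Pool named 'my-nodes' (hypothetical entry in
    # ~/.sky/ssh_node_pools.yaml). Passing infra=None falls back to the
    # default selection described in the docstrings above.
    request_id = sdk.ssh_up(infra='my-nodes')
    # Block until the deployment request finishes, streaming its logs,
    # as the non-async CLI path does.
    sdk.stream_and_get(request_id)

    # Later, tear the same pool back down.
    sdk.stream_and_get(sdk.ssh_down(infra='my-nodes'))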
 
sky/client/sdk.py CHANGED
@@ -10,14 +10,19 @@ Usage example:
     statuses = sky.get(request_id)
 
 """
+import base64
+import binascii
 import getpass
+from http import cookiejar
 import json
 import logging
 import os
 import pathlib
 import subprocess
+import time
 import typing
 from typing import Any, Dict, List, Optional, Tuple, Union
+from urllib import parse as urlparse
 import webbrowser
 
 import click
@@ -220,7 +225,7 @@ def list_accelerator_counts(
         accelerator names mapped to a list of available counts. See usage
         in cli.py.
     """
-    body = payloads.ListAcceleratorsBody(
+    body = payloads.ListAcceleratorCountsBody(
         gpus_only=gpus_only,
         name_filter=name_filter,
         region_filter=region_filter,
@@ -1391,13 +1396,60 @@ def local_down() -> server_common.RequestId:
     return server_common.get_request_id(response)
 
 
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@annotations.client_api
+def ssh_up(infra: Optional[str] = None) -> server_common.RequestId:
+    """Deploys the SSH Node Pools defined in ~/.sky/ssh_targets.yaml.
+
+    Args:
+        infra: Name of the cluster configuration in ssh_targets.yaml.
+            If None, the first cluster in the file is used.
+
+    Returns:
+        request_id: The request ID of the SSH cluster deployment request.
+    """
+    body = payloads.SSHUpBody(
+        infra=infra,
+        cleanup=False,
+    )
+    response = requests.post(f'{server_common.get_server_url()}/ssh_up',
+                             json=json.loads(body.model_dump_json()),
+                             cookies=server_common.get_api_cookie_jar())
+    return server_common.get_request_id(response)
+
+
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@annotations.client_api
+def ssh_down(infra: Optional[str] = None) -> server_common.RequestId:
+    """Tears down a Kubernetes cluster on SSH targets.
+
+    Args:
+        infra: Name of the cluster configuration in ssh_targets.yaml.
+            If None, the first cluster in the file is used.
+
+    Returns:
+        request_id: The request ID of the SSH cluster teardown request.
+    """
+    body = payloads.SSHUpBody(
+        infra=infra,
+        cleanup=True,
+    )
+    response = requests.post(f'{server_common.get_server_url()}/ssh_down',
+                             json=json.loads(body.model_dump_json()),
+                             cookies=server_common.get_api_cookie_jar())
+    return server_common.get_request_id(response)
+
+
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start
 @annotations.client_api
 def realtime_kubernetes_gpu_availability(
         context: Optional[str] = None,
         name_filter: Optional[str] = None,
-        quantity_filter: Optional[int] = None) -> server_common.RequestId:
+        quantity_filter: Optional[int] = None,
+        is_ssh: Optional[bool] = None) -> server_common.RequestId:
     """Gets the real-time Kubernetes GPU availability.
 
     Returns:
@@ -1407,6 +1459,7 @@ def realtime_kubernetes_gpu_availability(
         context=context,
         name_filter=name_filter,
         quantity_filter=quantity_filter,
+        is_ssh=is_ssh,
     )
     response = requests.post(
         f'{server_common.get_server_url()}/'
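
The is_ssh flag threaded through realtime_kubernetes_gpu_availability above is what lets sky show-gpus report SSH Node Pools separately from regular Kubernetes contexts. A minimal sketch of querying it from client code follows; it is a sketch only, and the exact shape of the returned availability list is handled defensively in cli.py above.

    from sky.client import sdk

    # Ask the API server for realtime GPU availability, restricted to SSH
    # Node Pool contexts (the CLI additionally filters contexts by their
    # 'ssh-' name prefix before rendering tables).
    request_id = sdk.realtime_kubernetes_gpu_availability(is_ssh=True)
    availability = sdk.get(request_id)

    # In the newer return format each entry is a (context, gpu_list) pair;
    # older servers may return a flat list, which cli.py wraps accordingly.
    for entry in availability:
        print(entry)
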
@@ -1841,7 +1894,73 @@ def api_login(endpoint: Optional[str] = None) -> None:
1841
1894
  not endpoint.startswith('https://')):
1842
1895
  raise click.BadParameter('Endpoint must be a valid URL.')
1843
1896
 
1844
- server_common.check_server_healthy(endpoint)
1897
+ server_status = server_common.check_server_healthy(endpoint)
1898
+ if server_status == server_common.ApiServerStatus.NEEDS_AUTH:
1899
+ # We detected an auth proxy, so go through the auth proxy cookie flow.
1900
+ parsed_url = urlparse.urlparse(endpoint)
1901
+ token_url = f'{endpoint}/token'
1902
+ click.echo('Authentication is needed. Please visit this URL setup up '
1903
+ f'the token:{colorama.Style.BRIGHT}\n\n{token_url}'
1904
+ f'\n{colorama.Style.RESET_ALL}')
1905
+ if webbrowser.open(token_url):
1906
+ click.echo('Opening browser...')
1907
+ token: str = click.prompt('Paste the token')
1908
+
1909
+ # Parse the token.
1910
+ # b64decode will ignore invalid characters, but does some length and
1911
+ # padding checks.
1912
+ try:
1913
+ data = base64.b64decode(token)
1914
+ except binascii.Error as e:
1915
+ raise ValueError(f'Malformed token: {token}') from e
1916
+ logger.debug(f'Token data: {data!r}')
1917
+ try:
1918
+ cookie_dict = json.loads(data)
1919
+ except (json.JSONDecodeError, UnicodeDecodeError) as e:
1920
+ raise ValueError(f'Malformed token data: {data!r}') from e
1921
+ if not isinstance(cookie_dict, dict):
1922
+ raise ValueError(f'Malformed token JSON: {cookie_dict}')
1923
+
1924
+ cookie_jar = cookiejar.MozillaCookieJar()
1925
+ for (name, value) in cookie_dict.items():
1926
+ # dict keys in JSON must be strings
1927
+ assert isinstance(name, str)
1928
+ if not isinstance(value, str):
1929
+ raise ValueError('Malformed token - bad key/value: '
1930
+ f'{name}: {value}')
1931
+
1932
+ # See CookieJar._cookie_from_cookie_tuple
1933
+ # oauth2proxy default is Max-Age 604800
1934
+ expires = int(time.time()) + 604800
1935
+ domain = str(parsed_url.hostname)
1936
+ domain_initial_dot = domain.startswith('.')
1937
+ if not domain_initial_dot:
1938
+ domain = '.' + domain
1939
+
1940
+ cookie_jar.set_cookie(
1941
+ cookiejar.Cookie(
1942
+ version=0,
1943
+ name=name,
1944
+ value=value,
1945
+ port=None,
1946
+ port_specified=False,
1947
+ domain=domain,
1948
+ domain_specified=True,
1949
+ domain_initial_dot=domain_initial_dot,
1950
+ path='',
1951
+ path_specified=False,
1952
+ secure=False,
1953
+ expires=expires,
1954
+ discard=False,
1955
+ comment=None,
1956
+ comment_url=None,
1957
+ rest=dict(),
1958
+ ))
1959
+
1960
+ # Now that the cookies are parsed, save them to the cookie jar.
1961
+ cookie_jar_path = os.path.expanduser(
1962
+ server_common.get_api_cookie_jar_path())
1963
+ cookie_jar.save(cookie_jar_path)
1845
1964
 
1846
1965
  # Set the endpoint in the config file
1847
1966
  config_path = pathlib.Path(