skypilot-nightly 1.0.0.dev20250522__py3-none-any.whl → 1.0.0.dev20250523__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +46 -16
- sky/backends/cloud_vm_ray_backend.py +16 -4
- sky/check.py +109 -44
- sky/cli.py +261 -90
- sky/client/cli.py +261 -90
- sky/client/sdk.py +50 -2
- sky/clouds/__init__.py +3 -0
- sky/clouds/aws.py +4 -2
- sky/clouds/azure.py +4 -2
- sky/clouds/cloud.py +24 -6
- sky/clouds/cudo.py +2 -1
- sky/clouds/do.py +2 -1
- sky/clouds/fluidstack.py +2 -1
- sky/clouds/gcp.py +4 -2
- sky/clouds/ibm.py +4 -2
- sky/clouds/kubernetes.py +66 -22
- sky/clouds/lambda_cloud.py +2 -1
- sky/clouds/nebius.py +18 -2
- sky/clouds/oci.py +4 -2
- sky/clouds/paperspace.py +2 -1
- sky/clouds/runpod.py +2 -1
- sky/clouds/scp.py +2 -1
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/ssh_catalog.py +167 -0
- sky/clouds/ssh.py +203 -0
- sky/clouds/vast.py +2 -1
- sky/clouds/vsphere.py +2 -1
- sky/core.py +53 -9
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{CzOVV6JpRQBRt5GhZuhyK → ECKwDNS9v9y3_IKFZ2lpp}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/optimizer.py +23 -4
- sky/provision/__init__.py +1 -0
- sky/provision/aws/instance.py +17 -1
- sky/provision/kubernetes/instance.py +16 -5
- sky/provision/kubernetes/utils.py +37 -19
- sky/provision/nebius/instance.py +3 -1
- sky/provision/nebius/utils.py +14 -2
- sky/provision/ssh/__init__.py +18 -0
- sky/resources.py +4 -1
- sky/server/requests/payloads.py +7 -0
- sky/server/server.py +40 -0
- sky/setup_files/dependencies.py +1 -0
- sky/templates/nebius-ray.yml.j2 +12 -0
- sky/utils/infra_utils.py +21 -1
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/create_cluster.sh +1 -0
- sky/utils/kubernetes/deploy_remote_cluster.py +1437 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
- sky/utils/kubernetes/ssh-tunnel.sh +387 -0
- sky/utils/log_utils.py +214 -1
- sky/utils/schemas.py +21 -0
- sky/utils/ux_utils.py +2 -1
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/METADATA +6 -1
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/RECORD +68 -63
- sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- /sky/dashboard/out/_next/static/{CzOVV6JpRQBRt5GhZuhyK → ECKwDNS9v9y3_IKFZ2lpp}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/top_level.txt +0 -0
sky/client/cli.py
CHANGED
@@ -212,6 +212,7 @@ def _get_glob_storages(storages: List[str]) -> List[str]:
|
|
212
212
|
"""Returns a list of storages that match the glob pattern."""
|
213
213
|
glob_storages = []
|
214
214
|
for storage_object in storages:
|
215
|
+
# TODO(zhwu): client side should not rely on global_user_state.
|
215
216
|
glob_storage = global_user_state.get_glob_storage_name(storage_object)
|
216
217
|
if not glob_storage:
|
217
218
|
click.echo(f'Storage {storage_object} not found.')
|
@@ -1780,6 +1781,27 @@ def _show_endpoint(query_clusters: Optional[List[str]],
|
|
1780
1781
|
return
|
1781
1782
|
|
1782
1783
|
|
1784
|
+
def _show_enabled_infra():
    """Print a one-line summary of the infras of every enabled cloud.

    Infras are grouped as SSH, then Kubernetes, then all other clouds; each
    group is sorted on its own before the groups are concatenated.
    """
    ssh_group = []
    k8s_group = []
    other_group = []
    for enabled in sdk.get(sdk.enabled_clouds()):
        infras = enabled.get_infras()
        # The SSH check must stay ahead of the Kubernetes check so that an
        # SSH cloud is never counted in the Kubernetes group.
        if isinstance(enabled, clouds.SSH):
            bucket = ssh_group
        elif isinstance(enabled, clouds.Kubernetes):
            bucket = k8s_group
        else:
            bucket = other_group
        bucket.extend(infras)

    ordered = [*sorted(ssh_group), *sorted(k8s_group), *sorted(other_group)]
    header = (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Enabled Infra:'
              f'{colorama.Style.RESET_ALL} ')
    infra_list = ', '.join(ordered)
    click.echo(f'{header}{infra_list}\n')
|
1803
|
+
|
1804
|
+
|
1783
1805
|
@cli.command()
|
1784
1806
|
@config_option(expose_value=False)
|
1785
1807
|
@click.option('--verbose',
|
@@ -1966,6 +1988,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1966
1988
|
('endpoint port'
|
1967
1989
|
if show_single_endpoint else 'endpoints')))
|
1968
1990
|
else:
|
1991
|
+
_show_enabled_infra()
|
1969
1992
|
click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters'
|
1970
1993
|
f'{colorama.Style.RESET_ALL}')
|
1971
1994
|
query_clusters: Optional[List[str]] = None if not clusters else clusters
|
@@ -3462,13 +3485,22 @@ def show_gpus(
|
|
3462
3485
|
|
3463
3486
|
# Kubernetes specific bools
|
3464
3487
|
enabled_clouds = sdk.get(sdk.enabled_clouds())
|
3465
|
-
cloud_is_kubernetes = isinstance(
|
3488
|
+
cloud_is_kubernetes = isinstance(
|
3489
|
+
cloud_obj, clouds.Kubernetes) and not isinstance(cloud_obj, clouds.SSH)
|
3490
|
+
cloud_is_ssh = isinstance(cloud_obj, clouds.SSH)
|
3466
3491
|
# TODO(romilb): We should move this to the backend.
|
3467
3492
|
kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None
|
3468
|
-
kubernetes_is_enabled =
|
3469
|
-
|
3470
|
-
|
3471
|
-
|
3493
|
+
kubernetes_is_enabled = False
|
3494
|
+
ssh_is_enabled = False
|
3495
|
+
for cloud in enabled_clouds:
|
3496
|
+
if isinstance(cloud, clouds.SSH):
|
3497
|
+
ssh_is_enabled = True
|
3498
|
+
elif isinstance(cloud, clouds.Kubernetes):
|
3499
|
+
kubernetes_is_enabled = True
|
3500
|
+
query_k8s_realtime_gpu = (kubernetes_is_enabled and
|
3501
|
+
(cloud_name is None or cloud_is_kubernetes))
|
3502
|
+
query_ssh_realtime_gpu = (ssh_is_enabled and
|
3503
|
+
(cloud_name is None or cloud_is_ssh))
|
3472
3504
|
|
3473
3505
|
def _list_to_str(lst):
|
3474
3506
|
return ', '.join([str(e) for e in lst])
|
@@ -3478,7 +3510,8 @@ def show_gpus(
|
|
3478
3510
|
def _get_kubernetes_realtime_gpu_tables(
|
3479
3511
|
context: Optional[str] = None,
|
3480
3512
|
name_filter: Optional[str] = None,
|
3481
|
-
quantity_filter: Optional[int] = None
|
3513
|
+
quantity_filter: Optional[int] = None,
|
3514
|
+
is_ssh: bool = False,
|
3482
3515
|
) -> Tuple[List[Tuple[str, 'prettytable.PrettyTable']],
|
3483
3516
|
Optional['prettytable.PrettyTable'], List[Tuple[
|
3484
3517
|
str, 'models.KubernetesNodesInfo']]]:
|
@@ -3491,19 +3524,26 @@ def show_gpus(
|
|
3491
3524
|
sdk.realtime_kubernetes_gpu_availability(
|
3492
3525
|
context=context,
|
3493
3526
|
name_filter=name_filter,
|
3494
|
-
quantity_filter=quantity_filter
|
3527
|
+
quantity_filter=quantity_filter,
|
3528
|
+
is_ssh=is_ssh))
|
3495
3529
|
if not realtime_gpu_availability_lists:
|
3496
|
-
|
3497
|
-
|
3530
|
+
# Customize message based on context
|
3531
|
+
identity = ('SSH Node Pool'
|
3532
|
+
if is_ssh else 'any allowed Kubernetes cluster')
|
3533
|
+
cloud_name = 'ssh' if is_ssh else 'kubernetes'
|
3534
|
+
err_msg = f'No GPUs found in {identity}. '
|
3535
|
+
debug_msg = (f'To further debug, run: sky check {cloud_name}')
|
3498
3536
|
if name_filter is not None:
|
3499
3537
|
gpu_info_msg = f' {name_filter!r}'
|
3500
3538
|
if quantity_filter is not None:
|
3501
3539
|
gpu_info_msg += (' with requested quantity'
|
3502
3540
|
f' {quantity_filter}')
|
3503
3541
|
err_msg = (f'Resources{gpu_info_msg} not found '
|
3504
|
-
'in
|
3505
|
-
|
3506
|
-
|
3542
|
+
f'in {identity}. ')
|
3543
|
+
identity_short = 'SSH Node Pool' if is_ssh else 'Kubernetes'
|
3544
|
+
debug_msg = (
|
3545
|
+
f'To show available accelerators in {identity_short}, '
|
3546
|
+
f'run: sky show-gpus --cloud {cloud_name}')
|
3507
3547
|
full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
|
3508
3548
|
debug_msg)
|
3509
3549
|
raise ValueError(full_err_msg)
|
@@ -3513,6 +3553,14 @@ def show_gpus(
|
|
3513
3553
|
lambda: [0, 0])
|
3514
3554
|
all_nodes_info = []
|
3515
3555
|
|
3556
|
+
# display an aggregated table for all contexts
|
3557
|
+
# if there is more than one context with GPUs.
|
3558
|
+
def _filter_ctx(ctx: Optional[str]) -> bool:
    """Return True if ``ctx`` belongs to the infra kind being listed.

    A context is treated as an SSH Node Pool context when its name starts
    with 'ssh-'. ``is_ssh`` is read from the enclosing scope.

    Args:
        ctx: Context name; None or '' denotes the default context and is
            treated as a non-SSH context.

    Returns:
        True when the context's SSH-ness matches ``is_ssh``.
    """
    # Keep this a real bool: `ctx and ctx.startswith(...)` evaluates to None
    # or '' for a missing context, and an identity test of that value against
    # False is always False, which dropped the default context from the
    # Kubernetes listing.
    ctx_is_ssh = ctx is not None and ctx.startswith('ssh-')
    return ctx_is_ssh == is_ssh
|
3561
|
+
|
3562
|
+
num_filtered_contexts = 0
|
3563
|
+
|
3516
3564
|
if realtime_gpu_availability_lists:
|
3517
3565
|
if len(realtime_gpu_availability_lists[0]) != 2:
|
3518
3566
|
# TODO(kyuds): for backwards compatibility, as we add new
|
@@ -3522,6 +3570,13 @@ def show_gpus(
|
|
3522
3570
|
(context, realtime_gpu_availability_lists)
|
3523
3571
|
]
|
3524
3572
|
for (ctx, availability_list) in realtime_gpu_availability_lists:
|
3573
|
+
if not _filter_ctx(ctx):
|
3574
|
+
continue
|
3575
|
+
if is_ssh:
|
3576
|
+
display_ctx = ctx.lstrip('ssh-')
|
3577
|
+
else:
|
3578
|
+
display_ctx = ctx
|
3579
|
+
num_filtered_contexts += 1
|
3525
3580
|
realtime_gpu_table = log_utils.create_table(
|
3526
3581
|
['GPU', qty_header, 'UTILIZATION'])
|
3527
3582
|
for realtime_gpu_availability in sorted(availability_list):
|
@@ -3542,15 +3597,12 @@ def show_gpus(
|
|
3542
3597
|
if capacity > 0:
|
3543
3598
|
total_gpu_info[gpu][0] += capacity
|
3544
3599
|
total_gpu_info[gpu][1] += available
|
3545
|
-
realtime_gpu_infos.append((
|
3600
|
+
realtime_gpu_infos.append((display_ctx, realtime_gpu_table))
|
3546
3601
|
# Collect node info for this context
|
3547
3602
|
nodes_info = sdk.stream_and_get(
|
3548
3603
|
sdk.kubernetes_node_info(context=ctx))
|
3549
|
-
all_nodes_info.append((
|
3550
|
-
|
3551
|
-
# display an aggregated table for all contexts
|
3552
|
-
# if there are more than one contexts with GPUs
|
3553
|
-
if len(realtime_gpu_infos) > 1:
|
3604
|
+
all_nodes_info.append((display_ctx, nodes_info))
|
3605
|
+
if num_filtered_contexts > 1:
|
3554
3606
|
total_realtime_gpu_table = log_utils.create_table(
|
3555
3607
|
['GPU', 'UTILIZATION'])
|
3556
3608
|
for gpu, stats in total_gpu_info.items():
|
@@ -3562,10 +3614,11 @@ def show_gpus(
|
|
3562
3614
|
return realtime_gpu_infos, total_realtime_gpu_table, all_nodes_info
|
3563
3615
|
|
3564
3616
|
def _format_kubernetes_node_info_combined(
|
3565
|
-
contexts_info: List[Tuple[str,
|
3566
|
-
|
3617
|
+
contexts_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
|
3618
|
+
cloud_str: str = 'Kubernetes',
|
3619
|
+
context_title_str: str = 'CONTEXT') -> str:
|
3567
3620
|
node_table = log_utils.create_table(
|
3568
|
-
[
|
3621
|
+
[context_title_str, 'NODE', 'GPU', 'UTILIZATION'])
|
3569
3622
|
|
3570
3623
|
no_permissions_str = '<no permissions>'
|
3571
3624
|
hints = []
|
@@ -3588,7 +3641,7 @@ def show_gpus(
|
|
3588
3641
|
'free'
|
3589
3642
|
])
|
3590
3643
|
|
3591
|
-
k8s_per_node_acc_message = ('
|
3644
|
+
k8s_per_node_acc_message = (f'{cloud_str} per-node GPU availability')
|
3592
3645
|
if hints:
|
3593
3646
|
k8s_per_node_acc_message += ' (' + '; '.join(hints) + ')'
|
3594
3647
|
|
@@ -3598,26 +3651,30 @@ def show_gpus(
|
|
3598
3651
|
f'{node_table.get_string()}')
|
3599
3652
|
|
3600
3653
|
def _format_kubernetes_realtime_gpu(
|
3601
|
-
total_table: 'prettytable.PrettyTable',
|
3654
|
+
total_table: Optional['prettytable.PrettyTable'],
|
3602
3655
|
k8s_realtime_infos: List[Tuple[str, 'prettytable.PrettyTable']],
|
3603
3656
|
all_nodes_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
|
3604
|
-
show_node_info: bool) -> Generator[str, None, None]:
|
3657
|
+
show_node_info: bool, is_ssh: bool) -> Generator[str, None, None]:
|
3658
|
+
identity = 'SSH Node Pool' if is_ssh else 'Kubernetes'
|
3605
3659
|
yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
|
3606
|
-
'
|
3660
|
+
f'{identity} GPUs'
|
3607
3661
|
f'{colorama.Style.RESET_ALL}')
|
3608
3662
|
# print total table
|
3609
3663
|
if total_table is not None:
|
3610
3664
|
yield '\n'
|
3611
3665
|
yield from total_table.get_string()
|
3612
3666
|
|
3667
|
+
ctx_name = 'SSH Node Pool' if is_ssh else 'Context'
|
3668
|
+
ctx_column_title = 'NODE_POOL' if is_ssh else 'CONTEXT'
|
3669
|
+
|
3613
3670
|
# print individual infos.
|
3614
3671
|
for (ctx, k8s_realtime_table) in k8s_realtime_infos:
|
3615
3672
|
yield '\n'
|
3616
3673
|
# Print context header separately
|
3617
3674
|
if ctx:
|
3618
|
-
context_str = f'
|
3675
|
+
context_str = f'{ctx_name}: {ctx}'
|
3619
3676
|
else:
|
3620
|
-
context_str = 'Default
|
3677
|
+
context_str = f'Default {ctx_name}'
|
3621
3678
|
yield (
|
3622
3679
|
f'{colorama.Fore.CYAN}{context_str}{colorama.Style.RESET_ALL}\n'
|
3623
3680
|
)
|
@@ -3625,7 +3682,102 @@ def show_gpus(
|
|
3625
3682
|
|
3626
3683
|
if show_node_info:
|
3627
3684
|
yield '\n'
|
3628
|
-
yield _format_kubernetes_node_info_combined(all_nodes_info
|
3685
|
+
yield _format_kubernetes_node_info_combined(all_nodes_info,
|
3686
|
+
identity,
|
3687
|
+
ctx_column_title)
|
3688
|
+
|
3689
|
+
def _possibly_show_k8s_like_realtime(
        is_ssh: bool = False
) -> Generator[str, None, Tuple[bool, bool, str]]:
    """Yield the real-time GPU section for Kubernetes or SSH Node Pools.

    The selection flags (query_*_realtime_gpu, cloud_is_*, *_is_enabled),
    ``region`` and ``kubernetes_autoscaling`` are read from the enclosing
    scope.

    Args:
        is_ssh: If True, handle SSH Node Pools; otherwise handle Kubernetes.

    Yields:
        Text fragments to print.

    Returns:
        A tuple ``(stop, print_section_titles, k8s_messages)``. ``stop`` is
        True when the requested cloud is this infra kind (cloud_is_ssh /
        cloud_is_kubernetes); the collected messages have then already been
        yielded and the returned message is ''. Otherwise the collected
        messages are handed back to the caller unprinted.
    """
    k8s_messages = ''
    print_section_titles = False
    # Gate each pass on its own flag. The previous condition
    # `is_ssh and query_ssh_realtime_gpu or query_k8s_realtime_gpu` parsed as
    # `(is_ssh and query_ssh_realtime_gpu) or query_k8s_realtime_gpu`, so the
    # SSH pass also queried SSH Node Pools whenever only the Kubernetes query
    # was enabled.
    should_query = (query_ssh_realtime_gpu
                    if is_ssh else query_k8s_realtime_gpu)
    if should_query:
        context = region

        try:
            # If --cloud kubernetes is not specified, we want to catch
            # the case where no GPUs are available on the cluster and
            # print the warning at the end.
            k8s_realtime_infos, total_table, all_nodes_info = (
                _get_kubernetes_realtime_gpu_tables(context, is_ssh=is_ssh))
        except ValueError as e:
            if not (cloud_is_kubernetes or cloud_is_ssh):
                # Make it a note if cloud is not kubernetes
                k8s_messages += 'Note: '
            k8s_messages += str(e)
        else:
            print_section_titles = True

            yield from _format_kubernetes_realtime_gpu(total_table,
                                                       k8s_realtime_infos,
                                                       all_nodes_info,
                                                       show_node_info=True,
                                                       is_ssh=is_ssh)

        if kubernetes_autoscaling:
            k8s_messages += ('\n' +
                             kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
    if is_ssh:
        if cloud_is_ssh:
            if not ssh_is_enabled:
                yield ('SSH Node Pools are not enabled. To fix, run: '
                       'sky check ssh ')
            yield k8s_messages
            return True, print_section_titles, ''
    else:
        if cloud_is_kubernetes:
            if not kubernetes_is_enabled:
                yield ('Kubernetes is not enabled. To fix, run: '
                       'sky check kubernetes ')
            yield k8s_messages
            return True, print_section_titles, ''
    return False, print_section_titles, k8s_messages
|
3736
|
+
|
3737
|
+
def _possibly_show_k8s_like_realtime_for_acc(
        name: Optional[str],
        quantity: Optional[int],
        is_ssh: bool = False) -> Generator[str, None, Tuple[bool, bool]]:
    """Yield the real-time GPU section for one requested accelerator.

    The selection flags (query_*_realtime_gpu, cloud_is_*, *_is_enabled),
    ``region``, ``show_all`` and ``kubernetes_autoscaling`` are read from the
    enclosing scope.

    Args:
        name: Accelerator name used as the name filter.
        quantity: Requested accelerator count used as the quantity filter.
        is_ssh: If True, handle SSH Node Pools; otherwise handle Kubernetes.

    Yields:
        Text fragments to print, including the formatted error when the
        lookup raises ValueError.

    Returns:
        A tuple ``(stop, print_section_titles)``. ``stop`` is True when the
        requested cloud is this infra kind (cloud_is_ssh /
        cloud_is_kubernetes).
    """
    k8s_messages = ''
    print_section_titles = False
    # Gate each pass on its own flag. The previous condition
    # `(is_ssh and query_ssh_realtime_gpu or query_k8s_realtime_gpu)` parsed
    # as `(is_ssh and query_ssh_realtime_gpu) or query_k8s_realtime_gpu`, so
    # the SSH pass also ran whenever only the Kubernetes query was enabled.
    should_query = (query_ssh_realtime_gpu
                    if is_ssh else query_k8s_realtime_gpu)
    if should_query and not show_all:
        print_section_titles = True
        # TODO(romilb): Show filtered per node GPU availability here as well
        try:
            (k8s_realtime_infos, total_table,
             all_nodes_info) = _get_kubernetes_realtime_gpu_tables(
                 context=region,
                 name_filter=name,
                 quantity_filter=quantity,
                 is_ssh=is_ssh)

            yield from _format_kubernetes_realtime_gpu(total_table,
                                                       k8s_realtime_infos,
                                                       all_nodes_info,
                                                       show_node_info=False,
                                                       is_ssh=is_ssh)
        except ValueError as e:
            # In the case of a specific accelerator, show the error message
            # immediately (e.g., "Resources H100 not found ...")
            yield common_utils.format_exception(e, use_bracket=True)
        if kubernetes_autoscaling:
            k8s_messages += ('\n' +
                             kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
        yield k8s_messages
    if is_ssh:
        if cloud_is_ssh:
            if not ssh_is_enabled:
                yield ('SSH Node Pools are not enabled. To fix, run: '
                       'sky check ssh ')
            return True, print_section_titles
    else:
        if cloud_is_kubernetes:
            if not kubernetes_is_enabled:
                yield ('Kubernetes is not enabled. To fix, run: '
                       'sky check kubernetes ')
            return True, print_section_titles
    return False, print_section_titles
|
3629
3781
|
|
3630
3782
|
def _output() -> Generator[str, None, None]:
|
3631
3783
|
gpu_table = log_utils.create_table(
|
@@ -3643,46 +3795,28 @@ def show_gpus(
|
|
3643
3795
|
clouds_to_list: Union[Optional[str], List[str]] = cloud_name
|
3644
3796
|
if cloud_name is None:
|
3645
3797
|
clouds_to_list = [
|
3646
|
-
c for c in service_catalog.ALL_CLOUDS
|
3798
|
+
c for c in service_catalog.ALL_CLOUDS
|
3799
|
+
if c != 'kubernetes' and c != 'ssh'
|
3647
3800
|
]
|
3648
3801
|
|
3649
3802
|
k8s_messages = ''
|
3650
3803
|
if accelerator_str is None:
|
3651
3804
|
# Collect k8s related messages in k8s_messages and print them at end
|
3652
3805
|
print_section_titles = False
|
3653
|
-
|
3654
|
-
|
3655
|
-
|
3656
|
-
|
3657
|
-
|
3658
|
-
|
3659
|
-
|
3660
|
-
|
3661
|
-
|
3662
|
-
|
3663
|
-
|
3664
|
-
|
3665
|
-
|
3666
|
-
|
3667
|
-
k8s_messages += str(e)
|
3668
|
-
else:
|
3669
|
-
print_section_titles = True
|
3670
|
-
|
3671
|
-
yield from _format_kubernetes_realtime_gpu(
|
3672
|
-
total_table,
|
3673
|
-
k8s_realtime_infos,
|
3674
|
-
all_nodes_info,
|
3675
|
-
show_node_info=True)
|
3676
|
-
|
3677
|
-
if kubernetes_autoscaling:
|
3678
|
-
k8s_messages += (
|
3679
|
-
'\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
|
3680
|
-
if cloud_is_kubernetes:
|
3681
|
-
# Do not show clouds if --cloud kubernetes is specified
|
3682
|
-
if not kubernetes_is_enabled:
|
3683
|
-
yield ('Kubernetes is not enabled. To fix, run: '
|
3684
|
-
'sky check kubernetes ')
|
3685
|
-
yield k8s_messages
|
3806
|
+
stop_iter = False
|
3807
|
+
k8s_messages = ''
|
3808
|
+
prev_print_section_titles = False
|
3809
|
+
for is_ssh in [False, True]:
|
3810
|
+
if prev_print_section_titles:
|
3811
|
+
yield '\n\n'
|
3812
|
+
stop_iter_one, print_section_titles_one, k8s_messages_one = (
|
3813
|
+
yield from _possibly_show_k8s_like_realtime(is_ssh))
|
3814
|
+
stop_iter = stop_iter or stop_iter_one
|
3815
|
+
print_section_titles = (print_section_titles or
|
3816
|
+
print_section_titles_one)
|
3817
|
+
k8s_messages += k8s_messages_one
|
3818
|
+
prev_print_section_titles = print_section_titles_one
|
3819
|
+
if stop_iter:
|
3686
3820
|
return
|
3687
3821
|
|
3688
3822
|
# For show_all, show the k8s message at the start since output is
|
@@ -3757,34 +3891,19 @@ def show_gpus(
|
|
3757
3891
|
name, quantity = accelerator_str, None
|
3758
3892
|
|
3759
3893
|
print_section_titles = False
|
3760
|
-
|
3761
|
-
|
3762
|
-
|
3763
|
-
|
3764
|
-
|
3765
|
-
|
3766
|
-
|
3767
|
-
|
3768
|
-
|
3769
|
-
|
3770
|
-
|
3771
|
-
|
3772
|
-
|
3773
|
-
all_nodes_info,
|
3774
|
-
show_node_info=False)
|
3775
|
-
except ValueError as e:
|
3776
|
-
# In the case of a specific accelerator, show the error message
|
3777
|
-
# immediately (e.g., "Resources H100 not found ...")
|
3778
|
-
yield common_utils.format_exception(e, use_bracket=True)
|
3779
|
-
if kubernetes_autoscaling:
|
3780
|
-
k8s_messages += ('\n' +
|
3781
|
-
kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
|
3782
|
-
yield k8s_messages
|
3783
|
-
if cloud_is_kubernetes:
|
3784
|
-
# Do not show clouds if --cloud kubernetes is specified
|
3785
|
-
if not kubernetes_is_enabled:
|
3786
|
-
yield ('Kubernetes is not enabled. To fix, run: '
|
3787
|
-
'sky check kubernetes ')
|
3894
|
+
stop_iter = False
|
3895
|
+
prev_print_section_titles = False
|
3896
|
+
for is_ssh in [False, True]:
|
3897
|
+
if prev_print_section_titles:
|
3898
|
+
yield '\n\n'
|
3899
|
+
stop_iter_one, print_section_titles_one = (
|
3900
|
+
yield from _possibly_show_k8s_like_realtime_for_acc(
|
3901
|
+
name, quantity, is_ssh))
|
3902
|
+
stop_iter = stop_iter or stop_iter_one
|
3903
|
+
print_section_titles = (print_section_titles or
|
3904
|
+
print_section_titles_one)
|
3905
|
+
prev_print_section_titles = print_section_titles_one
|
3906
|
+
if stop_iter:
|
3788
3907
|
return
|
3789
3908
|
|
3790
3909
|
# For clouds other than Kubernetes, get the accelerator details
|
@@ -6074,6 +6193,58 @@ def api_info():
|
|
6074
6193
|
f'{ux_utils.INDENT_LAST_SYMBOL}Dashboard: {dashboard_url}')
|
6075
6194
|
|
6076
6195
|
|
6196
|
+
@cli.group(cls=_NaturalOrderGroup)
def ssh():
    """Commands for managing SSH Node Pools."""
    # Intentionally empty: this function only anchors the `ssh` command
    # group; the docstring above is the body and doubles as the help text.
|
6200
|
+
|
6201
|
+
|
6202
|
+
@ssh.command('up', cls=_DocumentedCodeCommand)
@click.option(
    '--infra',
    help='Name of the cluster to set up in ~/.sky/ssh_node_pools.yaml. '
    'If not specified, all clusters in the file will be set up.')
@click.option('--async',
              'async_call',
              is_flag=True,
              hidden=True,
              help='Run the command asynchronously.')
def ssh_up(infra: Optional[str], async_call: bool):
    """Set up a cluster using SSH targets from ~/.sky/ssh_node_pools.yaml.

    This command sets up a Kubernetes cluster on the machines specified in
    ~/.sky/ssh_node_pools.yaml and configures SkyPilot to use it.
    """
    # The docstring above is the user-facing help text; keep it in sync with
    # the option help strings.
    request_id = sdk.ssh_up(infra=infra)
    if async_call:
        # With --async, only report the request ID; do not wait on the
        # request. click.echo matches how the rest of this CLI writes output.
        click.echo(f'Request submitted with ID: {request_id}')
    else:
        # Otherwise stream the request's output until it finishes.
        sdk.stream_and_get(request_id)
|
6223
|
+
|
6224
|
+
|
6225
|
+
@ssh.command('down', cls=_DocumentedCodeCommand)
@click.option(
    '--infra',
    help='Name of the cluster to clean up in ~/.sky/ssh_node_pools.yaml. '
    'If not specified, all clusters in the file will be cleaned up.')
@click.option('--async',
              'async_call',
              is_flag=True,
              hidden=True,
              help='Run the command asynchronously.')
def ssh_down(infra: Optional[str], async_call: bool):
    """Clean up a cluster set up with 'sky ssh up'.

    This command removes the Kubernetes installation from the machines specified
    in ~/.sky/ssh_node_pools.yaml.
    """
    # The docstring above is the user-facing help text; keep it in sync with
    # the option help strings.
    request_id = sdk.ssh_down(infra=infra)
    if async_call:
        # With --async, only report the request ID; do not wait on the
        # request. click.echo matches how the rest of this CLI writes output.
        click.echo(f'Request submitted with ID: {request_id}')
    else:
        # Otherwise stream the request's output until it finishes.
        sdk.stream_and_get(request_id)
|
6246
|
+
|
6247
|
+
|
6077
6248
|
def main():
|
6078
6249
|
return cli()
|
6079
6250
|
|
sky/client/sdk.py
CHANGED
@@ -225,7 +225,7 @@ def list_accelerator_counts(
|
|
225
225
|
accelerator names mapped to a list of available counts. See usage
|
226
226
|
in cli.py.
|
227
227
|
"""
|
228
|
-
body = payloads.
|
228
|
+
body = payloads.ListAcceleratorCountsBody(
|
229
229
|
gpus_only=gpus_only,
|
230
230
|
name_filter=name_filter,
|
231
231
|
region_filter=region_filter,
|
@@ -1396,13 +1396,60 @@ def local_down() -> server_common.RequestId:
|
|
1396
1396
|
return server_common.get_request_id(response)
|
1397
1397
|
|
1398
1398
|
|
1399
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def ssh_up(infra: Optional[str] = None) -> server_common.RequestId:
    """Deploys the SSH Node Pools defined in ~/.sky/ssh_node_pools.yaml.

    Args:
        infra: Name of the cluster configuration in ssh_node_pools.yaml.
            If None, all clusters in the file are set up (this matches the
            help text of the `sky ssh up --infra` option, which forwards
            its value here).

    Returns:
        request_id: The request ID of the SSH cluster deployment request.
    """
    body = payloads.SSHUpBody(
        infra=infra,
        # cleanup=False selects deployment; ssh_down sends the same body
        # type with cleanup=True.
        cleanup=False,
    )
    response = requests.post(f'{server_common.get_server_url()}/ssh_up',
                             json=json.loads(body.model_dump_json()),
                             cookies=server_common.get_api_cookie_jar())
    return server_common.get_request_id(response)
|
1420
|
+
|
1421
|
+
|
1422
|
+
@usage_lib.entrypoint
@server_common.check_server_healthy_or_start
@annotations.client_api
def ssh_down(infra: Optional[str] = None) -> server_common.RequestId:
    """Tears down a Kubernetes cluster on SSH targets.

    Args:
        infra: Name of the cluster configuration in ssh_node_pools.yaml.
            If None, all clusters in the file are cleaned up (this matches
            the help text of the `sky ssh down --infra` option, which
            forwards its value here).

    Returns:
        request_id: The request ID of the SSH cluster teardown request.
    """
    # Teardown reuses the SSHUpBody payload; cleanup=True marks the request
    # as a teardown.
    body = payloads.SSHUpBody(
        infra=infra,
        cleanup=True,
    )
    response = requests.post(f'{server_common.get_server_url()}/ssh_down',
                             json=json.loads(body.model_dump_json()),
                             cookies=server_common.get_api_cookie_jar())
    return server_common.get_request_id(response)
|
1443
|
+
|
1444
|
+
|
1399
1445
|
@usage_lib.entrypoint
|
1400
1446
|
@server_common.check_server_healthy_or_start
|
1401
1447
|
@annotations.client_api
|
1402
1448
|
def realtime_kubernetes_gpu_availability(
|
1403
1449
|
context: Optional[str] = None,
|
1404
1450
|
name_filter: Optional[str] = None,
|
1405
|
-
quantity_filter: Optional[int] = None
|
1451
|
+
quantity_filter: Optional[int] = None,
|
1452
|
+
is_ssh: Optional[bool] = None) -> server_common.RequestId:
|
1406
1453
|
"""Gets the real-time Kubernetes GPU availability.
|
1407
1454
|
|
1408
1455
|
Returns:
|
@@ -1412,6 +1459,7 @@ def realtime_kubernetes_gpu_availability(
|
|
1412
1459
|
context=context,
|
1413
1460
|
name_filter=name_filter,
|
1414
1461
|
quantity_filter=quantity_filter,
|
1462
|
+
is_ssh=is_ssh,
|
1415
1463
|
)
|
1416
1464
|
response = requests.post(
|
1417
1465
|
f'{server_common.get_server_url()}/'
|
sky/clouds/__init__.py
CHANGED
@@ -2,6 +2,7 @@
|
|
2
2
|
|
3
3
|
from sky.clouds.cloud import Cloud
|
4
4
|
from sky.clouds.cloud import cloud_in_iterable
|
5
|
+
from sky.clouds.cloud import CloudCapability
|
5
6
|
from sky.clouds.cloud import CloudImplementationFeatures
|
6
7
|
from sky.clouds.cloud import DummyCloud
|
7
8
|
from sky.clouds.cloud import OpenPortsVersion
|
@@ -26,6 +27,7 @@ from sky.clouds.oci import OCI
|
|
26
27
|
from sky.clouds.paperspace import Paperspace
|
27
28
|
from sky.clouds.runpod import RunPod
|
28
29
|
from sky.clouds.scp import SCP
|
30
|
+
from sky.clouds.ssh import SSH
|
29
31
|
from sky.clouds.vast import Vast
|
30
32
|
from sky.clouds.vsphere import Vsphere
|
31
33
|
|
@@ -46,6 +48,7 @@ __all__ = [
|
|
46
48
|
'OCI',
|
47
49
|
'Vsphere',
|
48
50
|
'Kubernetes',
|
51
|
+
'SSH',
|
49
52
|
'CloudImplementationFeatures',
|
50
53
|
'Region',
|
51
54
|
'Zone',
|
sky/clouds/aws.py
CHANGED
@@ -565,12 +565,14 @@ class AWS(clouds.Cloud):
|
|
565
565
|
fuzzy_candidate_list, None)
|
566
566
|
|
567
567
|
@classmethod
|
568
|
-
def _check_compute_credentials(
|
568
|
+
def _check_compute_credentials(
|
569
|
+
cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
569
570
|
"""Checks if the user has access credentials to this AWS's compute service."""
|
570
571
|
return cls._check_credentials()
|
571
572
|
|
572
573
|
@classmethod
|
573
|
-
def _check_storage_credentials(
|
574
|
+
def _check_storage_credentials(
|
575
|
+
cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
574
576
|
"""Checks if the user has access credentials to this AWS's storage service."""
|
575
577
|
# TODO(seungjin): Implement separate check for
|
576
578
|
# if the user has access to S3.
|
sky/clouds/azure.py
CHANGED
@@ -518,12 +518,14 @@ class Azure(clouds.Cloud):
|
|
518
518
|
fuzzy_candidate_list, None)
|
519
519
|
|
520
520
|
@classmethod
|
521
|
-
def _check_compute_credentials(
|
521
|
+
def _check_compute_credentials(
|
522
|
+
cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
522
523
|
"""Checks if the user has access credentials to this cloud's compute service."""
|
523
524
|
return cls._check_credentials()
|
524
525
|
|
525
526
|
@classmethod
|
526
|
-
def _check_storage_credentials(
|
527
|
+
def _check_storage_credentials(
|
528
|
+
cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
527
529
|
"""Checks if the user has access credentials to this cloud's storage service."""
|
528
530
|
# TODO(seungjin): Implement separate check for
|
529
531
|
# if the user has access to Azure Blob Storage.
|