skypilot-nightly 1.0.0.dev20250522__py3-none-any.whl → 1.0.0.dev20250524__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +46 -16
- sky/backends/backend_utils.py +62 -45
- sky/backends/cloud_vm_ray_backend.py +19 -5
- sky/check.py +398 -171
- sky/cli.py +302 -98
- sky/client/cli.py +302 -98
- sky/client/sdk.py +104 -12
- sky/clouds/__init__.py +3 -0
- sky/clouds/aws.py +4 -2
- sky/clouds/azure.py +4 -2
- sky/clouds/cloud.py +24 -6
- sky/clouds/cudo.py +2 -1
- sky/clouds/do.py +2 -1
- sky/clouds/fluidstack.py +2 -1
- sky/clouds/gcp.py +23 -5
- sky/clouds/ibm.py +4 -2
- sky/clouds/kubernetes.py +66 -22
- sky/clouds/lambda_cloud.py +2 -1
- sky/clouds/nebius.py +18 -2
- sky/clouds/oci.py +4 -2
- sky/clouds/paperspace.py +2 -1
- sky/clouds/runpod.py +2 -1
- sky/clouds/scp.py +2 -1
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/ssh_catalog.py +167 -0
- sky/clouds/ssh.py +203 -0
- sky/clouds/vast.py +2 -1
- sky/clouds/vsphere.py +2 -1
- sky/core.py +58 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/aHej19bZyl4hoHgrzPCn7/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/480-ee58038f1a4afd5c.js +1 -0
- sky/dashboard/out/_next/static/chunks/488-50d843fdb5396d32.js +15 -0
- sky/dashboard/out/_next/static/chunks/498-d7722313e5e5b4e6.js +21 -0
- sky/dashboard/out/_next/static/chunks/573-f17bd89d9f9118b3.js +66 -0
- sky/dashboard/out/_next/static/chunks/578-7a4795009a56430c.js +6 -0
- sky/dashboard/out/_next/static/chunks/734-5f5ce8f347b7f417.js +1 -0
- sky/dashboard/out/_next/static/chunks/937.f97f83652028e944.js +1 -0
- sky/dashboard/out/_next/static/chunks/938-f347f6144075b0c8.js +1 -0
- sky/dashboard/out/_next/static/chunks/9f96d65d-5a3e4af68c26849e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-dec800f9ef1b10f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-37c042a356f8e608.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-9529d9e882a0e75c.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e6d1ec6e1ac5b29.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-e690d864aa00e2ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-db6558a5ec687011.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-73d5e0c369d00346.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2d319455c3f1c3e2.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-02a7b60f2ead275f.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-deda68c926e8d0bc.js +1 -0
- sky/dashboard/out/_next/static/css/d2cdba64c9202dd7.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/storage.py +1 -1
- sky/global_user_state.py +42 -19
- sky/jobs/constants.py +1 -1
- sky/jobs/server/core.py +72 -56
- sky/jobs/state.py +26 -5
- sky/jobs/utils.py +65 -13
- sky/optimizer.py +29 -7
- sky/provision/__init__.py +1 -0
- sky/provision/aws/instance.py +17 -1
- sky/provision/fluidstack/instance.py +1 -0
- sky/provision/kubernetes/instance.py +16 -5
- sky/provision/kubernetes/utils.py +37 -19
- sky/provision/nebius/instance.py +3 -1
- sky/provision/nebius/utils.py +14 -2
- sky/provision/ssh/__init__.py +18 -0
- sky/resources.py +4 -1
- sky/serve/server/core.py +9 -6
- sky/server/html/token_page.html +6 -1
- sky/server/requests/executor.py +1 -0
- sky/server/requests/payloads.py +18 -0
- sky/server/server.py +108 -5
- sky/setup_files/dependencies.py +1 -0
- sky/skylet/constants.py +4 -1
- sky/skypilot_config.py +83 -9
- sky/templates/nebius-ray.yml.j2 +12 -0
- sky/utils/cli_utils/status_utils.py +18 -8
- sky/utils/infra_utils.py +21 -1
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/create_cluster.sh +1 -0
- sky/utils/kubernetes/deploy_remote_cluster.py +1440 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
- sky/utils/kubernetes/ssh-tunnel.sh +387 -0
- sky/utils/log_utils.py +218 -1
- sky/utils/schemas.py +75 -0
- sky/utils/ux_utils.py +2 -1
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/METADATA +6 -1
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/RECORD +103 -91
- sky/dashboard/out/_next/static/CzOVV6JpRQBRt5GhZuhyK/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-1a3a9440417720eb.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-d584022b0da4ac3b.js +0 -6
- sky/dashboard/out/_next/static/chunks/393-e1eaa440481337ec.js +0 -1
- sky/dashboard/out/_next/static/chunks/480-f28cd152a98997de.js +0 -1
- sky/dashboard/out/_next/static/chunks/582-683f4f27b81996dc.js +0 -59
- sky/dashboard/out/_next/static/chunks/pages/_app-8cfab319f9fb3ae8.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33bc2bec322249b1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-e2fc2dd1955e6c36.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-3a748bd76e5c2984.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-70756c2dad850a7e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-ecd804b9272f4a7c.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/7e7ce4ff31d3977b.css +0 -3
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- /sky/dashboard/out/_next/static/{CzOVV6JpRQBRt5GhZuhyK → aHej19bZyl4hoHgrzPCn7}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250522.dist-info → skypilot_nightly-1.0.0.dev20250524.dist-info}/top_level.txt +0 -0
sky/cli.py
CHANGED
@@ -212,6 +212,7 @@ def _get_glob_storages(storages: List[str]) -> List[str]:
|
|
212
212
|
"""Returns a list of storages that match the glob pattern."""
|
213
213
|
glob_storages = []
|
214
214
|
for storage_object in storages:
|
215
|
+
# TODO(zhwu): client side should not rely on global_user_state.
|
215
216
|
glob_storage = global_user_state.get_glob_storage_name(storage_object)
|
216
217
|
if not glob_storage:
|
217
218
|
click.echo(f'Storage {storage_object} not found.')
|
@@ -1780,6 +1781,31 @@ def _show_endpoint(query_clusters: Optional[List[str]],
|
|
1780
1781
|
return
|
1781
1782
|
|
1782
1783
|
|
1784
|
+
def _show_enabled_infra(active_workspace: str, show_workspace: bool):
    """Print the enabled infra as one comma-separated line.

    The line lists SSH infra first, then Kubernetes infra, then every other
    enabled cloud; each of the three groups is sorted on its own.

    Args:
        active_workspace: Name of the workspace shown in the title.
        show_workspace: Whether to append the workspace name to the title.
    """
    workspace_str = (f' (workspace: {active_workspace!r})'
                     if show_workspace else '')
    title = (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Enabled Infra'
             f'{workspace_str}:'
             f'{colorama.Style.RESET_ALL} ')

    ssh_infras = []
    k8s_infras = []
    other_infras = []
    for cloud in sdk.get(sdk.enabled_clouds()):
        # SSH is tested before Kubernetes on purpose; presumably SSH is a
        # Kubernetes subclass, so the order decides the bucket -- confirm.
        if isinstance(cloud, clouds.SSH):
            bucket = ssh_infras
        elif isinstance(cloud, clouds.Kubernetes):
            bucket = k8s_infras
        else:
            bucket = other_infras
        bucket.extend(cloud.get_infras())

    ordered_infras = [
        *sorted(ssh_infras),
        *sorted(k8s_infras),
        *sorted(other_infras),
    ]
    click.echo(f'{title}{", ".join(ordered_infras)}\n')
|
1807
|
+
|
1808
|
+
|
1783
1809
|
@cli.command()
|
1784
1810
|
@config_option(expose_value=False)
|
1785
1811
|
@click.option('--verbose',
|
@@ -1932,6 +1958,7 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1932
1958
|
# status query.
|
1933
1959
|
service_status_request_id = serve_lib.status(service_names=None)
|
1934
1960
|
|
1961
|
+
workspace_request_id = None
|
1935
1962
|
if ip or show_endpoints:
|
1936
1963
|
if refresh:
|
1937
1964
|
raise click.UsageError(
|
@@ -1966,8 +1993,8 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1966
1993
|
('endpoint port'
|
1967
1994
|
if show_single_endpoint else 'endpoints')))
|
1968
1995
|
else:
|
1969
|
-
|
1970
|
-
|
1996
|
+
workspace_request_id = sdk.workspaces()
|
1997
|
+
|
1971
1998
|
query_clusters: Optional[List[str]] = None if not clusters else clusters
|
1972
1999
|
refresh_mode = common.StatusRefreshMode.NONE
|
1973
2000
|
if refresh:
|
@@ -1990,9 +2017,20 @@ def status(verbose: bool, refresh: bool, ip: bool, endpoints: bool,
|
|
1990
2017
|
else:
|
1991
2018
|
normal_clusters.append(cluster_record)
|
1992
2019
|
|
2020
|
+
if workspace_request_id is not None:
|
2021
|
+
all_workspaces = sdk.get(workspace_request_id)
|
2022
|
+
else:
|
2023
|
+
all_workspaces = [constants.SKYPILOT_DEFAULT_WORKSPACE]
|
2024
|
+
active_workspace = skypilot_config.get_active_workspace()
|
2025
|
+
show_workspace = len(all_workspaces) > 1
|
2026
|
+
_show_enabled_infra(active_workspace, show_workspace)
|
2027
|
+
click.echo(f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}Clusters'
|
2028
|
+
f'{colorama.Style.RESET_ALL}')
|
2029
|
+
|
1993
2030
|
num_pending_autostop = 0
|
1994
2031
|
num_pending_autostop += status_utils.show_status_table(
|
1995
|
-
normal_clusters + controllers, verbose, all_users, query_clusters
|
2032
|
+
normal_clusters + controllers, verbose, all_users, query_clusters,
|
2033
|
+
show_workspace)
|
1996
2034
|
|
1997
2035
|
managed_jobs_query_interrupted = False
|
1998
2036
|
if show_managed_jobs:
|
@@ -3322,9 +3360,16 @@ def _down_or_stop_clusters(
|
|
3322
3360
|
is_flag=True,
|
3323
3361
|
default=False,
|
3324
3362
|
help='Show the activated account for each cloud.')
|
3363
|
+
@click.option(
|
3364
|
+
'--workspace',
|
3365
|
+
'-w',
|
3366
|
+
type=str,
|
3367
|
+
help='The workspace to check. If None, all workspaces will be checked.')
|
3325
3368
|
@usage_lib.entrypoint
|
3326
3369
|
# pylint: disable=redefined-outer-name
|
3327
|
-
def check(infra_list: Tuple[str],
|
3370
|
+
def check(infra_list: Tuple[str],
|
3371
|
+
verbose: bool,
|
3372
|
+
workspace: Optional[str] = None):
|
3328
3373
|
"""Check which clouds are available to use.
|
3329
3374
|
|
3330
3375
|
This checks access credentials for all clouds supported by SkyPilot. If a
|
@@ -3347,7 +3392,9 @@ def check(infra_list: Tuple[str], verbose: bool):
|
|
3347
3392
|
sky check aws gcp
|
3348
3393
|
"""
|
3349
3394
|
infra_arg = infra_list if len(infra_list) > 0 else None
|
3350
|
-
request_id = sdk.check(infra_list=infra_arg,
|
3395
|
+
request_id = sdk.check(infra_list=infra_arg,
|
3396
|
+
verbose=verbose,
|
3397
|
+
workspace=workspace)
|
3351
3398
|
sdk.stream_and_get(request_id)
|
3352
3399
|
api_server_url = server_common.get_server_url()
|
3353
3400
|
click.echo()
|
@@ -3462,13 +3509,22 @@ def show_gpus(
|
|
3462
3509
|
|
3463
3510
|
# Kubernetes specific bools
|
3464
3511
|
enabled_clouds = sdk.get(sdk.enabled_clouds())
|
3465
|
-
cloud_is_kubernetes = isinstance(
|
3512
|
+
cloud_is_kubernetes = isinstance(
|
3513
|
+
cloud_obj, clouds.Kubernetes) and not isinstance(cloud_obj, clouds.SSH)
|
3514
|
+
cloud_is_ssh = isinstance(cloud_obj, clouds.SSH)
|
3466
3515
|
# TODO(romilb): We should move this to the backend.
|
3467
3516
|
kubernetes_autoscaling = kubernetes_utils.get_autoscaler_type() is not None
|
3468
|
-
kubernetes_is_enabled =
|
3469
|
-
|
3470
|
-
|
3471
|
-
|
3517
|
+
kubernetes_is_enabled = False
|
3518
|
+
ssh_is_enabled = False
|
3519
|
+
for cloud in enabled_clouds:
|
3520
|
+
if isinstance(cloud, clouds.SSH):
|
3521
|
+
ssh_is_enabled = True
|
3522
|
+
elif isinstance(cloud, clouds.Kubernetes):
|
3523
|
+
kubernetes_is_enabled = True
|
3524
|
+
query_k8s_realtime_gpu = (kubernetes_is_enabled and
|
3525
|
+
(cloud_name is None or cloud_is_kubernetes))
|
3526
|
+
query_ssh_realtime_gpu = (ssh_is_enabled and
|
3527
|
+
(cloud_name is None or cloud_is_ssh))
|
3472
3528
|
|
3473
3529
|
def _list_to_str(lst):
|
3474
3530
|
return ', '.join([str(e) for e in lst])
|
@@ -3478,7 +3534,8 @@ def show_gpus(
|
|
3478
3534
|
def _get_kubernetes_realtime_gpu_tables(
|
3479
3535
|
context: Optional[str] = None,
|
3480
3536
|
name_filter: Optional[str] = None,
|
3481
|
-
quantity_filter: Optional[int] = None
|
3537
|
+
quantity_filter: Optional[int] = None,
|
3538
|
+
is_ssh: bool = False,
|
3482
3539
|
) -> Tuple[List[Tuple[str, 'prettytable.PrettyTable']],
|
3483
3540
|
Optional['prettytable.PrettyTable'], List[Tuple[
|
3484
3541
|
str, 'models.KubernetesNodesInfo']]]:
|
@@ -3491,19 +3548,26 @@ def show_gpus(
|
|
3491
3548
|
sdk.realtime_kubernetes_gpu_availability(
|
3492
3549
|
context=context,
|
3493
3550
|
name_filter=name_filter,
|
3494
|
-
quantity_filter=quantity_filter
|
3551
|
+
quantity_filter=quantity_filter,
|
3552
|
+
is_ssh=is_ssh))
|
3495
3553
|
if not realtime_gpu_availability_lists:
|
3496
|
-
|
3497
|
-
|
3554
|
+
# Customize message based on context
|
3555
|
+
identity = ('SSH Node Pool'
|
3556
|
+
if is_ssh else 'any allowed Kubernetes cluster')
|
3557
|
+
cloud_name = 'ssh' if is_ssh else 'kubernetes'
|
3558
|
+
err_msg = f'No GPUs found in {identity}. '
|
3559
|
+
debug_msg = (f'To further debug, run: sky check {cloud_name}')
|
3498
3560
|
if name_filter is not None:
|
3499
3561
|
gpu_info_msg = f' {name_filter!r}'
|
3500
3562
|
if quantity_filter is not None:
|
3501
3563
|
gpu_info_msg += (' with requested quantity'
|
3502
3564
|
f' {quantity_filter}')
|
3503
3565
|
err_msg = (f'Resources{gpu_info_msg} not found '
|
3504
|
-
'in
|
3505
|
-
|
3506
|
-
|
3566
|
+
f'in {identity}. ')
|
3567
|
+
identity_short = 'SSH Node Pool' if is_ssh else 'Kubernetes'
|
3568
|
+
debug_msg = (
|
3569
|
+
f'To show available accelerators in {identity_short}, '
|
3570
|
+
f'run: sky show-gpus --cloud {cloud_name}')
|
3507
3571
|
full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
|
3508
3572
|
debug_msg)
|
3509
3573
|
raise ValueError(full_err_msg)
|
@@ -3513,6 +3577,14 @@ def show_gpus(
|
|
3513
3577
|
lambda: [0, 0])
|
3514
3578
|
all_nodes_info = []
|
3515
3579
|
|
3580
|
+
# display an aggregated table for all contexts
|
3581
|
+
# if more than one context has GPUs.
|
3582
|
+
def _filter_ctx(ctx: Optional[str]) -> bool:
    """Return True if `ctx` belongs to the infra kind being listed.

    A context counts as SSH when its name starts with 'ssh-'. A missing or
    empty context is treated as non-SSH, so it is kept when `is_ssh` is
    False and dropped when `is_ssh` is True.

    Args:
        ctx: Context name, or None/'' for the default context.

    Returns:
        Whether the context's SSH-ness equals the enclosing `is_ssh` flag.
    """
    # Force a real bool: `ctx and ctx.startswith(...)` evaluates to None or
    # '' for a missing context, and an identity test against False then
    # fails, wrongly filtering the default context out.
    ctx_is_ssh = ctx is not None and ctx.startswith('ssh-')
    return ctx_is_ssh == is_ssh
|
3585
|
+
|
3586
|
+
num_filtered_contexts = 0
|
3587
|
+
|
3516
3588
|
if realtime_gpu_availability_lists:
|
3517
3589
|
if len(realtime_gpu_availability_lists[0]) != 2:
|
3518
3590
|
# TODO(kyuds): for backwards compatibility, as we add new
|
@@ -3522,6 +3594,13 @@ def show_gpus(
|
|
3522
3594
|
(context, realtime_gpu_availability_lists)
|
3523
3595
|
]
|
3524
3596
|
for (ctx, availability_list) in realtime_gpu_availability_lists:
|
3597
|
+
if not _filter_ctx(ctx):
|
3598
|
+
continue
|
3599
|
+
if is_ssh:
|
3600
|
+
display_ctx = ctx.lstrip('ssh-')
|
3601
|
+
else:
|
3602
|
+
display_ctx = ctx
|
3603
|
+
num_filtered_contexts += 1
|
3525
3604
|
realtime_gpu_table = log_utils.create_table(
|
3526
3605
|
['GPU', qty_header, 'UTILIZATION'])
|
3527
3606
|
for realtime_gpu_availability in sorted(availability_list):
|
@@ -3542,15 +3621,12 @@ def show_gpus(
|
|
3542
3621
|
if capacity > 0:
|
3543
3622
|
total_gpu_info[gpu][0] += capacity
|
3544
3623
|
total_gpu_info[gpu][1] += available
|
3545
|
-
realtime_gpu_infos.append((
|
3624
|
+
realtime_gpu_infos.append((display_ctx, realtime_gpu_table))
|
3546
3625
|
# Collect node info for this context
|
3547
3626
|
nodes_info = sdk.stream_and_get(
|
3548
3627
|
sdk.kubernetes_node_info(context=ctx))
|
3549
|
-
all_nodes_info.append((
|
3550
|
-
|
3551
|
-
# display an aggregated table for all contexts
|
3552
|
-
# if there are more than one contexts with GPUs
|
3553
|
-
if len(realtime_gpu_infos) > 1:
|
3628
|
+
all_nodes_info.append((display_ctx, nodes_info))
|
3629
|
+
if num_filtered_contexts > 1:
|
3554
3630
|
total_realtime_gpu_table = log_utils.create_table(
|
3555
3631
|
['GPU', 'UTILIZATION'])
|
3556
3632
|
for gpu, stats in total_gpu_info.items():
|
@@ -3562,10 +3638,11 @@ def show_gpus(
|
|
3562
3638
|
return realtime_gpu_infos, total_realtime_gpu_table, all_nodes_info
|
3563
3639
|
|
3564
3640
|
def _format_kubernetes_node_info_combined(
|
3565
|
-
contexts_info: List[Tuple[str,
|
3566
|
-
|
3641
|
+
contexts_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
|
3642
|
+
cloud_str: str = 'Kubernetes',
|
3643
|
+
context_title_str: str = 'CONTEXT') -> str:
|
3567
3644
|
node_table = log_utils.create_table(
|
3568
|
-
[
|
3645
|
+
[context_title_str, 'NODE', 'GPU', 'UTILIZATION'])
|
3569
3646
|
|
3570
3647
|
no_permissions_str = '<no permissions>'
|
3571
3648
|
hints = []
|
@@ -3588,7 +3665,7 @@ def show_gpus(
|
|
3588
3665
|
'free'
|
3589
3666
|
])
|
3590
3667
|
|
3591
|
-
k8s_per_node_acc_message = ('
|
3668
|
+
k8s_per_node_acc_message = (f'{cloud_str} per-node GPU availability')
|
3592
3669
|
if hints:
|
3593
3670
|
k8s_per_node_acc_message += ' (' + '; '.join(hints) + ')'
|
3594
3671
|
|
@@ -3598,26 +3675,30 @@ def show_gpus(
|
|
3598
3675
|
f'{node_table.get_string()}')
|
3599
3676
|
|
3600
3677
|
def _format_kubernetes_realtime_gpu(
|
3601
|
-
total_table: 'prettytable.PrettyTable',
|
3678
|
+
total_table: Optional['prettytable.PrettyTable'],
|
3602
3679
|
k8s_realtime_infos: List[Tuple[str, 'prettytable.PrettyTable']],
|
3603
3680
|
all_nodes_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
|
3604
|
-
show_node_info: bool) -> Generator[str, None, None]:
|
3681
|
+
show_node_info: bool, is_ssh: bool) -> Generator[str, None, None]:
|
3682
|
+
identity = 'SSH Node Pool' if is_ssh else 'Kubernetes'
|
3605
3683
|
yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
|
3606
|
-
'
|
3684
|
+
f'{identity} GPUs'
|
3607
3685
|
f'{colorama.Style.RESET_ALL}')
|
3608
3686
|
# print total table
|
3609
3687
|
if total_table is not None:
|
3610
3688
|
yield '\n'
|
3611
3689
|
yield from total_table.get_string()
|
3612
3690
|
|
3691
|
+
ctx_name = 'SSH Node Pool' if is_ssh else 'Context'
|
3692
|
+
ctx_column_title = 'NODE_POOL' if is_ssh else 'CONTEXT'
|
3693
|
+
|
3613
3694
|
# print individual infos.
|
3614
3695
|
for (ctx, k8s_realtime_table) in k8s_realtime_infos:
|
3615
3696
|
yield '\n'
|
3616
3697
|
# Print context header separately
|
3617
3698
|
if ctx:
|
3618
|
-
context_str = f'
|
3699
|
+
context_str = f'{ctx_name}: {ctx}'
|
3619
3700
|
else:
|
3620
|
-
context_str = 'Default
|
3701
|
+
context_str = f'Default {ctx_name}'
|
3621
3702
|
yield (
|
3622
3703
|
f'{colorama.Fore.CYAN}{context_str}{colorama.Style.RESET_ALL}\n'
|
3623
3704
|
)
|
@@ -3625,7 +3706,102 @@ def show_gpus(
|
|
3625
3706
|
|
3626
3707
|
if show_node_info:
|
3627
3708
|
yield '\n'
|
3628
|
-
yield _format_kubernetes_node_info_combined(all_nodes_info
|
3709
|
+
yield _format_kubernetes_node_info_combined(all_nodes_info,
|
3710
|
+
identity,
|
3711
|
+
ctx_column_title)
|
3712
|
+
|
3713
|
+
def _possibly_show_k8s_like_realtime(
    is_ssh: bool = False
) -> Generator[str, None, Tuple[bool, bool, str]]:
    """Yield real-time GPU tables for Kubernetes or SSH Node Pools.

    Args:
        is_ssh: If True, this pass covers SSH Node Pools; otherwise it
            covers Kubernetes.

    Yields:
        Formatted output strings (tables and messages).

    Returns:
        A tuple `(stop, print_section_titles, k8s_messages)`. `stop` is True
        when the requested cloud is the one this pass covers; the messages
        have then already been yielded and the returned string is empty.
        `print_section_titles` is True when the GPU tables were produced.
        `k8s_messages` holds messages that were collected but not yielded.
    """
    k8s_messages = ''
    print_section_titles = False
    # Consult only the flag for this pass's infra kind. The previous
    # `is_ssh and query_ssh_realtime_gpu or query_k8s_realtime_gpu` parses
    # as `(is_ssh and A) or B`, so the SSH pass also ran whenever Kubernetes
    # querying was on.
    should_query = (query_ssh_realtime_gpu
                    if is_ssh else query_k8s_realtime_gpu)
    if should_query:
        context = region

        try:
            # If --cloud kubernetes is not specified, we want to catch
            # the case where no GPUs are available on the cluster and
            # print the warning at the end.
            k8s_realtime_infos, total_table, all_nodes_info = (
                _get_kubernetes_realtime_gpu_tables(context, is_ssh=is_ssh))
        except ValueError as e:
            if not (cloud_is_kubernetes or cloud_is_ssh):
                # Make it a note if cloud is not kubernetes
                k8s_messages += 'Note: '
            k8s_messages += str(e)
        else:
            print_section_titles = True

            yield from _format_kubernetes_realtime_gpu(total_table,
                                                       k8s_realtime_infos,
                                                       all_nodes_info,
                                                       show_node_info=True,
                                                       is_ssh=is_ssh)

        if kubernetes_autoscaling:
            k8s_messages += ('\n' +
                             kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
    if is_ssh:
        if cloud_is_ssh:
            if not ssh_is_enabled:
                yield ('SSH Node Pools are not enabled. To fix, run: '
                       'sky check ssh ')
            yield k8s_messages
            return True, print_section_titles, ''
    else:
        if cloud_is_kubernetes:
            if not kubernetes_is_enabled:
                yield ('Kubernetes is not enabled. To fix, run: '
                       'sky check kubernetes ')
            yield k8s_messages
            return True, print_section_titles, ''
    return False, print_section_titles, k8s_messages
|
3760
|
+
|
3761
|
+
def _possibly_show_k8s_like_realtime_for_acc(
        name: Optional[str],
        quantity: Optional[int],
        is_ssh: bool = False) -> Generator[str, None, Tuple[bool, bool]]:
    """Yield real-time GPU tables filtered to one accelerator.

    Args:
        name: Accelerator name passed as the name filter.
        quantity: Requested quantity passed as the quantity filter.
        is_ssh: If True, this pass covers SSH Node Pools; otherwise it
            covers Kubernetes.

    Yields:
        Formatted output strings (tables, error text and messages).

    Returns:
        A tuple `(stop, print_section_titles)`. `stop` is True when the
        requested cloud is the one this pass covers.
        `print_section_titles` is True when the real-time query was made.
    """
    k8s_messages = ''
    print_section_titles = False
    # Consult only the flag for this pass's infra kind. The previous
    # `is_ssh and query_ssh_realtime_gpu or query_k8s_realtime_gpu` parses
    # as `(is_ssh and A) or B`, so the SSH pass also ran whenever Kubernetes
    # querying was on.
    should_query = (query_ssh_realtime_gpu
                    if is_ssh else query_k8s_realtime_gpu)
    if should_query and not show_all:
        print_section_titles = True
        # TODO(romilb): Show filtered per node GPU availability here as well
        try:
            (k8s_realtime_infos, total_table,
             all_nodes_info) = _get_kubernetes_realtime_gpu_tables(
                 context=region,
                 name_filter=name,
                 quantity_filter=quantity,
                 is_ssh=is_ssh)

            yield from _format_kubernetes_realtime_gpu(total_table,
                                                       k8s_realtime_infos,
                                                       all_nodes_info,
                                                       show_node_info=False,
                                                       is_ssh=is_ssh)
        except ValueError as e:
            # In the case of a specific accelerator, show the error message
            # immediately (e.g., "Resources H100 not found ...")
            yield common_utils.format_exception(e, use_bracket=True)
        if kubernetes_autoscaling:
            k8s_messages += ('\n' +
                             kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
        yield k8s_messages
    if is_ssh:
        if cloud_is_ssh:
            if not ssh_is_enabled:
                yield ('SSH Node Pools are not enabled. To fix, run: '
                       'sky check ssh ')
            return True, print_section_titles
    else:
        if cloud_is_kubernetes:
            if not kubernetes_is_enabled:
                yield ('Kubernetes is not enabled. To fix, run: '
                       'sky check kubernetes ')
            return True, print_section_titles
    return False, print_section_titles
|
3629
3805
|
|
3630
3806
|
def _output() -> Generator[str, None, None]:
|
3631
3807
|
gpu_table = log_utils.create_table(
|
@@ -3643,46 +3819,28 @@ def show_gpus(
|
|
3643
3819
|
clouds_to_list: Union[Optional[str], List[str]] = cloud_name
|
3644
3820
|
if cloud_name is None:
|
3645
3821
|
clouds_to_list = [
|
3646
|
-
c for c in service_catalog.ALL_CLOUDS
|
3822
|
+
c for c in service_catalog.ALL_CLOUDS
|
3823
|
+
if c != 'kubernetes' and c != 'ssh'
|
3647
3824
|
]
|
3648
3825
|
|
3649
3826
|
k8s_messages = ''
|
3650
3827
|
if accelerator_str is None:
|
3651
3828
|
# Collect k8s related messages in k8s_messages and print them at end
|
3652
3829
|
print_section_titles = False
|
3653
|
-
|
3654
|
-
|
3655
|
-
|
3656
|
-
|
3657
|
-
|
3658
|
-
|
3659
|
-
|
3660
|
-
|
3661
|
-
|
3662
|
-
|
3663
|
-
|
3664
|
-
|
3665
|
-
|
3666
|
-
|
3667
|
-
k8s_messages += str(e)
|
3668
|
-
else:
|
3669
|
-
print_section_titles = True
|
3670
|
-
|
3671
|
-
yield from _format_kubernetes_realtime_gpu(
|
3672
|
-
total_table,
|
3673
|
-
k8s_realtime_infos,
|
3674
|
-
all_nodes_info,
|
3675
|
-
show_node_info=True)
|
3676
|
-
|
3677
|
-
if kubernetes_autoscaling:
|
3678
|
-
k8s_messages += (
|
3679
|
-
'\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
|
3680
|
-
if cloud_is_kubernetes:
|
3681
|
-
# Do not show clouds if --cloud kubernetes is specified
|
3682
|
-
if not kubernetes_is_enabled:
|
3683
|
-
yield ('Kubernetes is not enabled. To fix, run: '
|
3684
|
-
'sky check kubernetes ')
|
3685
|
-
yield k8s_messages
|
3830
|
+
stop_iter = False
|
3831
|
+
k8s_messages = ''
|
3832
|
+
prev_print_section_titles = False
|
3833
|
+
for is_ssh in [False, True]:
|
3834
|
+
if prev_print_section_titles:
|
3835
|
+
yield '\n\n'
|
3836
|
+
stop_iter_one, print_section_titles_one, k8s_messages_one = (
|
3837
|
+
yield from _possibly_show_k8s_like_realtime(is_ssh))
|
3838
|
+
stop_iter = stop_iter or stop_iter_one
|
3839
|
+
print_section_titles = (print_section_titles or
|
3840
|
+
print_section_titles_one)
|
3841
|
+
k8s_messages += k8s_messages_one
|
3842
|
+
prev_print_section_titles = print_section_titles_one
|
3843
|
+
if stop_iter:
|
3686
3844
|
return
|
3687
3845
|
|
3688
3846
|
# For show_all, show the k8s message at the start since output is
|
@@ -3757,34 +3915,19 @@ def show_gpus(
|
|
3757
3915
|
name, quantity = accelerator_str, None
|
3758
3916
|
|
3759
3917
|
print_section_titles = False
|
3760
|
-
|
3761
|
-
|
3762
|
-
|
3763
|
-
|
3764
|
-
|
3765
|
-
|
3766
|
-
|
3767
|
-
|
3768
|
-
|
3769
|
-
|
3770
|
-
|
3771
|
-
|
3772
|
-
|
3773
|
-
all_nodes_info,
|
3774
|
-
show_node_info=False)
|
3775
|
-
except ValueError as e:
|
3776
|
-
# In the case of a specific accelerator, show the error message
|
3777
|
-
# immediately (e.g., "Resources H100 not found ...")
|
3778
|
-
yield common_utils.format_exception(e, use_bracket=True)
|
3779
|
-
if kubernetes_autoscaling:
|
3780
|
-
k8s_messages += ('\n' +
|
3781
|
-
kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
|
3782
|
-
yield k8s_messages
|
3783
|
-
if cloud_is_kubernetes:
|
3784
|
-
# Do not show clouds if --cloud kubernetes is specified
|
3785
|
-
if not kubernetes_is_enabled:
|
3786
|
-
yield ('Kubernetes is not enabled. To fix, run: '
|
3787
|
-
'sky check kubernetes ')
|
3918
|
+
stop_iter = False
|
3919
|
+
prev_print_section_titles = False
|
3920
|
+
for is_ssh in [False, True]:
|
3921
|
+
if prev_print_section_titles:
|
3922
|
+
yield '\n\n'
|
3923
|
+
stop_iter_one, print_section_titles_one = (
|
3924
|
+
yield from _possibly_show_k8s_like_realtime_for_acc(
|
3925
|
+
name, quantity, is_ssh))
|
3926
|
+
stop_iter = stop_iter or stop_iter_one
|
3927
|
+
print_section_titles = (print_section_titles or
|
3928
|
+
print_section_titles_one)
|
3929
|
+
prev_print_section_titles = print_section_titles_one
|
3930
|
+
if stop_iter:
|
3788
3931
|
return
|
3789
3932
|
|
3790
3933
|
# For clouds other than Kubernetes, get the accelerator details
|
@@ -4328,7 +4471,8 @@ def jobs_cancel(name: Optional[str], job_ids: Tuple[int], all: bool, yes: bool,
|
|
4328
4471
|
f'Provided {" ".join(arguments)!r}.')
|
4329
4472
|
|
4330
4473
|
if not yes:
|
4331
|
-
|
4474
|
+
plural = 's' if len(job_ids) > 1 else ''
|
4475
|
+
job_identity_str = (f'managed job{plural} with ID{plural} {job_id_str}'
|
4332
4476
|
if job_ids else repr(name))
|
4333
4477
|
if all_users:
|
4334
4478
|
job_identity_str = 'all managed jobs FOR ALL USERS'
|
@@ -6050,10 +6194,14 @@ def api_status(request_ids: Optional[List[str]], all_status: bool,
|
|
6050
6194
|
'-e',
|
6051
6195
|
required=False,
|
6052
6196
|
help='The SkyPilot API server endpoint.')
|
6197
|
+
@click.option('--get-token',
|
6198
|
+
is_flag=True,
|
6199
|
+
default=False,
|
6200
|
+
help='Force token-based login.')
|
6053
6201
|
@usage_lib.entrypoint
|
6054
|
-
def api_login(endpoint: Optional[str]):
|
6202
|
+
def api_login(endpoint: Optional[str], get_token: bool):
|
6055
6203
|
"""Logs into a SkyPilot API server."""
|
6056
|
-
sdk.api_login(endpoint)
|
6204
|
+
sdk.api_login(endpoint, get_token)
|
6057
6205
|
|
6058
6206
|
|
6059
6207
|
@api.command('info', cls=_DocumentedCodeCommand)
|
@@ -6065,6 +6213,10 @@ def api_info():
|
|
6065
6213
|
api_server_info = sdk.api_info()
|
6066
6214
|
user_name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
|
6067
6215
|
user_hash = common_utils.get_user_hash()
|
6216
|
+
api_server_user = api_server_info.get('user')
|
6217
|
+
if api_server_user is not None:
|
6218
|
+
user_name = api_server_user['name']
|
6219
|
+
user_hash = api_server_user['id']
|
6068
6220
|
dashboard_url = server_common.get_dashboard_url(url)
|
6069
6221
|
click.echo(f'Using SkyPilot API server: {url}\n'
|
6070
6222
|
f'{ux_utils.INDENT_SYMBOL}Status: {api_server_info["status"]}, '
|
@@ -6074,6 +6226,58 @@ def api_info():
|
|
6074
6226
|
f'{ux_utils.INDENT_LAST_SYMBOL}Dashboard: {dashboard_url}')
|
6075
6227
|
|
6076
6228
|
|
6229
|
+
@cli.group(cls=_NaturalOrderGroup)
def ssh():
    """Commands for managing SSH Node Pools."""
    # Intentionally empty: this function only anchors the `ssh` command
    # group. Its docstring doubles as the group's help text, so keep the
    # wording stable.
|
6233
|
+
|
6234
|
+
|
6235
|
+
@ssh.command('up', cls=_DocumentedCodeCommand)
@click.option(
    '--infra',
    help='Name of the cluster to set up in ~/.sky/ssh_node_pools.yaml. '
    'If not specified, all clusters in the file will be set up.')
@click.option('--async',
              'async_call',
              is_flag=True,
              hidden=True,
              help='Run the command asynchronously.')
def ssh_up(infra: Optional[str], async_call: bool):
    """Set up a cluster using SSH targets from ~/.sky/ssh_node_pools.yaml.

    This command sets up a Kubernetes cluster on the machines specified in
    ~/.sky/ssh_node_pools.yaml and configures SkyPilot to use it.
    """
    # NOTE: the docstring above is the command's help text; the parameters
    # are: `infra` (value of --infra, None when omitted) and `async_call`
    # (True when --async is given).
    request_id = sdk.ssh_up(infra=infra)
    if async_call:
        # Use click.echo, as the rest of this file does, instead of print.
        click.echo(f'Request submitted with ID: {request_id}')
    else:
        # Block and stream the request's output until it finishes.
        sdk.stream_and_get(request_id)
|
6256
|
+
|
6257
|
+
|
6258
|
+
@ssh.command('down', cls=_DocumentedCodeCommand)
@click.option(
    '--infra',
    help='Name of the cluster to clean up in ~/.sky/ssh_node_pools.yaml. '
    'If not specified, all clusters in the file will be cleaned up.')
@click.option('--async',
              'async_call',
              is_flag=True,
              hidden=True,
              help='Run the command asynchronously.')
def ssh_down(infra: Optional[str], async_call: bool):
    """Clean up a cluster set up with 'sky ssh up'.

    This command removes the Kubernetes installation from the machines specified
    in ~/.sky/ssh_node_pools.yaml.
    """
    # NOTE: the docstring above is the command's help text; the parameters
    # are: `infra` (value of --infra, None when omitted) and `async_call`
    # (True when --async is given). Annotations mirror `ssh_up`.
    request_id = sdk.ssh_down(infra=infra)
    if async_call:
        # Use click.echo, as the rest of this file does, instead of print.
        click.echo(f'Request submitted with ID: {request_id}')
    else:
        # Block and stream the request's output until it finishes.
        sdk.stream_and_get(request_id)
|
6279
|
+
|
6280
|
+
|
6077
6281
|
def main():
    """Invoke the top-level `cli` object and return whatever it returns."""
    result = cli()
    return result
|
6079
6283
|
|