skypilot-nightly 1.0.0.dev20250428__py3-none-any.whl → 1.0.0.dev20250430__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +28 -40
- sky/backends/backend_utils.py +2 -0
- sky/cli.py +90 -37
- sky/client/cli.py +90 -37
- sky/client/sdk.py +3 -2
- sky/clouds/cloud.py +5 -2
- sky/clouds/kubernetes.py +4 -4
- sky/clouds/nebius.py +16 -10
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/kubernetes_catalog.py +7 -7
- sky/core.py +58 -29
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/favicon.ico +0 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/optimizer.py +35 -11
- sky/provision/docker_utils.py +22 -16
- sky/provision/kubernetes/utils.py +26 -24
- sky/resources.py +1 -1
- sky/server/common.py +6 -3
- sky/server/config.py +184 -0
- sky/server/requests/executor.py +17 -156
- sky/server/server.py +4 -4
- sky/setup_files/dependencies.py +0 -1
- sky/setup_files/setup.py +1 -1
- sky/skylet/constants.py +18 -0
- sky/skypilot_config.py +32 -11
- sky/templates/aws-ray.yml.j2 +2 -1
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +3 -2
- sky/templates/fluidstack-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +1 -1
- sky/templates/ibm-ray.yml.j2 +3 -3
- sky/templates/kubernetes-ray.yml.j2 +26 -14
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +64 -0
- sky/templates/oci-ray.yml.j2 +1 -1
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/vast-ray.yml.j2 +1 -1
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/utils/aws/__init__.py +0 -0
- sky/utils/aws/get_default_security_group.py +11 -0
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/METADATA +3 -3
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/RECORD +58 -55
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/WHEEL +1 -1
- /sky/dashboard/out/_next/static/{2f-jlOWR_G5mOwCF4RcZz → Ggv82ZIZy1hoW81egpwD1}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{2f-jlOWR_G5mOwCF4RcZz → Ggv82ZIZy1hoW81egpwD1}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request

 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = 'fe1583a36034080dbe7791da63ea270db30d0bcc'


 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250428'
+__version__ = '1.0.0.dev20250430'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))

sky/adaptors/nebius.py
CHANGED
@@ -29,11 +29,6 @@ MAX_RETRIES_TO_INSTANCE_WAIT = 120  # Maximum number of retries

 POLL_INTERVAL = 5

-_iam_token = None
-_sdk = None
-_tenant_id = None
-_project_id = None
-
 _IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Nebius AI Cloud.'
                          'Try pip install "skypilot[nebius]"')

@@ -81,56 +76,49 @@ def vpc():
     return vpc_v1


+@annotations.lru_cache(scope='request')
 def get_iam_token():
-
-
-
-
-
-
-    except FileNotFoundError:
-        return None
-    return _iam_token
+    try:
+        with open(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH),
+                  encoding='utf-8') as file:
+            return file.read().strip()
+    except FileNotFoundError:
+        return None


+@annotations.lru_cache(scope='request')
 def is_token_or_cred_file_exist():
     return (os.path.exists(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH)) or
             os.path.exists(os.path.expanduser(NEBIUS_CREDENTIALS_PATH)))


+@annotations.lru_cache(scope='request')
 def get_project_id():
-
-
-
-
-
-
-    except FileNotFoundError:
-        return None
-    return _project_id
+    try:
+        with open(os.path.expanduser(NEBIUS_PROJECT_ID_PATH),
+                  encoding='utf-8') as file:
+            return file.read().strip()
+    except FileNotFoundError:
+        return None


+@annotations.lru_cache(scope='request')
 def get_tenant_id():
-
-
-
-
-
-
-    except FileNotFoundError:
-        return None
-    return _tenant_id
+    try:
+        with open(os.path.expanduser(NEBIUS_TENANT_ID_PATH),
+                  encoding='utf-8') as file:
+            return file.read().strip()
+    except FileNotFoundError:
+        return None


+@annotations.lru_cache(scope='request')
 def sdk():
-
-    if
-
-
-
-        _sdk = nebius.sdk.SDK(
-            credentials_file_name=os.path.expanduser(NEBIUS_CREDENTIALS_PATH))
-    return _sdk
+    token = get_iam_token()
+    if token is not None:
+        return nebius.sdk.SDK(credentials=token)
+    return nebius.sdk.SDK(
+        credentials_file_name=os.path.expanduser(NEBIUS_CREDENTIALS_PATH))


 def get_nebius_credentials(boto3_session):
sky/backends/backend_utils.py
CHANGED
@@ -798,6 +798,8 @@ def write_cluster_config(
             'sky_ray_yaml_local_path': tmp_yaml_path,
             'sky_version': str(version.parse(sky.__version__)),
             'sky_wheel_hash': wheel_hash,
+            'ssh_max_sessions_config':
+                constants.SET_SSH_MAX_SESSIONS_CONFIG_CMD,
             # Authentication (optional).
             **auth_config,

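
write_cluster_config now also passes 'ssh_max_sessions_config' (the new constants.SET_SSH_MAX_SESSIONS_CONFIG_CMD added in sky/skylet/constants.py) into the cluster YAML template variables, which presumably accounts for the one-line additions to most of the *-ray.yml.j2 templates in this diff. A hedged sketch of how such a variable is typically rendered into a template; the YAML keys and the command string below are illustrative stand-ins, not copied from SkyPilot:

    import jinja2

    # Hypothetical stand-in for constants.SET_SSH_MAX_SESSIONS_CONFIG_CMD.
    SET_SSH_MAX_SESSIONS_CONFIG_CMD = (
        'echo "MaxSessions 200" | sudo tee -a /etc/ssh/sshd_config')

    # Minimal template fragment; real SkyPilot templates are full Ray cluster YAMLs.
    template = jinja2.Template('setup_commands:\n'
                               '  - {{ ssh_max_sessions_config }}\n')
    print(template.render(
        ssh_max_sessions_config=SET_SSH_MAX_SESSIONS_CONFIG_CMD))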
sky/cli.py
CHANGED
@@ -23,6 +23,7 @@ NOTE: the order of command definitions in this file corresponds to how they are
 listed in "sky --help". Take care to put logically connected commands close to
 each other.
 """
+import collections
 import copy
 import datetime
 import functools
@@ -3413,7 +3414,7 @@ def show_gpus(

     # TODO(zhwu,romilb): We should move most of these kubernetes related
     # queries into the backend, especially behind the server.
-    def
+    def _get_kubernetes_realtime_gpu_tables(
             context: Optional[str] = None,
             name_filter: Optional[str] = None,
             quantity_filter: Optional[int] = None):
@@ -3423,15 +3424,14 @@ def show_gpus(
         else:
             qty_header = 'REQUESTABLE_QTY_PER_NODE'
             free_header = 'TOTAL_FREE_GPUS'
-
-
-        realtime_gpu_availability_list = sdk.stream_and_get(
+
+        realtime_gpu_availability_lists = sdk.stream_and_get(
             sdk.realtime_kubernetes_gpu_availability(
                 context=context,
                 name_filter=name_filter,
                 quantity_filter=quantity_filter))
-        if not
-            err_msg = 'No GPUs found in Kubernetes cluster. '
+        if not realtime_gpu_availability_lists:
+            err_msg = 'No GPUs found in any allowed Kubernetes cluster. '
             debug_msg = 'To further debug, run: sky check '
             if name_filter is not None:
                 gpu_info_msg = f' {name_filter!r}'
@@ -3439,26 +3439,52 @@ def show_gpus(
                 gpu_info_msg += (' with requested quantity'
                                  f' {quantity_filter}')
             err_msg = (f'Resources{gpu_info_msg} not found '
-                       'in Kubernetes cluster. ')
+                       'in any allowed Kubernetes cluster. ')
             debug_msg = ('To show available accelerators on kubernetes,'
                          ' run: sky show-gpus --cloud kubernetes ')
             full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
                             debug_msg)
             raise ValueError(full_err_msg)
         no_permissions_str = '<no permissions>'
-
-
-
-
-
-
-
-
-
-
-                available_qty
-
-
+        realtime_gpu_infos = []
+        total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
+            lambda: [0, 0])
+
+        for (ctx, availability_list) in realtime_gpu_availability_lists:
+            realtime_gpu_table = log_utils.create_table(
+                ['GPU', qty_header, 'TOTAL_GPUS', free_header])
+            for realtime_gpu_availability in sorted(availability_list):
+                gpu_availability = models.RealtimeGpuAvailability(
+                    *realtime_gpu_availability)
+                available_qty = (gpu_availability.available
+                                 if gpu_availability.available != -1 else
+                                 no_permissions_str)
+                realtime_gpu_table.add_row([
+                    gpu_availability.gpu,
+                    _list_to_str(gpu_availability.counts),
+                    gpu_availability.capacity,
+                    available_qty,
+                ])
+                gpu = gpu_availability.gpu
+                capacity = gpu_availability.capacity
+                # we want total, so skip permission denied.
+                available = max(gpu_availability.available, 0)
+                if capacity > 0:
+                    total_gpu_info[gpu][0] += capacity
+                    total_gpu_info[gpu][1] += available
+            realtime_gpu_infos.append((ctx, realtime_gpu_table))
+
+        # display an aggregated table for all contexts
+        # if there are more than one contexts with GPUs
+        if len(realtime_gpu_infos) > 1:
+            total_realtime_gpu_table = log_utils.create_table(
+                ['GPU', 'TOTAL_GPUS', free_header])
+            for gpu, stats in total_gpu_info.items():
+                total_realtime_gpu_table.add_row([gpu, stats[0], stats[1]])
+        else:
+            total_realtime_gpu_table = None
+
+        return realtime_gpu_infos, total_realtime_gpu_table

     def _format_kubernetes_node_info(context: Optional[str]):
         node_table = log_utils.create_table(
@@ -3479,7 +3505,7 @@ def show_gpus(
             'Kubernetes per node accelerator availability ')
         if nodes_info.hint:
             k8s_per_node_acc_message += nodes_info.hint
-        return (f'{colorama.Fore.
+        return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
                 f'{k8s_per_node_acc_message}'
                 f'{colorama.Style.RESET_ALL}\n'
                 f'{node_table.get_string()}')
@@ -3516,8 +3542,7 @@ def show_gpus(
                 # If --cloud kubernetes is not specified, we want to catch
                 # the case where no GPUs are available on the cluster and
                 # print the warning at the end.
-
-                    context)
+                k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables(context)  # pylint: disable=line-too-long
             except ValueError as e:
                 if not cloud_is_kubernetes:
                     # Make it a note if cloud is not kubernetes
@@ -3525,13 +3550,24 @@ def show_gpus(
                 k8s_messages += str(e)
             else:
                 print_section_titles = True
-
-
-
-
-
-
-
+
+                # print total table
+                if total_table is not None:
+                    yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+                           'Total Kubernetes GPUs'
+                           f'{colorama.Style.RESET_ALL}\n')
+                    yield from total_table.get_string()
+                    yield '\n-----\n\n'
+
+                # print individual infos.
+                for (ctx, k8s_realtime_table) in k8s_realtime_infos:
+                    context_str = f'(Context: {ctx})' if ctx else ''
+                    yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                           f'Kubernetes GPUs {context_str}'
+                           f'{colorama.Style.RESET_ALL}\n')
+                    yield from k8s_realtime_table.get_string()
+                    yield '\n\n'
+                    yield _format_kubernetes_node_info(ctx) + '\n-----\n\n'
             if kubernetes_autoscaling:
                 k8s_messages += (
                     '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
@@ -3620,13 +3656,29 @@ def show_gpus(
         # Print section title if not showing all and instead a specific
         # accelerator is requested
         print_section_titles = True
-        yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-               f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
         # TODO(romilb): Show filtered per node GPU availability here as well
         try:
-
-
-
+            k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables(  # pylint: disable=line-too-long
+                context=region,
+                name_filter=name,
+                quantity_filter=quantity)
+
+            # print total table
+            if total_table is not None:
+                yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+                       'Total Kubernetes GPUs'
+                       f'{colorama.Style.RESET_ALL}\n')
+                yield from total_table.get_string()
+                yield '\n-----\n\n'
+
+            # print individual tables
+            for (ctx, k8s_realtime_table) in k8s_realtime_infos:
+                context_str = f'(Context: {ctx})' if ctx else ''
+                yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                       f'Kubernetes GPUs {context_str}'
+                       f'{colorama.Style.RESET_ALL}\n')
+                yield from k8s_realtime_table.get_string()
+                yield '\n\n'
         except ValueError as e:
             # In the case of a specific accelerator, show the error message
             # immediately (e.g., "Resources H100 not found ...")
@@ -5911,11 +5963,12 @@ def api_info():
     user_name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
     user_hash = common_utils.get_user_hash()
     dashboard_url = server_common.get_dashboard_url(url)
-    click.echo(f'Using SkyPilot API server: {url}
+    click.echo(f'Using SkyPilot API server: {url}\n'
               f'{ux_utils.INDENT_SYMBOL}Status: {api_server_info["status"]}, '
               f'commit: {api_server_info["commit"]}, '
              f'version: {api_server_info["version"]}\n'
-               f'{ux_utils.
+               f'{ux_utils.INDENT_SYMBOL}User: {user_name} ({user_hash})\n'
+               f'{ux_utils.INDENT_LAST_SYMBOL}Dashboard: {dashboard_url}')


 def main():
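
The reworked helper now returns one realtime GPU table per Kubernetes context plus an aggregated total table when more than one context reports GPUs. A minimal sketch of just that aggregation step, with made-up contexts and (gpu, capacity, available) tuples; available == -1 stands for "no permissions", mirroring the <no permissions> handling above:

    import collections

    # Made-up per-context availability: (gpu, capacity, available); -1 = no permissions.
    availability_by_context = {
        'ctx-a': [('H100', 8, 4), ('A100', 4, -1)],
        'ctx-b': [('H100', 8, 8)],
    }

    total_gpu_info = collections.defaultdict(lambda: [0, 0])  # gpu -> [total, free]
    for ctx, rows in availability_by_context.items():
        for gpu, capacity, available in rows:
            available = max(available, 0)  # count "no permissions" as 0 free
            if capacity > 0:
                total_gpu_info[gpu][0] += capacity
                total_gpu_info[gpu][1] += available

    for gpu, (total, free) in total_gpu_info.items():
        print(f'{gpu}: {total} total, {free} free')
    # H100: 16 total, 12 free
    # A100: 4 total, 0 free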
sky/client/cli.py
CHANGED
Identical to the sky/cli.py diff above (+90 -37, same hunks at the same line numbers); the hunks are not repeated here.
sky/client/sdk.py
CHANGED
@@ -1840,6 +1840,7 @@ def api_login(endpoint: Optional[str] = None) -> None:
     dashboard_url = server_common.get_dashboard_url(endpoint)
     dashboard_msg = f'Dashboard: {dashboard_url}'
     click.secho(
-        f'Logged
-        f'
+        f'Logged into SkyPilot API server at: {endpoint}'
+        f'\n{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
+        f'{dashboard_msg}',
         fg='green')
sky/clouds/cloud.py
CHANGED
@@ -418,13 +418,16 @@ class Cloud:
         try:
             self.check_features_are_supported(resources,
                                               resources_required_features)
-        except exceptions.NotSupportedError:
+        except exceptions.NotSupportedError as e:
             # TODO(zhwu): The resources are now silently filtered out. We
             # should have some logging telling the user why the resources
             # are not considered.
+            # UPDATE(kyuds): passing in NotSupportedError reason string
+            # to hint for issue #5344. Did not remove above comment as
+            # reason is not displayed when other resources are valid.
             return resources_utils.FeasibleResources(resources_list=[],
                                                      fuzzy_candidate_list=[],
-                                                     hint=
+                                                     hint=str(e))
         return self._get_feasible_launchable_resources(resources)

     def _get_feasible_launchable_resources(
sky/clouds/kubernetes.py
CHANGED
@@ -454,7 +454,7 @@ class Kubernetes(clouds.Cloud):
             self.IMAGE_CPU, clouds='kubernetes')

         k8s_acc_label_key = None
-
+        k8s_acc_label_values = None
         k8s_topology_label_key = None
         k8s_topology_label_value = None
         k8s_resource_key = None
@@ -462,9 +462,9 @@ class Kubernetes(clouds.Cloud):

         # If GPU/TPUs are requested, set node label to match the GPU/TPU type.
         if acc_count > 0 and acc_type is not None:
-            (k8s_acc_label_key,
+            (k8s_acc_label_key, k8s_acc_label_values, k8s_topology_label_key,
              k8s_topology_label_value) = (
-                 kubernetes_utils.
+                 kubernetes_utils.get_accelerator_label_key_values(
                      context, acc_type, acc_count))
             if (k8s_acc_label_key ==
                     kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY):
@@ -562,7 +562,7 @@ class Kubernetes(clouds.Cloud):
             'k8s_networking_mode': network_utils.get_networking_mode().value,
             'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME,
             'k8s_acc_label_key': k8s_acc_label_key,
-            '
+            'k8s_acc_label_values': k8s_acc_label_values,
             'k8s_ssh_jump_name': self.SKY_SSH_JUMP_NAME,
             'k8s_ssh_jump_image': ssh_jump_image,
             'k8s_service_account_name': k8s_service_account_name,