skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20250513__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +3 -0
- sky/backends/cloud_vm_ray_backend.py +7 -0
- sky/cli.py +109 -109
- sky/client/cli.py +109 -109
- sky/clouds/gcp.py +35 -8
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → 2dkponv64SfFShA8Rnw0D}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/845-0ca6f2c1ba667c3b.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/global_user_state.py +2 -0
- sky/provision/docker_utils.py +4 -1
- sky/provision/gcp/config.py +197 -15
- sky/provision/gcp/constants.py +64 -0
- sky/provision/gcp/instance.py +5 -3
- sky/provision/gcp/instance_utils.py +8 -4
- sky/provision/nebius/instance.py +3 -1
- sky/provision/nebius/utils.py +4 -2
- sky/server/requests/executor.py +114 -22
- sky/server/requests/requests.py +15 -0
- sky/server/server.py +12 -7
- sky/server/uvicorn.py +12 -2
- sky/sky_logging.py +40 -2
- sky/skylet/constants.py +3 -0
- sky/skylet/log_lib.py +51 -11
- sky/templates/gcp-ray.yml.j2 +11 -0
- sky/templates/nebius-ray.yml.j2 +4 -0
- sky/templates/websocket_proxy.py +29 -9
- sky/utils/command_runner.py +3 -0
- sky/utils/context.py +264 -0
- sky/utils/context_utils.py +172 -0
- sky/utils/rich_utils.py +81 -37
- sky/utils/schemas.py +9 -1
- sky/utils/subprocess_utils.py +8 -2
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/METADATA +1 -5
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/RECORD +46 -44
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/WHEEL +1 -1
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → 2dkponv64SfFShA8Rnw0D}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20250513.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = 'c23907b7f1baf65740791dc1e17ff1411e7d9a97'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.dev20250509'
|
38
|
+
__version__ = '1.0.0.dev20250513'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/backends/backend_utils.py
CHANGED
@@ -40,6 +40,7 @@ from sky.utils import cluster_utils
|
|
40
40
|
from sky.utils import command_runner
|
41
41
|
from sky.utils import common
|
42
42
|
from sky.utils import common_utils
|
43
|
+
from sky.utils import context_utils
|
43
44
|
from sky.utils import controller_utils
|
44
45
|
from sky.utils import env_options
|
45
46
|
from sky.utils import registry
|
@@ -2204,6 +2205,7 @@ def refresh_cluster_record(
|
|
2204
2205
|
|
2205
2206
|
|
2206
2207
|
@timeline.event
|
2208
|
+
@context_utils.cancellation_guard
|
2207
2209
|
def refresh_cluster_status_handle(
|
2208
2210
|
cluster_name: str,
|
2209
2211
|
*,
|
@@ -2253,6 +2255,7 @@ def check_cluster_available(
|
|
2253
2255
|
...
|
2254
2256
|
|
2255
2257
|
|
2258
|
+
@context_utils.cancellation_guard
|
2256
2259
|
def check_cluster_available(
|
2257
2260
|
cluster_name: str,
|
2258
2261
|
*,
|
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -61,6 +61,7 @@ from sky.utils import cluster_utils
|
|
61
61
|
from sky.utils import command_runner
|
62
62
|
from sky.utils import common
|
63
63
|
from sky.utils import common_utils
|
64
|
+
from sky.utils import context_utils
|
64
65
|
from sky.utils import controller_utils
|
65
66
|
from sky.utils import env_options
|
66
67
|
from sky.utils import log_utils
|
@@ -274,6 +275,7 @@ class RayCodeGen:
|
|
274
275
|
ray_address = 'auto'
|
275
276
|
self._code = [
|
276
277
|
textwrap.dedent(f"""\
|
278
|
+
import functools
|
277
279
|
import getpass
|
278
280
|
import hashlib
|
279
281
|
import io
|
@@ -301,6 +303,8 @@ class RayCodeGen:
|
|
301
303
|
from sky.skylet import autostop_lib
|
302
304
|
from sky.skylet import constants
|
303
305
|
from sky.skylet import job_lib
|
306
|
+
from sky.utils import context
|
307
|
+
from sky.utils import context_utils
|
304
308
|
from sky.utils import log_utils
|
305
309
|
from sky.utils import subprocess_utils
|
306
310
|
|
@@ -2415,6 +2419,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
|
|
2415
2419
|
internal_external_ips[1:], key=lambda x: x[1])
|
2416
2420
|
self.stable_internal_external_ips = stable_internal_external_ips
|
2417
2421
|
|
2422
|
+
@context_utils.cancellation_guard
|
2418
2423
|
@annotations.lru_cache(scope='global')
|
2419
2424
|
@timeline.event
|
2420
2425
|
def get_command_runners(self,
|
@@ -3842,6 +3847,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
3842
3847
|
subprocess_utils.run_in_parallel(_rsync_down, parallel_args)
|
3843
3848
|
return dict(zip(job_ids, local_log_dirs))
|
3844
3849
|
|
3850
|
+
@context_utils.cancellation_guard
|
3845
3851
|
def tail_logs(self,
|
3846
3852
|
handle: CloudVmRayResourceHandle,
|
3847
3853
|
job_id: Optional[int],
|
@@ -4559,6 +4565,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
|
|
4559
4565
|
# TODO(zhwu): Refactor this to a CommandRunner class, so different backends
|
4560
4566
|
# can support its own command runner.
|
4561
4567
|
@timeline.event
|
4568
|
+
@context_utils.cancellation_guard
|
4562
4569
|
def run_on_head(
|
4563
4570
|
self,
|
4564
4571
|
handle: CloudVmRayResourceHandle,
|
sky/cli.py
CHANGED
@@ -91,6 +91,8 @@ from sky.utils.cli_utils import status_utils
|
|
91
91
|
if typing.TYPE_CHECKING:
|
92
92
|
import types
|
93
93
|
|
94
|
+
import prettytable
|
95
|
+
|
94
96
|
pd = adaptors_common.LazyImport('pandas')
|
95
97
|
logger = sky_logging.init_logger(__name__)
|
96
98
|
|
@@ -3371,12 +3373,8 @@ def show_gpus(
|
|
3371
3373
|
* ``QTY_PER_NODE`` (Kubernetes only): GPU quantities that can be requested
|
3372
3374
|
on a single node.
|
3373
3375
|
|
3374
|
-
* ``
|
3375
|
-
Kubernetes cluster.
|
3376
|
-
|
3377
|
-
* ``TOTAL_FREE_GPUS`` (Kubernetes only): Number of currently free GPUs
|
3378
|
-
in the Kubernetes cluster. This is fetched in real-time and may change
|
3379
|
-
when other users are using the cluster.
|
3376
|
+
* ``UTILIZATION`` (Kubernetes only): Total number of GPUs free / available
|
3377
|
+
in the Kubernetes cluster.
|
3380
3378
|
"""
|
3381
3379
|
# validation for the --region flag
|
3382
3380
|
if region is not None and cloud is None:
|
@@ -3415,15 +3413,16 @@ def show_gpus(
|
|
3415
3413
|
# TODO(zhwu,romilb): We should move most of these kubernetes related
|
3416
3414
|
# queries into the backend, especially behind the server.
|
3417
3415
|
def _get_kubernetes_realtime_gpu_tables(
|
3418
|
-
|
3419
|
-
|
3420
|
-
|
3416
|
+
context: Optional[str] = None,
|
3417
|
+
name_filter: Optional[str] = None,
|
3418
|
+
quantity_filter: Optional[int] = None
|
3419
|
+
) -> Tuple[List[Tuple[str, 'prettytable.PrettyTable']],
|
3420
|
+
Optional['prettytable.PrettyTable'], List[Tuple[
|
3421
|
+
str, 'models.KubernetesNodesInfo']]]:
|
3421
3422
|
if quantity_filter:
|
3422
3423
|
qty_header = 'QTY_FILTER'
|
3423
|
-
free_header = 'FILTERED_FREE_GPUS'
|
3424
3424
|
else:
|
3425
3425
|
qty_header = 'REQUESTABLE_QTY_PER_NODE'
|
3426
|
-
free_header = 'TOTAL_FREE_GPUS'
|
3427
3426
|
|
3428
3427
|
realtime_gpu_availability_lists = sdk.stream_and_get(
|
3429
3428
|
sdk.realtime_kubernetes_gpu_availability(
|
@@ -3449,41 +3448,19 @@ def show_gpus(
|
|
3449
3448
|
realtime_gpu_infos = []
|
3450
3449
|
total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
|
3451
3450
|
lambda: [0, 0])
|
3451
|
+
all_nodes_info = []
|
3452
3452
|
|
3453
|
-
# TODO(kyuds): remove backwards compatibility code (else branch)
|
3454
|
-
# when API version is bumped
|
3455
3453
|
if realtime_gpu_availability_lists:
|
3456
|
-
|
3457
|
-
|
3458
|
-
|
3459
|
-
|
3460
|
-
|
3461
|
-
|
3462
|
-
|
3463
|
-
|
3464
|
-
available_qty = (gpu_availability.available
|
3465
|
-
if gpu_availability.available != -1
|
3466
|
-
else no_permissions_str)
|
3467
|
-
realtime_gpu_table.add_row([
|
3468
|
-
gpu_availability.gpu,
|
3469
|
-
_list_to_str(gpu_availability.counts),
|
3470
|
-
gpu_availability.capacity,
|
3471
|
-
available_qty,
|
3472
|
-
])
|
3473
|
-
gpu = gpu_availability.gpu
|
3474
|
-
capacity = gpu_availability.capacity
|
3475
|
-
# we want total, so skip permission denied.
|
3476
|
-
available = max(gpu_availability.available, 0)
|
3477
|
-
if capacity > 0:
|
3478
|
-
total_gpu_info[gpu][0] += capacity
|
3479
|
-
total_gpu_info[gpu][1] += available
|
3480
|
-
realtime_gpu_infos.append((ctx, realtime_gpu_table))
|
3481
|
-
else:
|
3482
|
-
# can remove this with api server version bump.
|
3483
|
-
# 2025.05.03
|
3484
|
-
availability_list = realtime_gpu_availability_lists
|
3454
|
+
if len(realtime_gpu_availability_lists[0]) != 2:
|
3455
|
+
# TODO(kyuds): for backwards compatibility, as we add new
|
3456
|
+
# context to the API server response in #5362. Remove this after
|
3457
|
+
# 0.10.0.
|
3458
|
+
realtime_gpu_availability_lists = [
|
3459
|
+
(context, realtime_gpu_availability_lists)
|
3460
|
+
]
|
3461
|
+
for (ctx, availability_list) in realtime_gpu_availability_lists:
|
3485
3462
|
realtime_gpu_table = log_utils.create_table(
|
3486
|
-
['GPU', qty_header, '
|
3463
|
+
['GPU', qty_header, 'UTILIZATION'])
|
3487
3464
|
for realtime_gpu_availability in sorted(availability_list):
|
3488
3465
|
gpu_availability = models.RealtimeGpuAvailability(
|
3489
3466
|
*realtime_gpu_availability)
|
@@ -3493,49 +3470,100 @@ def show_gpus(
|
|
3493
3470
|
realtime_gpu_table.add_row([
|
3494
3471
|
gpu_availability.gpu,
|
3495
3472
|
_list_to_str(gpu_availability.counts),
|
3496
|
-
gpu_availability.capacity,
|
3497
|
-
available_qty,
|
3473
|
+
f'{available_qty} of {gpu_availability.capacity} free',
|
3498
3474
|
])
|
3499
|
-
|
3475
|
+
gpu = gpu_availability.gpu
|
3476
|
+
capacity = gpu_availability.capacity
|
3477
|
+
# we want total, so skip permission denied.
|
3478
|
+
available = max(gpu_availability.available, 0)
|
3479
|
+
if capacity > 0:
|
3480
|
+
total_gpu_info[gpu][0] += capacity
|
3481
|
+
total_gpu_info[gpu][1] += available
|
3482
|
+
realtime_gpu_infos.append((ctx, realtime_gpu_table))
|
3483
|
+
# Collect node info for this context
|
3484
|
+
nodes_info = sdk.stream_and_get(
|
3485
|
+
sdk.kubernetes_node_info(context=ctx))
|
3486
|
+
all_nodes_info.append((ctx, nodes_info))
|
3500
3487
|
|
3501
3488
|
# display an aggregated table for all contexts
|
3502
3489
|
# if there are more than one contexts with GPUs
|
3503
3490
|
if len(realtime_gpu_infos) > 1:
|
3504
3491
|
total_realtime_gpu_table = log_utils.create_table(
|
3505
|
-
['GPU', '
|
3492
|
+
['GPU', 'UTILIZATION'])
|
3506
3493
|
for gpu, stats in total_gpu_info.items():
|
3507
|
-
total_realtime_gpu_table.add_row(
|
3494
|
+
total_realtime_gpu_table.add_row(
|
3495
|
+
[gpu, f'{stats[1]} of {stats[0]} free'])
|
3508
3496
|
else:
|
3509
3497
|
total_realtime_gpu_table = None
|
3510
3498
|
|
3511
|
-
return realtime_gpu_infos, total_realtime_gpu_table
|
3499
|
+
return realtime_gpu_infos, total_realtime_gpu_table, all_nodes_info
|
3512
3500
|
|
3513
|
-
def
|
3501
|
+
def _format_kubernetes_node_info_combined(
|
3502
|
+
contexts_info: List[Tuple[str,
|
3503
|
+
'models.KubernetesNodesInfo']]) -> str:
|
3514
3504
|
node_table = log_utils.create_table(
|
3515
|
-
['
|
3505
|
+
['CONTEXT', 'NODE', 'GPU', 'UTILIZATION'])
|
3516
3506
|
|
3517
|
-
nodes_info = sdk.stream_and_get(
|
3518
|
-
sdk.kubernetes_node_info(context=context))
|
3519
3507
|
no_permissions_str = '<no permissions>'
|
3520
|
-
|
3521
|
-
|
3522
|
-
|
3523
|
-
|
3524
|
-
|
3525
|
-
|
3508
|
+
hints = []
|
3509
|
+
|
3510
|
+
for context, nodes_info in contexts_info:
|
3511
|
+
context_name = context if context else 'default'
|
3512
|
+
if nodes_info.hint:
|
3513
|
+
hints.append(f'{context_name}: {nodes_info.hint}')
|
3514
|
+
|
3515
|
+
for node_name, node_info in nodes_info.node_info_dict.items():
|
3516
|
+
available = node_info.free[
|
3517
|
+
'accelerators_available'] if node_info.free[
|
3518
|
+
'accelerators_available'] != -1 else no_permissions_str
|
3519
|
+
acc_type = node_info.accelerator_type
|
3520
|
+
if acc_type is None:
|
3521
|
+
acc_type = '-'
|
3526
3522
|
node_table.add_row([
|
3527
|
-
node_name,
|
3528
|
-
node_info.total[
|
3523
|
+
context_name, node_name, acc_type,
|
3524
|
+
f'{available} of {node_info.total["accelerator_count"]} '
|
3525
|
+
'free'
|
3529
3526
|
])
|
3530
|
-
|
3531
|
-
|
3532
|
-
if
|
3533
|
-
k8s_per_node_acc_message +=
|
3527
|
+
|
3528
|
+
k8s_per_node_acc_message = ('Kubernetes per-node GPU availability')
|
3529
|
+
if hints:
|
3530
|
+
k8s_per_node_acc_message += ' (' + '; '.join(hints) + ')'
|
3531
|
+
|
3534
3532
|
return (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
3535
3533
|
f'{k8s_per_node_acc_message}'
|
3536
3534
|
f'{colorama.Style.RESET_ALL}\n'
|
3537
3535
|
f'{node_table.get_string()}')
|
3538
3536
|
|
3537
|
+
def _format_kubernetes_realtime_gpu(
|
3538
|
+
total_table: 'prettytable.PrettyTable',
|
3539
|
+
k8s_realtime_infos: List[Tuple[str, 'prettytable.PrettyTable']],
|
3540
|
+
all_nodes_info: List[Tuple[str, 'models.KubernetesNodesInfo']],
|
3541
|
+
show_node_info: bool) -> Generator[str, None, None]:
|
3542
|
+
yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
|
3543
|
+
'Kubernetes GPUs'
|
3544
|
+
f'{colorama.Style.RESET_ALL}')
|
3545
|
+
# print total table
|
3546
|
+
if total_table is not None:
|
3547
|
+
yield '\n'
|
3548
|
+
yield from total_table.get_string()
|
3549
|
+
|
3550
|
+
# print individual infos.
|
3551
|
+
for (ctx, k8s_realtime_table) in k8s_realtime_infos:
|
3552
|
+
yield '\n'
|
3553
|
+
# Print context header separately
|
3554
|
+
if ctx:
|
3555
|
+
context_str = f'Context: {ctx}'
|
3556
|
+
else:
|
3557
|
+
context_str = 'Default Context'
|
3558
|
+
yield (
|
3559
|
+
f'{colorama.Fore.CYAN}{context_str}{colorama.Style.RESET_ALL}\n'
|
3560
|
+
)
|
3561
|
+
yield from k8s_realtime_table.get_string()
|
3562
|
+
|
3563
|
+
if show_node_info:
|
3564
|
+
yield '\n'
|
3565
|
+
yield _format_kubernetes_node_info_combined(all_nodes_info)
|
3566
|
+
|
3539
3567
|
def _output() -> Generator[str, None, None]:
|
3540
3568
|
gpu_table = log_utils.create_table(
|
3541
3569
|
['COMMON_GPU', 'AVAILABLE_QUANTITIES'])
|
@@ -3568,7 +3596,7 @@ def show_gpus(
|
|
3568
3596
|
# If --cloud kubernetes is not specified, we want to catch
|
3569
3597
|
# the case where no GPUs are available on the cluster and
|
3570
3598
|
# print the warning at the end.
|
3571
|
-
k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables(context) # pylint: disable=line-too-long
|
3599
|
+
k8s_realtime_infos, total_table, all_nodes_info = _get_kubernetes_realtime_gpu_tables(context) # pylint: disable=line-too-long
|
3572
3600
|
except ValueError as e:
|
3573
3601
|
if not cloud_is_kubernetes:
|
3574
3602
|
# Make it a note if cloud is not kubernetes
|
@@ -3577,27 +3605,12 @@ def show_gpus(
|
|
3577
3605
|
else:
|
3578
3606
|
print_section_titles = True
|
3579
3607
|
|
3580
|
-
|
3581
|
-
|
3582
|
-
|
3583
|
-
|
3584
|
-
|
3585
|
-
|
3586
|
-
yield '\n\n'
|
3587
|
-
|
3588
|
-
# print individual infos.
|
3589
|
-
for (idx,
|
3590
|
-
(ctx,
|
3591
|
-
k8s_realtime_table)) in enumerate(k8s_realtime_infos):
|
3592
|
-
context_str = f'(Context: {ctx})' if ctx else ''
|
3593
|
-
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
3594
|
-
f'Kubernetes GPUs {context_str}'
|
3595
|
-
f'{colorama.Style.RESET_ALL}\n')
|
3596
|
-
yield from k8s_realtime_table.get_string()
|
3597
|
-
yield '\n\n'
|
3598
|
-
yield _format_kubernetes_node_info(ctx)
|
3599
|
-
if idx != len(k8s_realtime_infos) - 1:
|
3600
|
-
yield '\n\n'
|
3608
|
+
yield from _format_kubernetes_realtime_gpu(
|
3609
|
+
total_table,
|
3610
|
+
k8s_realtime_infos,
|
3611
|
+
all_nodes_info,
|
3612
|
+
show_node_info=True)
|
3613
|
+
|
3601
3614
|
if kubernetes_autoscaling:
|
3602
3615
|
k8s_messages += (
|
3603
3616
|
'\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
|
@@ -3688,31 +3701,18 @@ def show_gpus(
|
|
3688
3701
|
print_section_titles = True
|
3689
3702
|
# TODO(romilb): Show filtered per node GPU availability here as well
|
3690
3703
|
try:
|
3691
|
-
k8s_realtime_infos, total_table
|
3692
|
-
|
3693
|
-
|
3694
|
-
|
3695
|
-
|
3696
|
-
|
3697
|
-
|
3698
|
-
|
3699
|
-
'Total Kubernetes GPUs'
|
3700
|
-
f'{colorama.Style.RESET_ALL}\n')
|
3701
|
-
yield from total_table.get_string()
|
3702
|
-
yield '\n\n'
|
3703
|
-
|
3704
|
-
# print individual tables
|
3705
|
-
for (ctx, k8s_realtime_table) in k8s_realtime_infos:
|
3706
|
-
context_str = f'(Context: {ctx})' if ctx else ''
|
3707
|
-
yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
|
3708
|
-
f'Kubernetes GPUs {context_str}'
|
3709
|
-
f'{colorama.Style.RESET_ALL}\n')
|
3710
|
-
yield from k8s_realtime_table.get_string()
|
3711
|
-
yield '\n\n'
|
3704
|
+
(k8s_realtime_infos, total_table,
|
3705
|
+
all_nodes_info) = _get_kubernetes_realtime_gpu_tables(
|
3706
|
+
context=region, name_filter=name, quantity_filter=quantity)
|
3707
|
+
|
3708
|
+
yield from _format_kubernetes_realtime_gpu(total_table,
|
3709
|
+
k8s_realtime_infos,
|
3710
|
+
all_nodes_info,
|
3711
|
+
show_node_info=False)
|
3712
3712
|
except ValueError as e:
|
3713
3713
|
# In the case of a specific accelerator, show the error message
|
3714
3714
|
# immediately (e.g., "Resources H100 not found ...")
|
3715
|
-
yield
|
3715
|
+
yield common_utils.format_exception(e, use_bracket=True)
|
3716
3716
|
if kubernetes_autoscaling:
|
3717
3717
|
k8s_messages += ('\n' +
|
3718
3718
|
kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
|