skypilot-nightly 1.0.0.dev20250428__py3-none-any.whl → 1.0.0.dev20250430__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +28 -40
  3. sky/backends/backend_utils.py +2 -0
  4. sky/cli.py +90 -37
  5. sky/client/cli.py +90 -37
  6. sky/client/sdk.py +3 -2
  7. sky/clouds/cloud.py +5 -2
  8. sky/clouds/kubernetes.py +4 -4
  9. sky/clouds/nebius.py +16 -10
  10. sky/clouds/service_catalog/constants.py +1 -1
  11. sky/clouds/service_catalog/kubernetes_catalog.py +7 -7
  12. sky/core.py +58 -29
  13. sky/dashboard/out/404.html +1 -1
  14. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  15. sky/dashboard/out/clusters/[cluster].html +1 -1
  16. sky/dashboard/out/clusters.html +1 -1
  17. sky/dashboard/out/favicon.ico +0 -0
  18. sky/dashboard/out/index.html +1 -1
  19. sky/dashboard/out/jobs/[job].html +1 -1
  20. sky/dashboard/out/jobs.html +1 -1
  21. sky/optimizer.py +35 -11
  22. sky/provision/docker_utils.py +22 -16
  23. sky/provision/kubernetes/utils.py +26 -24
  24. sky/resources.py +1 -1
  25. sky/server/common.py +6 -3
  26. sky/server/config.py +184 -0
  27. sky/server/requests/executor.py +17 -156
  28. sky/server/server.py +4 -4
  29. sky/setup_files/dependencies.py +0 -1
  30. sky/setup_files/setup.py +1 -1
  31. sky/skylet/constants.py +18 -0
  32. sky/skypilot_config.py +32 -11
  33. sky/templates/aws-ray.yml.j2 +2 -1
  34. sky/templates/azure-ray.yml.j2 +2 -1
  35. sky/templates/cudo-ray.yml.j2 +1 -0
  36. sky/templates/do-ray.yml.j2 +3 -2
  37. sky/templates/fluidstack-ray.yml.j2 +1 -1
  38. sky/templates/gcp-ray.yml.j2 +1 -1
  39. sky/templates/ibm-ray.yml.j2 +3 -3
  40. sky/templates/kubernetes-ray.yml.j2 +26 -14
  41. sky/templates/lambda-ray.yml.j2 +1 -0
  42. sky/templates/nebius-ray.yml.j2 +64 -0
  43. sky/templates/oci-ray.yml.j2 +1 -1
  44. sky/templates/paperspace-ray.yml.j2 +1 -0
  45. sky/templates/runpod-ray.yml.j2 +1 -0
  46. sky/templates/scp-ray.yml.j2 +1 -0
  47. sky/templates/vast-ray.yml.j2 +1 -1
  48. sky/templates/vsphere-ray.yml.j2 +1 -0
  49. sky/utils/aws/__init__.py +0 -0
  50. sky/utils/aws/get_default_security_group.py +11 -0
  51. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/METADATA +3 -3
  52. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/RECORD +58 -55
  53. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/WHEEL +1 -1
  54. /sky/dashboard/out/_next/static/{2f-jlOWR_G5mOwCF4RcZz → Ggv82ZIZy1hoW81egpwD1}/_buildManifest.js +0 -0
  55. /sky/dashboard/out/_next/static/{2f-jlOWR_G5mOwCF4RcZz → Ggv82ZIZy1hoW81egpwD1}/_ssgManifest.js +0 -0
  56. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/entry_points.txt +0 -0
  57. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/licenses/LICENSE +0 -0
  58. {skypilot_nightly-1.0.0.dev20250428.dist-info → skypilot_nightly-1.0.0.dev20250430.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = '607eee0a24e50718d783e92081f141f45cac6cda'
8
+ _SKYPILOT_COMMIT_SHA = 'fe1583a36034080dbe7791da63ea270db30d0bcc'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20250428'
38
+ __version__ = '1.0.0.dev20250430'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
sky/adaptors/nebius.py CHANGED
@@ -29,11 +29,6 @@ MAX_RETRIES_TO_INSTANCE_WAIT = 120 # Maximum number of retries
29
29
 
30
30
  POLL_INTERVAL = 5
31
31
 
32
- _iam_token = None
33
- _sdk = None
34
- _tenant_id = None
35
- _project_id = None
36
-
37
32
  _IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Nebius AI Cloud.'
38
33
  'Try pip install "skypilot[nebius]"')
39
34
 
@@ -81,56 +76,49 @@ def vpc():
81
76
  return vpc_v1
82
77
 
83
78
 
79
+ @annotations.lru_cache(scope='request')
84
80
  def get_iam_token():
85
- global _iam_token
86
- if _iam_token is None:
87
- try:
88
- with open(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH),
89
- encoding='utf-8') as file:
90
- _iam_token = file.read().strip()
91
- except FileNotFoundError:
92
- return None
93
- return _iam_token
81
+ try:
82
+ with open(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH),
83
+ encoding='utf-8') as file:
84
+ return file.read().strip()
85
+ except FileNotFoundError:
86
+ return None
94
87
 
95
88
 
89
+ @annotations.lru_cache(scope='request')
96
90
  def is_token_or_cred_file_exist():
97
91
  return (os.path.exists(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH)) or
98
92
  os.path.exists(os.path.expanduser(NEBIUS_CREDENTIALS_PATH)))
99
93
 
100
94
 
95
+ @annotations.lru_cache(scope='request')
101
96
  def get_project_id():
102
- global _project_id
103
- if _project_id is None:
104
- try:
105
- with open(os.path.expanduser(NEBIUS_PROJECT_ID_PATH),
106
- encoding='utf-8') as file:
107
- _project_id = file.read().strip()
108
- except FileNotFoundError:
109
- return None
110
- return _project_id
97
+ try:
98
+ with open(os.path.expanduser(NEBIUS_PROJECT_ID_PATH),
99
+ encoding='utf-8') as file:
100
+ return file.read().strip()
101
+ except FileNotFoundError:
102
+ return None
111
103
 
112
104
 
105
+ @annotations.lru_cache(scope='request')
113
106
  def get_tenant_id():
114
- global _tenant_id
115
- if _tenant_id is None:
116
- try:
117
- with open(os.path.expanduser(NEBIUS_TENANT_ID_PATH),
118
- encoding='utf-8') as file:
119
- _tenant_id = file.read().strip()
120
- except FileNotFoundError:
121
- return None
122
- return _tenant_id
107
+ try:
108
+ with open(os.path.expanduser(NEBIUS_TENANT_ID_PATH),
109
+ encoding='utf-8') as file:
110
+ return file.read().strip()
111
+ except FileNotFoundError:
112
+ return None
123
113
 
124
114
 
115
+ @annotations.lru_cache(scope='request')
125
116
  def sdk():
126
- global _sdk
127
- if _sdk is None:
128
- if get_iam_token() is not None:
129
- _sdk = nebius.sdk.SDK(credentials=get_iam_token())
130
- return _sdk
131
- _sdk = nebius.sdk.SDK(
132
- credentials_file_name=os.path.expanduser(NEBIUS_CREDENTIALS_PATH))
133
- return _sdk
117
+ token = get_iam_token()
118
+ if token is not None:
119
+ return nebius.sdk.SDK(credentials=token)
120
+ return nebius.sdk.SDK(
121
+ credentials_file_name=os.path.expanduser(NEBIUS_CREDENTIALS_PATH))
134
122
 
135
123
 
136
124
  def get_nebius_credentials(boto3_session):
sky/backends/backend_utils.py CHANGED
@@ -798,6 +798,8 @@ def write_cluster_config(
798
798
  'sky_ray_yaml_local_path': tmp_yaml_path,
799
799
  'sky_version': str(version.parse(sky.__version__)),
800
800
  'sky_wheel_hash': wheel_hash,
801
+ 'ssh_max_sessions_config':
802
+ constants.SET_SSH_MAX_SESSIONS_CONFIG_CMD,
801
803
  # Authentication (optional).
802
804
  **auth_config,
803
805
 
sky/cli.py CHANGED
@@ -23,6 +23,7 @@ NOTE: the order of command definitions in this file corresponds to how they are
23
23
  listed in "sky --help". Take care to put logically connected commands close to
24
24
  each other.
25
25
  """
26
+ import collections
26
27
  import copy
27
28
  import datetime
28
29
  import functools
@@ -3413,7 +3414,7 @@ def show_gpus(
3413
3414
 
3414
3415
  # TODO(zhwu,romilb): We should move most of these kubernetes related
3415
3416
  # queries into the backend, especially behind the server.
3416
- def _get_kubernetes_realtime_gpu_table(
3417
+ def _get_kubernetes_realtime_gpu_tables(
3417
3418
  context: Optional[str] = None,
3418
3419
  name_filter: Optional[str] = None,
3419
3420
  quantity_filter: Optional[int] = None):
@@ -3423,15 +3424,14 @@ def show_gpus(
3423
3424
  else:
3424
3425
  qty_header = 'REQUESTABLE_QTY_PER_NODE'
3425
3426
  free_header = 'TOTAL_FREE_GPUS'
3426
- realtime_gpu_table = log_utils.create_table(
3427
- ['GPU', qty_header, 'TOTAL_GPUS', free_header])
3428
- realtime_gpu_availability_list = sdk.stream_and_get(
3427
+
3428
+ realtime_gpu_availability_lists = sdk.stream_and_get(
3429
3429
  sdk.realtime_kubernetes_gpu_availability(
3430
3430
  context=context,
3431
3431
  name_filter=name_filter,
3432
3432
  quantity_filter=quantity_filter))
3433
- if not realtime_gpu_availability_list:
3434
- err_msg = 'No GPUs found in Kubernetes cluster. '
3433
+ if not realtime_gpu_availability_lists:
3434
+ err_msg = 'No GPUs found in any allowed Kubernetes cluster. '
3435
3435
  debug_msg = 'To further debug, run: sky check '
3436
3436
  if name_filter is not None:
3437
3437
  gpu_info_msg = f' {name_filter!r}'
@@ -3439,26 +3439,52 @@ def show_gpus(
3439
3439
  gpu_info_msg += (' with requested quantity'
3440
3440
  f' {quantity_filter}')
3441
3441
  err_msg = (f'Resources{gpu_info_msg} not found '
3442
- 'in Kubernetes cluster. ')
3442
+ 'in any allowed Kubernetes cluster. ')
3443
3443
  debug_msg = ('To show available accelerators on kubernetes,'
3444
3444
  ' run: sky show-gpus --cloud kubernetes ')
3445
3445
  full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
3446
3446
  debug_msg)
3447
3447
  raise ValueError(full_err_msg)
3448
3448
  no_permissions_str = '<no permissions>'
3449
- for realtime_gpu_availability in sorted(realtime_gpu_availability_list):
3450
- gpu_availability = models.RealtimeGpuAvailability(
3451
- *realtime_gpu_availability)
3452
- available_qty = (gpu_availability.available
3453
- if gpu_availability.available != -1 else
3454
- no_permissions_str)
3455
- realtime_gpu_table.add_row([
3456
- gpu_availability.gpu,
3457
- _list_to_str(gpu_availability.counts),
3458
- gpu_availability.capacity,
3459
- available_qty,
3460
- ])
3461
- return realtime_gpu_table
3449
+ realtime_gpu_infos = []
3450
+ total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
3451
+ lambda: [0, 0])
3452
+
3453
+ for (ctx, availability_list) in realtime_gpu_availability_lists:
3454
+ realtime_gpu_table = log_utils.create_table(
3455
+ ['GPU', qty_header, 'TOTAL_GPUS', free_header])
3456
+ for realtime_gpu_availability in sorted(availability_list):
3457
+ gpu_availability = models.RealtimeGpuAvailability(
3458
+ *realtime_gpu_availability)
3459
+ available_qty = (gpu_availability.available
3460
+ if gpu_availability.available != -1 else
3461
+ no_permissions_str)
3462
+ realtime_gpu_table.add_row([
3463
+ gpu_availability.gpu,
3464
+ _list_to_str(gpu_availability.counts),
3465
+ gpu_availability.capacity,
3466
+ available_qty,
3467
+ ])
3468
+ gpu = gpu_availability.gpu
3469
+ capacity = gpu_availability.capacity
3470
+ # we want total, so skip permission denied.
3471
+ available = max(gpu_availability.available, 0)
3472
+ if capacity > 0:
3473
+ total_gpu_info[gpu][0] += capacity
3474
+ total_gpu_info[gpu][1] += available
3475
+ realtime_gpu_infos.append((ctx, realtime_gpu_table))
3476
+
3477
+ # display an aggregated table for all contexts
3478
+ # if there are more than one contexts with GPUs
3479
+ if len(realtime_gpu_infos) > 1:
3480
+ total_realtime_gpu_table = log_utils.create_table(
3481
+ ['GPU', 'TOTAL_GPUS', free_header])
3482
+ for gpu, stats in total_gpu_info.items():
3483
+ total_realtime_gpu_table.add_row([gpu, stats[0], stats[1]])
3484
+ else:
3485
+ total_realtime_gpu_table = None
3486
+
3487
+ return realtime_gpu_infos, total_realtime_gpu_table
3462
3488
 
3463
3489
  def _format_kubernetes_node_info(context: Optional[str]):
3464
3490
  node_table = log_utils.create_table(
@@ -3479,7 +3505,7 @@ def show_gpus(
3479
3505
  'Kubernetes per node accelerator availability ')
3480
3506
  if nodes_info.hint:
3481
3507
  k8s_per_node_acc_message += nodes_info.hint
3482
- return (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3508
+ return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
3483
3509
  f'{k8s_per_node_acc_message}'
3484
3510
  f'{colorama.Style.RESET_ALL}\n'
3485
3511
  f'{node_table.get_string()}')
@@ -3516,8 +3542,7 @@ def show_gpus(
3516
3542
  # If --cloud kubernetes is not specified, we want to catch
3517
3543
  # the case where no GPUs are available on the cluster and
3518
3544
  # print the warning at the end.
3519
- k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
3520
- context)
3545
+ k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables(context) # pylint: disable=line-too-long
3521
3546
  except ValueError as e:
3522
3547
  if not cloud_is_kubernetes:
3523
3548
  # Make it a note if cloud is not kubernetes
@@ -3525,13 +3550,24 @@ def show_gpus(
3525
3550
  k8s_messages += str(e)
3526
3551
  else:
3527
3552
  print_section_titles = True
3528
- context_str = f'(Context: {context})' if context else ''
3529
- yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3530
- f'Kubernetes GPUs {context_str}'
3531
- f'{colorama.Style.RESET_ALL}\n')
3532
- yield from k8s_realtime_table.get_string()
3533
- yield '\n\n'
3534
- yield _format_kubernetes_node_info(context)
3553
+
3554
+ # print total table
3555
+ if total_table is not None:
3556
+ yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
3557
+ 'Total Kubernetes GPUs'
3558
+ f'{colorama.Style.RESET_ALL}\n')
3559
+ yield from total_table.get_string()
3560
+ yield '\n-----\n\n'
3561
+
3562
+ # print individual infos.
3563
+ for (ctx, k8s_realtime_table) in k8s_realtime_infos:
3564
+ context_str = f'(Context: {ctx})' if ctx else ''
3565
+ yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3566
+ f'Kubernetes GPUs {context_str}'
3567
+ f'{colorama.Style.RESET_ALL}\n')
3568
+ yield from k8s_realtime_table.get_string()
3569
+ yield '\n\n'
3570
+ yield _format_kubernetes_node_info(ctx) + '\n-----\n\n'
3535
3571
  if kubernetes_autoscaling:
3536
3572
  k8s_messages += (
3537
3573
  '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
@@ -3620,13 +3656,29 @@ def show_gpus(
3620
3656
  # Print section title if not showing all and instead a specific
3621
3657
  # accelerator is requested
3622
3658
  print_section_titles = True
3623
- yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3624
- f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
3625
3659
  # TODO(romilb): Show filtered per node GPU availability here as well
3626
3660
  try:
3627
- k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
3628
- name_filter=name, quantity_filter=quantity)
3629
- yield from k8s_realtime_table.get_string()
3661
+ k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables( # pylint: disable=line-too-long
3662
+ context=region,
3663
+ name_filter=name,
3664
+ quantity_filter=quantity)
3665
+
3666
+ # print total table
3667
+ if total_table is not None:
3668
+ yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
3669
+ 'Total Kubernetes GPUs'
3670
+ f'{colorama.Style.RESET_ALL}\n')
3671
+ yield from total_table.get_string()
3672
+ yield '\n-----\n\n'
3673
+
3674
+ # print individual tables
3675
+ for (ctx, k8s_realtime_table) in k8s_realtime_infos:
3676
+ context_str = f'(Context: {ctx})' if ctx else ''
3677
+ yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3678
+ f'Kubernetes GPUs {context_str}'
3679
+ f'{colorama.Style.RESET_ALL}\n')
3680
+ yield from k8s_realtime_table.get_string()
3681
+ yield '\n\n'
3630
3682
  except ValueError as e:
3631
3683
  # In the case of a specific accelerator, show the error message
3632
3684
  # immediately (e.g., "Resources H100 not found ...")
@@ -5911,11 +5963,12 @@ def api_info():
5911
5963
  user_name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
5912
5964
  user_hash = common_utils.get_user_hash()
5913
5965
  dashboard_url = server_common.get_dashboard_url(url)
5914
- click.echo(f'Using SkyPilot API server: {url} Dashboard: {dashboard_url}\n'
5966
+ click.echo(f'Using SkyPilot API server: {url}\n'
5915
5967
  f'{ux_utils.INDENT_SYMBOL}Status: {api_server_info["status"]}, '
5916
5968
  f'commit: {api_server_info["commit"]}, '
5917
5969
  f'version: {api_server_info["version"]}\n'
5918
- f'{ux_utils.INDENT_LAST_SYMBOL}User: {user_name} ({user_hash})')
5970
+ f'{ux_utils.INDENT_SYMBOL}User: {user_name} ({user_hash})\n'
5971
+ f'{ux_utils.INDENT_LAST_SYMBOL}Dashboard: {dashboard_url}')
5919
5972
 
5920
5973
 
5921
5974
  def main():
sky/client/cli.py CHANGED
@@ -23,6 +23,7 @@ NOTE: the order of command definitions in this file corresponds to how they are
23
23
  listed in "sky --help". Take care to put logically connected commands close to
24
24
  each other.
25
25
  """
26
+ import collections
26
27
  import copy
27
28
  import datetime
28
29
  import functools
@@ -3413,7 +3414,7 @@ def show_gpus(
3413
3414
 
3414
3415
  # TODO(zhwu,romilb): We should move most of these kubernetes related
3415
3416
  # queries into the backend, especially behind the server.
3416
- def _get_kubernetes_realtime_gpu_table(
3417
+ def _get_kubernetes_realtime_gpu_tables(
3417
3418
  context: Optional[str] = None,
3418
3419
  name_filter: Optional[str] = None,
3419
3420
  quantity_filter: Optional[int] = None):
@@ -3423,15 +3424,14 @@ def show_gpus(
3423
3424
  else:
3424
3425
  qty_header = 'REQUESTABLE_QTY_PER_NODE'
3425
3426
  free_header = 'TOTAL_FREE_GPUS'
3426
- realtime_gpu_table = log_utils.create_table(
3427
- ['GPU', qty_header, 'TOTAL_GPUS', free_header])
3428
- realtime_gpu_availability_list = sdk.stream_and_get(
3427
+
3428
+ realtime_gpu_availability_lists = sdk.stream_and_get(
3429
3429
  sdk.realtime_kubernetes_gpu_availability(
3430
3430
  context=context,
3431
3431
  name_filter=name_filter,
3432
3432
  quantity_filter=quantity_filter))
3433
- if not realtime_gpu_availability_list:
3434
- err_msg = 'No GPUs found in Kubernetes cluster. '
3433
+ if not realtime_gpu_availability_lists:
3434
+ err_msg = 'No GPUs found in any allowed Kubernetes cluster. '
3435
3435
  debug_msg = 'To further debug, run: sky check '
3436
3436
  if name_filter is not None:
3437
3437
  gpu_info_msg = f' {name_filter!r}'
@@ -3439,26 +3439,52 @@ def show_gpus(
3439
3439
  gpu_info_msg += (' with requested quantity'
3440
3440
  f' {quantity_filter}')
3441
3441
  err_msg = (f'Resources{gpu_info_msg} not found '
3442
- 'in Kubernetes cluster. ')
3442
+ 'in any allowed Kubernetes cluster. ')
3443
3443
  debug_msg = ('To show available accelerators on kubernetes,'
3444
3444
  ' run: sky show-gpus --cloud kubernetes ')
3445
3445
  full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
3446
3446
  debug_msg)
3447
3447
  raise ValueError(full_err_msg)
3448
3448
  no_permissions_str = '<no permissions>'
3449
- for realtime_gpu_availability in sorted(realtime_gpu_availability_list):
3450
- gpu_availability = models.RealtimeGpuAvailability(
3451
- *realtime_gpu_availability)
3452
- available_qty = (gpu_availability.available
3453
- if gpu_availability.available != -1 else
3454
- no_permissions_str)
3455
- realtime_gpu_table.add_row([
3456
- gpu_availability.gpu,
3457
- _list_to_str(gpu_availability.counts),
3458
- gpu_availability.capacity,
3459
- available_qty,
3460
- ])
3461
- return realtime_gpu_table
3449
+ realtime_gpu_infos = []
3450
+ total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
3451
+ lambda: [0, 0])
3452
+
3453
+ for (ctx, availability_list) in realtime_gpu_availability_lists:
3454
+ realtime_gpu_table = log_utils.create_table(
3455
+ ['GPU', qty_header, 'TOTAL_GPUS', free_header])
3456
+ for realtime_gpu_availability in sorted(availability_list):
3457
+ gpu_availability = models.RealtimeGpuAvailability(
3458
+ *realtime_gpu_availability)
3459
+ available_qty = (gpu_availability.available
3460
+ if gpu_availability.available != -1 else
3461
+ no_permissions_str)
3462
+ realtime_gpu_table.add_row([
3463
+ gpu_availability.gpu,
3464
+ _list_to_str(gpu_availability.counts),
3465
+ gpu_availability.capacity,
3466
+ available_qty,
3467
+ ])
3468
+ gpu = gpu_availability.gpu
3469
+ capacity = gpu_availability.capacity
3470
+ # we want total, so skip permission denied.
3471
+ available = max(gpu_availability.available, 0)
3472
+ if capacity > 0:
3473
+ total_gpu_info[gpu][0] += capacity
3474
+ total_gpu_info[gpu][1] += available
3475
+ realtime_gpu_infos.append((ctx, realtime_gpu_table))
3476
+
3477
+ # display an aggregated table for all contexts
3478
+ # if there are more than one contexts with GPUs
3479
+ if len(realtime_gpu_infos) > 1:
3480
+ total_realtime_gpu_table = log_utils.create_table(
3481
+ ['GPU', 'TOTAL_GPUS', free_header])
3482
+ for gpu, stats in total_gpu_info.items():
3483
+ total_realtime_gpu_table.add_row([gpu, stats[0], stats[1]])
3484
+ else:
3485
+ total_realtime_gpu_table = None
3486
+
3487
+ return realtime_gpu_infos, total_realtime_gpu_table
3462
3488
 
3463
3489
  def _format_kubernetes_node_info(context: Optional[str]):
3464
3490
  node_table = log_utils.create_table(
@@ -3479,7 +3505,7 @@ def show_gpus(
3479
3505
  'Kubernetes per node accelerator availability ')
3480
3506
  if nodes_info.hint:
3481
3507
  k8s_per_node_acc_message += nodes_info.hint
3482
- return (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3508
+ return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
3483
3509
  f'{k8s_per_node_acc_message}'
3484
3510
  f'{colorama.Style.RESET_ALL}\n'
3485
3511
  f'{node_table.get_string()}')
@@ -3516,8 +3542,7 @@ def show_gpus(
3516
3542
  # If --cloud kubernetes is not specified, we want to catch
3517
3543
  # the case where no GPUs are available on the cluster and
3518
3544
  # print the warning at the end.
3519
- k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
3520
- context)
3545
+ k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables(context) # pylint: disable=line-too-long
3521
3546
  except ValueError as e:
3522
3547
  if not cloud_is_kubernetes:
3523
3548
  # Make it a note if cloud is not kubernetes
@@ -3525,13 +3550,24 @@ def show_gpus(
3525
3550
  k8s_messages += str(e)
3526
3551
  else:
3527
3552
  print_section_titles = True
3528
- context_str = f'(Context: {context})' if context else ''
3529
- yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3530
- f'Kubernetes GPUs {context_str}'
3531
- f'{colorama.Style.RESET_ALL}\n')
3532
- yield from k8s_realtime_table.get_string()
3533
- yield '\n\n'
3534
- yield _format_kubernetes_node_info(context)
3553
+
3554
+ # print total table
3555
+ if total_table is not None:
3556
+ yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
3557
+ 'Total Kubernetes GPUs'
3558
+ f'{colorama.Style.RESET_ALL}\n')
3559
+ yield from total_table.get_string()
3560
+ yield '\n-----\n\n'
3561
+
3562
+ # print individual infos.
3563
+ for (ctx, k8s_realtime_table) in k8s_realtime_infos:
3564
+ context_str = f'(Context: {ctx})' if ctx else ''
3565
+ yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3566
+ f'Kubernetes GPUs {context_str}'
3567
+ f'{colorama.Style.RESET_ALL}\n')
3568
+ yield from k8s_realtime_table.get_string()
3569
+ yield '\n\n'
3570
+ yield _format_kubernetes_node_info(ctx) + '\n-----\n\n'
3535
3571
  if kubernetes_autoscaling:
3536
3572
  k8s_messages += (
3537
3573
  '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
@@ -3620,13 +3656,29 @@ def show_gpus(
3620
3656
  # Print section title if not showing all and instead a specific
3621
3657
  # accelerator is requested
3622
3658
  print_section_titles = True
3623
- yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3624
- f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
3625
3659
  # TODO(romilb): Show filtered per node GPU availability here as well
3626
3660
  try:
3627
- k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
3628
- name_filter=name, quantity_filter=quantity)
3629
- yield from k8s_realtime_table.get_string()
3661
+ k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables( # pylint: disable=line-too-long
3662
+ context=region,
3663
+ name_filter=name,
3664
+ quantity_filter=quantity)
3665
+
3666
+ # print total table
3667
+ if total_table is not None:
3668
+ yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
3669
+ 'Total Kubernetes GPUs'
3670
+ f'{colorama.Style.RESET_ALL}\n')
3671
+ yield from total_table.get_string()
3672
+ yield '\n-----\n\n'
3673
+
3674
+ # print individual tables
3675
+ for (ctx, k8s_realtime_table) in k8s_realtime_infos:
3676
+ context_str = f'(Context: {ctx})' if ctx else ''
3677
+ yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
3678
+ f'Kubernetes GPUs {context_str}'
3679
+ f'{colorama.Style.RESET_ALL}\n')
3680
+ yield from k8s_realtime_table.get_string()
3681
+ yield '\n\n'
3630
3682
  except ValueError as e:
3631
3683
  # In the case of a specific accelerator, show the error message
3632
3684
  # immediately (e.g., "Resources H100 not found ...")
@@ -5911,11 +5963,12 @@ def api_info():
5911
5963
  user_name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
5912
5964
  user_hash = common_utils.get_user_hash()
5913
5965
  dashboard_url = server_common.get_dashboard_url(url)
5914
- click.echo(f'Using SkyPilot API server: {url} Dashboard: {dashboard_url}\n'
5966
+ click.echo(f'Using SkyPilot API server: {url}\n'
5915
5967
  f'{ux_utils.INDENT_SYMBOL}Status: {api_server_info["status"]}, '
5916
5968
  f'commit: {api_server_info["commit"]}, '
5917
5969
  f'version: {api_server_info["version"]}\n'
5918
- f'{ux_utils.INDENT_LAST_SYMBOL}User: {user_name} ({user_hash})')
5970
+ f'{ux_utils.INDENT_SYMBOL}User: {user_name} ({user_hash})\n'
5971
+ f'{ux_utils.INDENT_LAST_SYMBOL}Dashboard: {dashboard_url}')
5919
5972
 
5920
5973
 
5921
5974
  def main():
sky/client/sdk.py CHANGED
@@ -1840,6 +1840,7 @@ def api_login(endpoint: Optional[str] = None) -> None:
1840
1840
  dashboard_url = server_common.get_dashboard_url(endpoint)
1841
1841
  dashboard_msg = f'Dashboard: {dashboard_url}'
1842
1842
  click.secho(
1843
- f'Logged in to SkyPilot API server at {endpoint}.'
1844
- f' {dashboard_msg}',
1843
+ f'Logged into SkyPilot API server at: {endpoint}'
1844
+ f'\n{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
1845
+ f'{dashboard_msg}',
1845
1846
  fg='green')
sky/clouds/cloud.py CHANGED
@@ -418,13 +418,16 @@ class Cloud:
418
418
  try:
419
419
  self.check_features_are_supported(resources,
420
420
  resources_required_features)
421
- except exceptions.NotSupportedError:
421
+ except exceptions.NotSupportedError as e:
422
422
  # TODO(zhwu): The resources are now silently filtered out. We
423
423
  # should have some logging telling the user why the resources
424
424
  # are not considered.
425
+ # UPDATE(kyuds): passing in NotSupportedError reason string
426
+ # to hint for issue #5344. Did not remove above comment as
427
+ # reason is not displayed when other resources are valid.
425
428
  return resources_utils.FeasibleResources(resources_list=[],
426
429
  fuzzy_candidate_list=[],
427
- hint=None)
430
+ hint=str(e))
428
431
  return self._get_feasible_launchable_resources(resources)
429
432
 
430
433
  def _get_feasible_launchable_resources(
sky/clouds/kubernetes.py CHANGED
@@ -454,7 +454,7 @@ class Kubernetes(clouds.Cloud):
454
454
  self.IMAGE_CPU, clouds='kubernetes')
455
455
 
456
456
  k8s_acc_label_key = None
457
- k8s_acc_label_value = None
457
+ k8s_acc_label_values = None
458
458
  k8s_topology_label_key = None
459
459
  k8s_topology_label_value = None
460
460
  k8s_resource_key = None
@@ -462,9 +462,9 @@ class Kubernetes(clouds.Cloud):
462
462
 
463
463
  # If GPU/TPUs are requested, set node label to match the GPU/TPU type.
464
464
  if acc_count > 0 and acc_type is not None:
465
- (k8s_acc_label_key, k8s_acc_label_value, k8s_topology_label_key,
465
+ (k8s_acc_label_key, k8s_acc_label_values, k8s_topology_label_key,
466
466
  k8s_topology_label_value) = (
467
- kubernetes_utils.get_accelerator_label_key_value(
467
+ kubernetes_utils.get_accelerator_label_key_values(
468
468
  context, acc_type, acc_count))
469
469
  if (k8s_acc_label_key ==
470
470
  kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY):
@@ -562,7 +562,7 @@ class Kubernetes(clouds.Cloud):
562
562
  'k8s_networking_mode': network_utils.get_networking_mode().value,
563
563
  'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME,
564
564
  'k8s_acc_label_key': k8s_acc_label_key,
565
- 'k8s_acc_label_value': k8s_acc_label_value,
565
+ 'k8s_acc_label_values': k8s_acc_label_values,
566
566
  'k8s_ssh_jump_name': self.SKY_SSH_JUMP_NAME,
567
567
  'k8s_ssh_jump_image': ssh_jump_image,
568
568
  'k8s_service_account_name': k8s_service_account_name,