skypilot-nightly 1.0.0.dev20250427__py3-none-any.whl → 1.0.0.dev20250429__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +28 -40
  3. sky/backends/backend_utils.py +19 -2
  4. sky/backends/cloud_vm_ray_backend.py +33 -8
  5. sky/backends/local_docker_backend.py +1 -2
  6. sky/cli.py +91 -38
  7. sky/client/cli.py +91 -38
  8. sky/client/sdk.py +3 -2
  9. sky/clouds/aws.py +12 -6
  10. sky/clouds/azure.py +3 -0
  11. sky/clouds/cloud.py +8 -2
  12. sky/clouds/cudo.py +2 -0
  13. sky/clouds/do.py +3 -0
  14. sky/clouds/fluidstack.py +3 -0
  15. sky/clouds/gcp.py +7 -0
  16. sky/clouds/ibm.py +2 -0
  17. sky/clouds/kubernetes.py +42 -19
  18. sky/clouds/lambda_cloud.py +1 -0
  19. sky/clouds/nebius.py +18 -10
  20. sky/clouds/oci.py +6 -3
  21. sky/clouds/paperspace.py +2 -0
  22. sky/clouds/runpod.py +2 -0
  23. sky/clouds/scp.py +2 -0
  24. sky/clouds/service_catalog/constants.py +1 -1
  25. sky/clouds/service_catalog/kubernetes_catalog.py +7 -7
  26. sky/clouds/vast.py +2 -0
  27. sky/clouds/vsphere.py +2 -0
  28. sky/core.py +58 -29
  29. sky/dashboard/out/404.html +1 -1
  30. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  31. sky/dashboard/out/clusters/[cluster].html +1 -1
  32. sky/dashboard/out/clusters.html +1 -1
  33. sky/dashboard/out/favicon.ico +0 -0
  34. sky/dashboard/out/index.html +1 -1
  35. sky/dashboard/out/jobs/[job].html +1 -1
  36. sky/dashboard/out/jobs.html +1 -1
  37. sky/exceptions.py +6 -0
  38. sky/execution.py +19 -4
  39. sky/global_user_state.py +1 -0
  40. sky/optimizer.py +35 -11
  41. sky/provision/common.py +2 -5
  42. sky/provision/docker_utils.py +22 -16
  43. sky/provision/instance_setup.py +1 -1
  44. sky/provision/kubernetes/instance.py +276 -93
  45. sky/provision/kubernetes/network.py +1 -1
  46. sky/provision/kubernetes/utils.py +36 -24
  47. sky/provision/provisioner.py +6 -0
  48. sky/serve/replica_managers.py +51 -5
  49. sky/serve/serve_state.py +41 -0
  50. sky/serve/service.py +108 -63
  51. sky/server/common.py +6 -3
  52. sky/server/config.py +184 -0
  53. sky/server/requests/executor.py +17 -156
  54. sky/server/server.py +4 -4
  55. sky/setup_files/dependencies.py +0 -1
  56. sky/skylet/constants.py +7 -0
  57. sky/skypilot_config.py +27 -6
  58. sky/task.py +1 -1
  59. sky/templates/kubernetes-ray.yml.j2 +145 -15
  60. sky/templates/nebius-ray.yml.j2 +63 -0
  61. sky/utils/command_runner.py +17 -3
  62. sky/utils/command_runner.pyi +2 -0
  63. sky/utils/controller_utils.py +24 -0
  64. sky/utils/kubernetes/rsync_helper.sh +20 -4
  65. sky/utils/schemas.py +13 -0
  66. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/METADATA +2 -2
  67. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/RECORD +73 -72
  68. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/WHEEL +1 -1
  69. /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_buildManifest.js +0 -0
  70. /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_ssgManifest.js +0 -0
  71. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/entry_points.txt +0 -0
  72. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/licenses/LICENSE +0 -0
  73. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
  import urllib.request

  # Replaced with the current commit when building the wheels.
- _SKYPILOT_COMMIT_SHA = '966b4b5f431d8912d7fecd151b5e069bca0b9f13'
+ _SKYPILOT_COMMIT_SHA = '7b804dafe2f6b775f8a357ac6e147b83e792af93'


  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


  __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev20250427'
+ __version__ = '1.0.0.dev20250429'
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))

sky/adaptors/nebius.py CHANGED
@@ -29,11 +29,6 @@ MAX_RETRIES_TO_INSTANCE_WAIT = 120  # Maximum number of retries

  POLL_INTERVAL = 5

- _iam_token = None
- _sdk = None
- _tenant_id = None
- _project_id = None
-
  _IMPORT_ERROR_MESSAGE = ('Failed to import dependencies for Nebius AI Cloud.'
                           'Try pip install "skypilot[nebius]"')

@@ -81,56 +76,49 @@ def vpc():
      return vpc_v1


+ @annotations.lru_cache(scope='request')
  def get_iam_token():
-     global _iam_token
-     if _iam_token is None:
-         try:
-             with open(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH),
-                       encoding='utf-8') as file:
-                 _iam_token = file.read().strip()
-         except FileNotFoundError:
-             return None
-     return _iam_token
+     try:
+         with open(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH),
+                   encoding='utf-8') as file:
+             return file.read().strip()
+     except FileNotFoundError:
+         return None


+ @annotations.lru_cache(scope='request')
  def is_token_or_cred_file_exist():
      return (os.path.exists(os.path.expanduser(NEBIUS_IAM_TOKEN_PATH)) or
              os.path.exists(os.path.expanduser(NEBIUS_CREDENTIALS_PATH)))


+ @annotations.lru_cache(scope='request')
  def get_project_id():
-     global _project_id
-     if _project_id is None:
-         try:
-             with open(os.path.expanduser(NEBIUS_PROJECT_ID_PATH),
-                       encoding='utf-8') as file:
-                 _project_id = file.read().strip()
-         except FileNotFoundError:
-             return None
-     return _project_id
+     try:
+         with open(os.path.expanduser(NEBIUS_PROJECT_ID_PATH),
+                   encoding='utf-8') as file:
+             return file.read().strip()
+     except FileNotFoundError:
+         return None


+ @annotations.lru_cache(scope='request')
  def get_tenant_id():
-     global _tenant_id
-     if _tenant_id is None:
-         try:
-             with open(os.path.expanduser(NEBIUS_TENANT_ID_PATH),
-                       encoding='utf-8') as file:
-                 _tenant_id = file.read().strip()
-         except FileNotFoundError:
-             return None
-     return _tenant_id
+     try:
+         with open(os.path.expanduser(NEBIUS_TENANT_ID_PATH),
+                   encoding='utf-8') as file:
+             return file.read().strip()
+     except FileNotFoundError:
+         return None


+ @annotations.lru_cache(scope='request')
  def sdk():
-     global _sdk
-     if _sdk is None:
-         if get_iam_token() is not None:
-             _sdk = nebius.sdk.SDK(credentials=get_iam_token())
-             return _sdk
-         _sdk = nebius.sdk.SDK(
-             credentials_file_name=os.path.expanduser(NEBIUS_CREDENTIALS_PATH))
-     return _sdk
+     token = get_iam_token()
+     if token is not None:
+         return nebius.sdk.SDK(credentials=token)
+     return nebius.sdk.SDK(
+         credentials_file_name=os.path.expanduser(NEBIUS_CREDENTIALS_PATH))


  def get_nebius_credentials(boto3_session):
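
Note on the hunks above: the refactor drops the hand-rolled module-level caches (_iam_token, _sdk, _tenant_id, _project_id) in favor of annotations.lru_cache(scope='request'), which memoizes each getter for the duration of an API-server request instead of the whole process. A minimal sketch of the pattern, using the stdlib functools.lru_cache as a stand-in for SkyPilot's request-scoped decorator; the path constant here is illustrative, not the real NEBIUS_IAM_TOKEN_PATH:

import functools
import os

_TOKEN_PATH = '~/.nebius/token'  # illustrative stand-in for NEBIUS_IAM_TOKEN_PATH


@functools.lru_cache(maxsize=None)  # stand-in for annotations.lru_cache(scope='request')
def get_iam_token():
    # The first call reads the file; repeated calls return the cached value,
    # replacing the old `global _iam_token` bookkeeping.
    try:
        with open(os.path.expanduser(_TOKEN_PATH), encoding='utf-8') as f:
            return f.read().strip()
    except FileNotFoundError:
        return None
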
sky/backends/backend_utils.py CHANGED
@@ -179,6 +179,9 @@ _RAY_YAML_KEYS_TO_RESTORE_EXCEPTIONS = [
      ('available_node_types', 'ray.head.default', 'node_config', 'UserData'),
      ('available_node_types', 'ray.head.default', 'node_config',
       'azure_arm_parameters', 'cloudInitSetupCommands'),
+     ('available_node_types', 'ray_head_default', 'node_config', 'pvc_spec'),
+     ('available_node_types', 'ray_head_default', 'node_config',
+      'deployment_spec'),
  ]
  # These keys are expected to change when provisioning on an existing cluster,
  # but they don't actually represent a change that requires re-provisioning the
@@ -705,6 +708,13 @@ def write_cluster_config(
      is_custom_docker = ('true' if to_provision.extract_docker_image()
                          is not None else 'false')

+     # Here, if users specify the controller to be high availability, we will
+     # provision a high availability controller. Whether the cloud supports
+     # this feature has been checked by
+     # CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS
+     high_availability_specified = controller_utils.high_availability_specified(
+         cluster_name_on_cloud)
+
      # Use a tmp file path to avoid incomplete YAML file being re-used in the
      # future.
      tmp_yaml_path = yaml_path + '.tmp'
@@ -790,6 +800,9 @@
              'sky_wheel_hash': wheel_hash,
              # Authentication (optional).
              **auth_config,
+
+             # High availability
+             'high_availability': high_availability_specified,
          }),
          output_path=tmp_yaml_path)
      config_dict['cluster_name'] = cluster_name
@@ -802,8 +815,12 @@
          cluster_config_overrides=to_provision.cluster_config_overrides)
      kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
      yaml_obj = common_utils.read_yaml(tmp_yaml_path)
-     pod_config = yaml_obj['available_node_types']['ray_head_default'][
-         'node_config']
+     pod_config: Dict[str, Any] = yaml_obj['available_node_types'][
+         'ray_head_default']['node_config']
+
+     # Check pod spec only. For high availability controllers, we deploy pvc & deployment for the controller. Read kubernetes-ray.yml.j2 for more details.
+     pod_config.pop('deployment_spec', None)
+     pod_config.pop('pvc_spec', None)
      valid, message = kubernetes_utils.check_pod_config(pod_config)
      if not valid:
          raise exceptions.InvalidCloudConfigs(
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -1583,6 +1583,10 @@ class RetryingVmProvisioner(object):
          except provision_common.StopFailoverError:
              with ux_utils.print_exception_no_traceback():
                  raise
+         except exceptions.InconsistentHighAvailabilityError:
+             # No teardown happens for this error.
+             with ux_utils.print_exception_no_traceback():
+                 raise
          except Exception as e:  # pylint: disable=broad-except
              # NOTE: We try to cleanup the cluster even if the previous
              # cluster does not exist. Also we are fast at
@@ -2032,6 +2036,7 @@ class RetryingVmProvisioner(object):
          # Recheck cluster name as the 'except:' block below may
          # change the cloud assignment.
          common_utils.check_cluster_name_is_valid(cluster_name)
+
          if dryrun:
              cloud_user = None
          else:
@@ -2459,6 +2464,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                  'Tried to use cached cluster info, but it\'s missing for '
                  f'cluster "{self.cluster_name}"')
          self._update_cluster_info()
+
          assert self.cached_cluster_info is not None, self
          runners = provision_lib.get_command_runners(
              self.cached_cluster_info.provider_name, self.cached_cluster_info,
@@ -2689,6 +2695,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
              self._optimize_target) or common.OptimizeTarget.COST
          self._requested_features = kwargs.pop('requested_features',
                                                self._requested_features)
+         self._dump_final_script = kwargs.pop('dump_final_script', False)
          assert not kwargs, f'Unexpected kwargs: {kwargs}'

      def check_resources_fit_cluster(
@@ -3272,18 +3279,30 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
              env_vars=setup_envs)
          encoded_script = shlex.quote(setup_script)

-         def _dump_setup_script(setup_script: str) -> None:
+         def _dump_final_script(
+                 setup_script: str,
+                 target_dir: str = remote_setup_file_name) -> None:
              with tempfile.NamedTemporaryFile('w', prefix='sky_setup_') as f:
                  f.write(setup_script)
                  f.flush()
                  setup_sh_path = f.name
                  runner.rsync(source=setup_sh_path,
-                              target=remote_setup_file_name,
+                              target=target_dir,
                               up=True,
                               stream_logs=False)

+         # Always dump the full setup script to the persistent path first
+         # In high availability mode, we need to dump the full setup script
+         # to a persistent path BEFORE any other operations. This ensures
+         # that if the pod restarts, it can find and execute the complete
+         # setup script, rather than a reference to a temporary file that
+         # would no longer exist after restart.
+         if self._dump_final_script:
+             _dump_final_script(setup_script,
+                                constants.PERSISTENT_SETUP_SCRIPT_PATH)
+
          if detach_setup or _is_command_length_over_limit(encoded_script):
-             _dump_setup_script(setup_script)
+             _dump_final_script(setup_script)
              create_script_code = 'true'
          else:
              create_script_code = (f'{{ echo {encoded_script} > '
@@ -3335,7 +3354,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                      'Failed to run setup command inline due to '
                      'command length limit. Dumping setup script to '
                      'file and running it with SSH.')
-                 _dump_setup_script(setup_script)
+                 _dump_final_script(setup_script)
                  returncode = _run_setup(setup_cmd)

          def error_message() -> str:
@@ -3426,14 +3445,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
          code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
          job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])

-         def _dump_code_to_file(codegen: str) -> None:
+         def _dump_code_to_file(codegen: str,
+                                target_dir: str = SKY_REMOTE_APP_DIR) -> None:
              runners = handle.get_command_runners()
              head_runner = runners[0]
              with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
                  fp.write(codegen)
                  fp.flush()
-                 script_path = os.path.join(SKY_REMOTE_APP_DIR,
-                                            f'sky_job_{job_id}')
+                 script_path = os.path.join(target_dir, f'sky_job_{job_id}')
                  # We choose to sync code + exec, because the alternative of 'ray
                  # submit' may not work as it may use system python (python2) to
                  # execute the script. Happens for AWS.
@@ -3442,6 +3461,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                               up=True,
                               stream_logs=False)

+         # Should also be earlier than _is_command_length_over_limit
+         # Same reason as in _setup
+         if self._dump_final_script:
+             _dump_code_to_file(job_submit_cmd,
+                                constants.PERSISTENT_RUN_SCRIPT_DIR)
+
          if _is_command_length_over_limit(job_submit_cmd):
              _dump_code_to_file(codegen)
              job_submit_cmd = f'{mkdir_code} && {code}'
@@ -3457,7 +3482,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
          # We cannot set the managed job to PENDING state in the job template
          # (jobs-controller.yaml.j2), as it may need to wait for the run
          # commands to be scheduled on the job controller in high-load cases.
-         job_submit_cmd = job_submit_cmd + ' && ' + managed_job_code
+         job_submit_cmd += ' && ' + managed_job_code

          returncode, stdout, stderr = self.run_on_head(handle,
                                                        job_submit_cmd,
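
The dump_final_script plumbing above supports high-availability controllers: the complete setup and run scripts are rsynced to persistent paths before any command-length checks, so a restarted controller pod can replay them even after the original temp files are gone. A rough sketch of that recovery contract, with a hypothetical path standing in for constants.PERSISTENT_SETUP_SCRIPT_PATH:

import subprocess

PERSISTENT_SETUP_PATH = '/persistent/setup.sh'  # hypothetical path


def run_setup(setup_script: str, high_availability: bool) -> None:
    if high_availability:
        # Persist the full script body, not a pointer to a temp file,
        # so a restarted pod can find and re-execute it.
        with open(PERSISTENT_SETUP_PATH, 'w', encoding='utf-8') as f:
            f.write(setup_script)
        subprocess.run(['bash', PERSISTENT_SETUP_PATH], check=True)
    else:
        subprocess.run(['bash', '-c', setup_script], check=True)
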
sky/backends/local_docker_backend.py CHANGED
@@ -276,7 +276,6 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
                    detach_run: bool,
                    dryrun: bool = False) -> None:
          """ Launches the container."""
-
          if detach_run:
              raise NotImplementedError('detach_run=True is not supported in '
                                        'LocalDockerBackend.')
@@ -364,7 +363,7 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
              if k.startswith(_DOCKER_LABEL_PREFIX):
                  # Remove 'skymeta_' from key
                  metadata[k[len(_DOCKER_LABEL_PREFIX):]] = v
-             self.images[c.name] = [c.image, metadata]
+             self.images[c.name] = (c.image, metadata)
              self.containers[c.name] = c

      def _execute_task_one_node(self, handle: LocalDockerResourceHandle,
sky/cli.py CHANGED
@@ -23,6 +23,7 @@ NOTE: the order of command definitions in this file corresponds to how they are
  listed in "sky --help". Take care to put logically connected commands close to
  each other.
  """
+ import collections
  import copy
  import datetime
  import functools
@@ -162,7 +163,7 @@ def _get_cluster_records_and_set_ssh_config(
                  '-o StrictHostKeyChecking=no '
                  '-o UserKnownHostsFile=/dev/null '
                  '-o IdentitiesOnly=yes '
-                 '-W %h:%p '
+                 '-W \'[%h]:%p\' '
                  f'{handle.ssh_user}@127.0.0.1 '
                  '-o ProxyCommand='
                  # TODO(zhwu): write the template to a temp file, don't use
@@ -3413,7 +3414,7 @@ def show_gpus(

      # TODO(zhwu,romilb): We should move most of these kubernetes related
      # queries into the backend, especially behind the server.
-     def _get_kubernetes_realtime_gpu_table(
+     def _get_kubernetes_realtime_gpu_tables(
              context: Optional[str] = None,
              name_filter: Optional[str] = None,
              quantity_filter: Optional[int] = None):
@@ -3423,15 +3424,14 @@ def show_gpus(
          else:
              qty_header = 'REQUESTABLE_QTY_PER_NODE'
              free_header = 'TOTAL_FREE_GPUS'
-         realtime_gpu_table = log_utils.create_table(
-             ['GPU', qty_header, 'TOTAL_GPUS', free_header])
-         realtime_gpu_availability_list = sdk.stream_and_get(
+
+         realtime_gpu_availability_lists = sdk.stream_and_get(
              sdk.realtime_kubernetes_gpu_availability(
                  context=context,
                  name_filter=name_filter,
                  quantity_filter=quantity_filter))
-         if not realtime_gpu_availability_list:
-             err_msg = 'No GPUs found in Kubernetes cluster. '
+         if not realtime_gpu_availability_lists:
+             err_msg = 'No GPUs found in any allowed Kubernetes cluster. '
              debug_msg = 'To further debug, run: sky check '
              if name_filter is not None:
                  gpu_info_msg = f' {name_filter!r}'
@@ -3439,26 +3439,52 @@ def show_gpus(
                      gpu_info_msg += (' with requested quantity'
                                       f' {quantity_filter}')
                  err_msg = (f'Resources{gpu_info_msg} not found '
-                            'in Kubernetes cluster. ')
+                            'in any allowed Kubernetes cluster. ')
                  debug_msg = ('To show available accelerators on kubernetes,'
                               ' run: sky show-gpus --cloud kubernetes ')
              full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
                              debug_msg)
              raise ValueError(full_err_msg)
          no_permissions_str = '<no permissions>'
-         for realtime_gpu_availability in sorted(realtime_gpu_availability_list):
-             gpu_availability = models.RealtimeGpuAvailability(
-                 *realtime_gpu_availability)
-             available_qty = (gpu_availability.available
-                              if gpu_availability.available != -1 else
-                              no_permissions_str)
-             realtime_gpu_table.add_row([
-                 gpu_availability.gpu,
-                 _list_to_str(gpu_availability.counts),
-                 gpu_availability.capacity,
-                 available_qty,
-             ])
-         return realtime_gpu_table
+         realtime_gpu_infos = []
+         total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
+             lambda: [0, 0])
+
+         for (ctx, availability_list) in realtime_gpu_availability_lists:
+             realtime_gpu_table = log_utils.create_table(
+                 ['GPU', qty_header, 'TOTAL_GPUS', free_header])
+             for realtime_gpu_availability in sorted(availability_list):
+                 gpu_availability = models.RealtimeGpuAvailability(
+                     *realtime_gpu_availability)
+                 available_qty = (gpu_availability.available
+                                  if gpu_availability.available != -1 else
+                                  no_permissions_str)
+                 realtime_gpu_table.add_row([
+                     gpu_availability.gpu,
+                     _list_to_str(gpu_availability.counts),
+                     gpu_availability.capacity,
+                     available_qty,
+                 ])
+                 gpu = gpu_availability.gpu
+                 capacity = gpu_availability.capacity
+                 # we want total, so skip permission denied.
+                 available = max(gpu_availability.available, 0)
+                 if capacity > 0:
+                     total_gpu_info[gpu][0] += capacity
+                     total_gpu_info[gpu][1] += available
+             realtime_gpu_infos.append((ctx, realtime_gpu_table))
+
+         # display an aggregated table for all contexts
+         # if there are more than one contexts with GPUs
+         if len(realtime_gpu_infos) > 1:
+             total_realtime_gpu_table = log_utils.create_table(
+                 ['GPU', 'TOTAL_GPUS', free_header])
+             for gpu, stats in total_gpu_info.items():
+                 total_realtime_gpu_table.add_row([gpu, stats[0], stats[1]])
+         else:
+             total_realtime_gpu_table = None
+
+         return realtime_gpu_infos, total_realtime_gpu_table

      def _format_kubernetes_node_info(context: Optional[str]):
          node_table = log_utils.create_table(
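
The loop above builds one table per Kubernetes context while accumulating per-GPU totals in a defaultdict(lambda: [0, 0]) of [capacity, available] pairs, treating permission-denied counts (reported as -1) as zero. A self-contained sketch of that aggregation, with made-up sample data:

import collections

# (context, [(gpu, capacity, available)]); available == -1 means no permission.
availability_lists = [
    ('ctx-a', [('H100', 8, 4), ('A100', 4, -1)]),
    ('ctx-b', [('H100', 8, 8)]),
]

total_gpu_info = collections.defaultdict(lambda: [0, 0])  # gpu -> [capacity, free]
for _ctx, gpus in availability_lists:
    for gpu, capacity, available in gpus:
        if capacity > 0:
            total_gpu_info[gpu][0] += capacity
            total_gpu_info[gpu][1] += max(available, 0)  # skip permission-denied

print(dict(total_gpu_info))  # {'H100': [16, 12], 'A100': [4, 0]}
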
@@ -3479,7 +3505,7 @@ def show_gpus(
                  'Kubernetes per node accelerator availability ')
          if nodes_info.hint:
              k8s_per_node_acc_message += nodes_info.hint
-         return (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+         return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
                  f'{k8s_per_node_acc_message}'
                  f'{colorama.Style.RESET_ALL}\n'
                  f'{node_table.get_string()}')
@@ -3516,8 +3542,7 @@ def show_gpus(
              # If --cloud kubernetes is not specified, we want to catch
              # the case where no GPUs are available on the cluster and
              # print the warning at the end.
-             k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
-                 context)
+             k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables(context)  # pylint: disable=line-too-long
          except ValueError as e:
              if not cloud_is_kubernetes:
                  # Make it a note if cloud is not kubernetes
@@ -3525,13 +3550,24 @@ def show_gpus(
                  k8s_messages += str(e)
          else:
              print_section_titles = True
-             context_str = f'(Context: {context})' if context else ''
-             yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-                    f'Kubernetes GPUs {context_str}'
-                    f'{colorama.Style.RESET_ALL}\n')
-             yield from k8s_realtime_table.get_string()
-             yield '\n\n'
-             yield _format_kubernetes_node_info(context)
+
+             # print total table
+             if total_table is not None:
+                 yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+                        'Total Kubernetes GPUs'
+                        f'{colorama.Style.RESET_ALL}\n')
+                 yield from total_table.get_string()
+                 yield '\n-----\n\n'
+
+             # print individual infos.
+             for (ctx, k8s_realtime_table) in k8s_realtime_infos:
+                 context_str = f'(Context: {ctx})' if ctx else ''
+                 yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                        f'Kubernetes GPUs {context_str}'
+                        f'{colorama.Style.RESET_ALL}\n')
+                 yield from k8s_realtime_table.get_string()
+                 yield '\n\n'
+                 yield _format_kubernetes_node_info(ctx) + '\n-----\n\n'
          if kubernetes_autoscaling:
              k8s_messages += (
                  '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
@@ -3620,13 +3656,29 @@ def show_gpus(
          # Print section title if not showing all and instead a specific
          # accelerator is requested
          print_section_titles = True
-         yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-                f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
          # TODO(romilb): Show filtered per node GPU availability here as well
          try:
-             k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
-                 name_filter=name, quantity_filter=quantity)
-             yield from k8s_realtime_table.get_string()
+             k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables(  # pylint: disable=line-too-long
+                 context=region,
+                 name_filter=name,
+                 quantity_filter=quantity)
+
+             # print total table
+             if total_table is not None:
+                 yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+                        'Total Kubernetes GPUs'
+                        f'{colorama.Style.RESET_ALL}\n')
+                 yield from total_table.get_string()
+                 yield '\n-----\n\n'
+
+             # print individual tables
+             for (ctx, k8s_realtime_table) in k8s_realtime_infos:
+                 context_str = f'(Context: {ctx})' if ctx else ''
+                 yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                        f'Kubernetes GPUs {context_str}'
+                        f'{colorama.Style.RESET_ALL}\n')
+                 yield from k8s_realtime_table.get_string()
+                 yield '\n\n'
          except ValueError as e:
              # In the case of a specific accelerator, show the error message
              # immediately (e.g., "Resources H100 not found ...")
@@ -5911,11 +5963,12 @@ def api_info():
      user_name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
      user_hash = common_utils.get_user_hash()
      dashboard_url = server_common.get_dashboard_url(url)
-     click.echo(f'Using SkyPilot API server: {url} Dashboard: {dashboard_url}\n'
+     click.echo(f'Using SkyPilot API server: {url}\n'
                 f'{ux_utils.INDENT_SYMBOL}Status: {api_server_info["status"]}, '
                 f'commit: {api_server_info["commit"]}, '
                 f'version: {api_server_info["version"]}\n'
-                f'{ux_utils.INDENT_LAST_SYMBOL}User: {user_name} ({user_hash})')
+                f'{ux_utils.INDENT_SYMBOL}User: {user_name} ({user_hash})\n'
+                f'{ux_utils.INDENT_LAST_SYMBOL}Dashboard: {dashboard_url}')


  def main():