skypilot-nightly 1.0.0.dev20250427__py3-none-any.whl → 1.0.0.dev20250429__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +28 -40
  3. sky/backends/backend_utils.py +19 -2
  4. sky/backends/cloud_vm_ray_backend.py +33 -8
  5. sky/backends/local_docker_backend.py +1 -2
  6. sky/cli.py +91 -38
  7. sky/client/cli.py +91 -38
  8. sky/client/sdk.py +3 -2
  9. sky/clouds/aws.py +12 -6
  10. sky/clouds/azure.py +3 -0
  11. sky/clouds/cloud.py +8 -2
  12. sky/clouds/cudo.py +2 -0
  13. sky/clouds/do.py +3 -0
  14. sky/clouds/fluidstack.py +3 -0
  15. sky/clouds/gcp.py +7 -0
  16. sky/clouds/ibm.py +2 -0
  17. sky/clouds/kubernetes.py +42 -19
  18. sky/clouds/lambda_cloud.py +1 -0
  19. sky/clouds/nebius.py +18 -10
  20. sky/clouds/oci.py +6 -3
  21. sky/clouds/paperspace.py +2 -0
  22. sky/clouds/runpod.py +2 -0
  23. sky/clouds/scp.py +2 -0
  24. sky/clouds/service_catalog/constants.py +1 -1
  25. sky/clouds/service_catalog/kubernetes_catalog.py +7 -7
  26. sky/clouds/vast.py +2 -0
  27. sky/clouds/vsphere.py +2 -0
  28. sky/core.py +58 -29
  29. sky/dashboard/out/404.html +1 -1
  30. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  31. sky/dashboard/out/clusters/[cluster].html +1 -1
  32. sky/dashboard/out/clusters.html +1 -1
  33. sky/dashboard/out/favicon.ico +0 -0
  34. sky/dashboard/out/index.html +1 -1
  35. sky/dashboard/out/jobs/[job].html +1 -1
  36. sky/dashboard/out/jobs.html +1 -1
  37. sky/exceptions.py +6 -0
  38. sky/execution.py +19 -4
  39. sky/global_user_state.py +1 -0
  40. sky/optimizer.py +35 -11
  41. sky/provision/common.py +2 -5
  42. sky/provision/docker_utils.py +22 -16
  43. sky/provision/instance_setup.py +1 -1
  44. sky/provision/kubernetes/instance.py +276 -93
  45. sky/provision/kubernetes/network.py +1 -1
  46. sky/provision/kubernetes/utils.py +36 -24
  47. sky/provision/provisioner.py +6 -0
  48. sky/serve/replica_managers.py +51 -5
  49. sky/serve/serve_state.py +41 -0
  50. sky/serve/service.py +108 -63
  51. sky/server/common.py +6 -3
  52. sky/server/config.py +184 -0
  53. sky/server/requests/executor.py +17 -156
  54. sky/server/server.py +4 -4
  55. sky/setup_files/dependencies.py +0 -1
  56. sky/skylet/constants.py +7 -0
  57. sky/skypilot_config.py +27 -6
  58. sky/task.py +1 -1
  59. sky/templates/kubernetes-ray.yml.j2 +145 -15
  60. sky/templates/nebius-ray.yml.j2 +63 -0
  61. sky/utils/command_runner.py +17 -3
  62. sky/utils/command_runner.pyi +2 -0
  63. sky/utils/controller_utils.py +24 -0
  64. sky/utils/kubernetes/rsync_helper.sh +20 -4
  65. sky/utils/schemas.py +13 -0
  66. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/METADATA +2 -2
  67. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/RECORD +73 -72
  68. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/WHEEL +1 -1
  69. /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_buildManifest.js +0 -0
  70. /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_ssgManifest.js +0 -0
  71. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/entry_points.txt +0 -0
  72. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/licenses/LICENSE +0 -0
  73. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/top_level.txt +0 -0
sky/client/cli.py CHANGED
@@ -23,6 +23,7 @@ NOTE: the order of command definitions in this file corresponds to how they are
 listed in "sky --help". Take care to put logically connected commands close to
 each other.
 """
+import collections
 import copy
 import datetime
 import functools
@@ -162,7 +163,7 @@ def _get_cluster_records_and_set_ssh_config(
         '-o StrictHostKeyChecking=no '
         '-o UserKnownHostsFile=/dev/null '
         '-o IdentitiesOnly=yes '
-        '-W %h:%p '
+        '-W \'[%h]:%p\' '
         f'{handle.ssh_user}@127.0.0.1 '
         '-o ProxyCommand='
         # TODO(zhwu): write the template to a temp file, don't use
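
Note on the -W change above: OpenSSH splits the -W destination on the last colon, so a bare IPv6 literal in %h would be misparsed; bracketing the host token is harmless for IPv4 addresses and hostnames and required for IPv6. A minimal sketch of the idea (the helper name is hypothetical, not part of the diff):

    def forward_destination(host: str, port: int) -> str:
        # Brackets disambiguate host from port when the host contains colons.
        return f"-W '[{host}]:{port}'"

    assert forward_destination('::1', 22) == "-W '[::1]:22'"
    assert forward_destination('10.0.0.5', 22) == "-W '[10.0.0.5]:22'"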
@@ -3413,7 +3414,7 @@ show_gpus(

     # TODO(zhwu,romilb): We should move most of these kubernetes related
     # queries into the backend, especially behind the server.
-    def _get_kubernetes_realtime_gpu_table(
+    def _get_kubernetes_realtime_gpu_tables(
             context: Optional[str] = None,
             name_filter: Optional[str] = None,
             quantity_filter: Optional[int] = None):
@@ -3423,15 +3424,14 @@ show_gpus(
         else:
             qty_header = 'REQUESTABLE_QTY_PER_NODE'
             free_header = 'TOTAL_FREE_GPUS'
-        realtime_gpu_table = log_utils.create_table(
-            ['GPU', qty_header, 'TOTAL_GPUS', free_header])
-        realtime_gpu_availability_list = sdk.stream_and_get(
+
+        realtime_gpu_availability_lists = sdk.stream_and_get(
             sdk.realtime_kubernetes_gpu_availability(
                 context=context,
                 name_filter=name_filter,
                 quantity_filter=quantity_filter))
-        if not realtime_gpu_availability_list:
-            err_msg = 'No GPUs found in Kubernetes cluster. '
+        if not realtime_gpu_availability_lists:
+            err_msg = 'No GPUs found in any allowed Kubernetes cluster. '
             debug_msg = 'To further debug, run: sky check '
             if name_filter is not None:
                 gpu_info_msg = f' {name_filter!r}'
@@ -3439,26 +3439,52 @@ show_gpus(
                 gpu_info_msg += (' with requested quantity'
                                  f' {quantity_filter}')
             err_msg = (f'Resources{gpu_info_msg} not found '
-                       'in Kubernetes cluster. ')
+                       'in any allowed Kubernetes cluster. ')
             debug_msg = ('To show available accelerators on kubernetes,'
                          ' run: sky show-gpus --cloud kubernetes ')
             full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
                             debug_msg)
             raise ValueError(full_err_msg)
         no_permissions_str = '<no permissions>'
-        for realtime_gpu_availability in sorted(realtime_gpu_availability_list):
-            gpu_availability = models.RealtimeGpuAvailability(
-                *realtime_gpu_availability)
-            available_qty = (gpu_availability.available
-                             if gpu_availability.available != -1 else
-                             no_permissions_str)
-            realtime_gpu_table.add_row([
-                gpu_availability.gpu,
-                _list_to_str(gpu_availability.counts),
-                gpu_availability.capacity,
-                available_qty,
-            ])
-        return realtime_gpu_table
+        realtime_gpu_infos = []
+        total_gpu_info: Dict[str, List[int]] = collections.defaultdict(
+            lambda: [0, 0])
+
+        for (ctx, availability_list) in realtime_gpu_availability_lists:
+            realtime_gpu_table = log_utils.create_table(
+                ['GPU', qty_header, 'TOTAL_GPUS', free_header])
+            for realtime_gpu_availability in sorted(availability_list):
+                gpu_availability = models.RealtimeGpuAvailability(
+                    *realtime_gpu_availability)
+                available_qty = (gpu_availability.available
+                                 if gpu_availability.available != -1 else
+                                 no_permissions_str)
+                realtime_gpu_table.add_row([
+                    gpu_availability.gpu,
+                    _list_to_str(gpu_availability.counts),
+                    gpu_availability.capacity,
+                    available_qty,
+                ])
+                gpu = gpu_availability.gpu
+                capacity = gpu_availability.capacity
+                # we want total, so skip permission denied.
+                available = max(gpu_availability.available, 0)
+                if capacity > 0:
+                    total_gpu_info[gpu][0] += capacity
+                    total_gpu_info[gpu][1] += available
+            realtime_gpu_infos.append((ctx, realtime_gpu_table))
+
+        # display an aggregated table for all contexts
+        # if there are more than one contexts with GPUs
+        if len(realtime_gpu_infos) > 1:
+            total_realtime_gpu_table = log_utils.create_table(
+                ['GPU', 'TOTAL_GPUS', free_header])
+            for gpu, stats in total_gpu_info.items():
+                total_realtime_gpu_table.add_row([gpu, stats[0], stats[1]])
+        else:
+            total_realtime_gpu_table = None
+
+        return realtime_gpu_infos, total_realtime_gpu_table

     def _format_kubernetes_node_info(context: Optional[str]):
         node_table = log_utils.create_table(
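
Note on the aggregation above: per-context tables are kept as-is, while a defaultdict of [capacity, available] pairs accumulates cross-context totals, treating the -1 "no permissions" sentinel as zero. A self-contained sketch of the pattern (the input shape here is hypothetical; the real code unpacks models.RealtimeGpuAvailability tuples):

    import collections
    from typing import Dict, List, Tuple

    def aggregate(per_context: List[Tuple[str, List[Tuple[str, int, int]]]]
                 ) -> Dict[str, List[int]]:
        totals: Dict[str, List[int]] = collections.defaultdict(lambda: [0, 0])
        for _ctx, rows in per_context:
            for gpu, capacity, available in rows:
                available = max(available, 0)  # -1 marks "no permissions"
                if capacity > 0:
                    totals[gpu][0] += capacity
                    totals[gpu][1] += available
        return totals

    print(dict(aggregate([('ctx-a', [('H100', 8, 4)]),
                          ('ctx-b', [('H100', 8, -1)])])))
    # {'H100': [16, 4]}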
@@ -3479,7 +3505,7 @@ show_gpus(
             'Kubernetes per node accelerator availability ')
         if nodes_info.hint:
             k8s_per_node_acc_message += nodes_info.hint
-        return (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+        return (f'{colorama.Fore.LIGHTMAGENTA_EX}{colorama.Style.NORMAL}'
                 f'{k8s_per_node_acc_message}'
                 f'{colorama.Style.RESET_ALL}\n'
                 f'{node_table.get_string()}')
@@ -3516,8 +3542,7 @@ show_gpus(
                 # If --cloud kubernetes is not specified, we want to catch
                 # the case where no GPUs are available on the cluster and
                 # print the warning at the end.
-                k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
-                    context)
+                k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables(context)  # pylint: disable=line-too-long
             except ValueError as e:
                 if not cloud_is_kubernetes:
                     # Make it a note if cloud is not kubernetes
@@ -3525,13 +3550,24 @@ show_gpus(
                     k8s_messages += str(e)
             else:
                 print_section_titles = True
-                context_str = f'(Context: {context})' if context else ''
-                yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-                       f'Kubernetes GPUs {context_str}'
-                       f'{colorama.Style.RESET_ALL}\n')
-                yield from k8s_realtime_table.get_string()
-                yield '\n\n'
-                yield _format_kubernetes_node_info(context)
+
+                # print total table
+                if total_table is not None:
+                    yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+                           'Total Kubernetes GPUs'
+                           f'{colorama.Style.RESET_ALL}\n')
+                    yield from total_table.get_string()
+                    yield '\n-----\n\n'
+
+                # print individual infos.
+                for (ctx, k8s_realtime_table) in k8s_realtime_infos:
+                    context_str = f'(Context: {ctx})' if ctx else ''
+                    yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                           f'Kubernetes GPUs {context_str}'
+                           f'{colorama.Style.RESET_ALL}\n')
+                    yield from k8s_realtime_table.get_string()
+                    yield '\n\n'
+                    yield _format_kubernetes_node_info(ctx) + '\n-----\n\n'
             if kubernetes_autoscaling:
                 k8s_messages += (
                     '\n' + kubernetes_utils.KUBERNETES_AUTOSCALER_NOTE)
@@ -3620,13 +3656,29 @@ show_gpus(
             # Print section title if not showing all and instead a specific
             # accelerator is requested
             print_section_titles = True
-            yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
-                   f'Kubernetes GPUs{colorama.Style.RESET_ALL}\n')
             # TODO(romilb): Show filtered per node GPU availability here as well
             try:
-                k8s_realtime_table = _get_kubernetes_realtime_gpu_table(
-                    name_filter=name, quantity_filter=quantity)
-                yield from k8s_realtime_table.get_string()
+                k8s_realtime_infos, total_table = _get_kubernetes_realtime_gpu_tables(  # pylint: disable=line-too-long
+                    context=region,
+                    name_filter=name,
+                    quantity_filter=quantity)
+
+                # print total table
+                if total_table is not None:
+                    yield (f'{colorama.Fore.GREEN}{colorama.Style.BRIGHT}'
+                           'Total Kubernetes GPUs'
+                           f'{colorama.Style.RESET_ALL}\n')
+                    yield from total_table.get_string()
+                    yield '\n-----\n\n'
+
+                # print individual tables
+                for (ctx, k8s_realtime_table) in k8s_realtime_infos:
+                    context_str = f'(Context: {ctx})' if ctx else ''
+                    yield (f'{colorama.Fore.CYAN}{colorama.Style.BRIGHT}'
+                           f'Kubernetes GPUs {context_str}'
+                           f'{colorama.Style.RESET_ALL}\n')
+                    yield from k8s_realtime_table.get_string()
+                    yield '\n\n'
             except ValueError as e:
                 # In the case of a specific accelerator, show the error message
                 # immediately (e.g., "Resources H100 not found ...")
@@ -5911,11 +5963,12 @@ def api_info():
     user_name = os.getenv(constants.USER_ENV_VAR, getpass.getuser())
     user_hash = common_utils.get_user_hash()
     dashboard_url = server_common.get_dashboard_url(url)
-    click.echo(f'Using SkyPilot API server: {url} Dashboard: {dashboard_url}\n'
+    click.echo(f'Using SkyPilot API server: {url}\n'
               f'{ux_utils.INDENT_SYMBOL}Status: {api_server_info["status"]}, '
               f'commit: {api_server_info["commit"]}, '
               f'version: {api_server_info["version"]}\n'
-              f'{ux_utils.INDENT_LAST_SYMBOL}User: {user_name} ({user_hash})')
+              f'{ux_utils.INDENT_SYMBOL}User: {user_name} ({user_hash})\n'
+              f'{ux_utils.INDENT_LAST_SYMBOL}Dashboard: {dashboard_url}')


 def main():
sky/client/sdk.py CHANGED
@@ -1840,6 +1840,7 @@ def api_login(endpoint: Optional[str] = None) -> None:
     dashboard_url = server_common.get_dashboard_url(endpoint)
     dashboard_msg = f'Dashboard: {dashboard_url}'
     click.secho(
-        f'Logged in to SkyPilot API server at {endpoint}.'
-        f' {dashboard_msg}',
+        f'Logged into SkyPilot API server at: {endpoint}'
+        f'\n{ux_utils.INDENT_LAST_SYMBOL}{colorama.Fore.GREEN}'
+        f'{dashboard_msg}',
         fg='green')
sky/clouds/aws.py CHANGED
@@ -161,13 +161,19 @@ class AWS(clouds.Cloud):
     def _unsupported_features_for_resources(
         cls, resources: 'resources_lib.Resources'
     ) -> Dict[clouds.CloudImplementationFeatures, str]:
+        unsupported_features = {}
         if resources.use_spot:
-            return {
-                clouds.CloudImplementationFeatures.STOP:
-                    ('Stopping spot instances is currently not supported on'
-                     f' {cls._REPR}.'),
-            }
-        return {}
+            unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
+                f'Stopping spot instances is currently not supported on {cls._REPR}.'
+            )
+
+        unsupported_features[
+            clouds.CloudImplementationFeatures.
+            HIGH_AVAILABILITY_CONTROLLERS] = (
+                f'High availability controllers are not supported on {cls._REPR}.'
+            )
+
+        return unsupported_features

     @classmethod
     def max_cluster_name_length(cls) -> Optional[int]:
sky/clouds/azure.py CHANGED
@@ -90,6 +90,9 @@ class Azure(clouds.Cloud):
         features = {
             clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
                 (f'Migrating disk is currently not supported on {cls._REPR}.'),
+            clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS: (
+                f'High availability controllers are not supported on {cls._REPR}.'
+            ),
         }
         if resources.use_spot:
             features[clouds.CloudImplementationFeatures.STOP] = (
sky/clouds/cloud.py CHANGED
@@ -47,6 +47,9 @@ class CloudImplementationFeatures(enum.Enum):
     OPEN_PORTS = 'open_ports'
     STORAGE_MOUNTING = 'storage_mounting'
     HOST_CONTROLLERS = 'host_controllers'  # Can run jobs/serve controllers
+    HIGH_AVAILABILITY_CONTROLLERS = ('high_availability_controllers'
+                                    )  # Controller can auto-restart
+    AUTO_TERMINATE = 'auto_terminate'  # Pod/VM can stop or down itself
     AUTOSTOP = 'autostop'  # Pod/VM can stop itself
     AUTODOWN = 'autodown'  # Pod/VM can down itself

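Note on the new enum member above: each cloud declares the features it cannot provide, keyed by CloudImplementationFeatures, and a launch requiring an unsupported feature is rejected with the stored reason. A hedged sketch of that gate (simplified names; the real code raises exceptions.NotSupportedError from check_features_are_supported):

    import enum

    class Feature(enum.Enum):
        HIGH_AVAILABILITY_CONTROLLERS = 'high_availability_controllers'

    def check_supported(unsupported: dict, required: set) -> None:
        missing = required & set(unsupported)
        if missing:
            raise NotImplementedError(
                '; '.join(unsupported[f] for f in missing))

    unsupported = {Feature.HIGH_AVAILABILITY_CONTROLLERS:
                   'High availability controllers are not supported here.'}
    try:
        check_supported(unsupported, {Feature.HIGH_AVAILABILITY_CONTROLLERS})
    except NotImplementedError as e:
        print(e)
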
@@ -415,13 +418,16 @@ class Cloud:
         try:
             self.check_features_are_supported(resources,
                                               resources_required_features)
-        except exceptions.NotSupportedError:
+        except exceptions.NotSupportedError as e:
             # TODO(zhwu): The resources are now silently filtered out. We
             # should have some logging telling the user why the resources
             # are not considered.
+            # UPDATE(kyuds): passing in NotSupportedError reason string
+            # to hint for issue #5344. Did not remove above comment as
+            # reason is not displayed when other resources are valid.
             return resources_utils.FeasibleResources(resources_list=[],
                                                      fuzzy_candidate_list=[],
-                                                     hint=None)
+                                                     hint=str(e))
         return self._get_feasible_launchable_resources(resources)

     def _get_feasible_launchable_resources(
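
Note on the hint change above: instead of silently returning an empty result, the NotSupportedError message now rides along as the hint, so the optimizer can tell the user why a cloud was filtered out. A minimal sketch of the flow (class and names simplified from resources_utils):

    class FeasibleResources:
        def __init__(self, resources_list, fuzzy_candidate_list, hint):
            self.resources_list = resources_list
            self.fuzzy_candidate_list = fuzzy_candidate_list
            self.hint = hint

    def get_feasible(check) -> FeasibleResources:
        try:
            check()
        except Exception as e:  # NotSupportedError in the real code
            return FeasibleResources([], [], hint=str(e))
        return FeasibleResources(['<candidates>'], [], hint=None)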
sky/clouds/cudo.py CHANGED
@@ -68,6 +68,8 @@ class Cudo(clouds.Cloud):
             'Cudo Compute cannot host a controller as it does not '
             'autostopping, which will leave the controller to run indefinitely.'
         ),
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported on Cudo.'),
     }
     _MAX_CLUSTER_NAME_LEN_LIMIT = 60

sky/clouds/do.py CHANGED
@@ -33,6 +33,9 @@ class DO(clouds.Cloud):
         clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
             'Custom disk tiers'
             f' is not supported in {_REPR}.',
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported in '
+             f'{_REPR}.'),
     }
     # DO maximum node name length defined as <= 255
     # https://docs.digitalocean.com/reference/api/api-reference/#operation/droplets_create
sky/clouds/fluidstack.py CHANGED
@@ -56,6 +56,9 @@ class Fluidstack(clouds.Cloud):
         clouds.CloudImplementationFeatures.HOST_CONTROLLERS:
             'Host controllers'
             f' are not supported in {_REPR}.',
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported in '
+             f'{_REPR}.'),
     }
     # Using the latest SkyPilot provisioner API to provision and check status.
     PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
sky/clouds/gcp.py CHANGED
@@ -232,6 +232,13 @@ class GCP(clouds.Cloud):
             unsupported[clouds.CloudImplementationFeatures.SPOT_INSTANCE] = (
                 'Managed Instance Group with DWS does not support '
                 'spot instances.')
+
+        unsupported[
+            clouds.CloudImplementationFeatures.
+            HIGH_AVAILABILITY_CONTROLLERS] = (
+                f'High availability controllers are not supported on {cls._REPR}.'
+            )
+
         return unsupported

     @classmethod
sky/clouds/ibm.py CHANGED
@@ -50,6 +50,8 @@ class IBM(clouds.Cloud):
         ),
         clouds.CloudImplementationFeatures.OPEN_PORTS:
             (f'Opening ports is currently not supported on {cls._REPR}.'),
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported on IBM.'),
     }
     if resources.use_spot:
         features[clouds.CloudImplementationFeatures.STOP] = (
sky/clouds/kubernetes.py CHANGED
@@ -429,28 +429,32 @@ class Kubernetes(clouds.Cloud):
         acc_count = k.accelerator_count if k.accelerator_count else 0
         acc_type = k.accelerator_type if k.accelerator_type else None

-        image_id_dict = resources.image_id
-        if image_id_dict is not None:
-            # Use custom image specified in resources
-            if None in image_id_dict:
-                image_id = image_id_dict[None]
-            else:
-                assert resources.region in image_id_dict, image_id_dict
-                image_id = image_id_dict[resources.region]
-            if image_id.startswith('docker:'):
-                image_id = image_id[len('docker:'):]
-        else:
-            # Select image based on whether we are using GPUs or not.
-            image_id = self.IMAGE_GPU if acc_count > 0 else self.IMAGE_CPU
-            # Get the container image ID from the service catalog.
-            image_id = service_catalog.get_image_id_from_tag(
-                image_id, clouds='kubernetes')
+        def _get_image_id(resources: 'resources_lib.Resources') -> str:
+            image_id_dict = resources.image_id
+            if image_id_dict is not None:
+                # Use custom image specified in resources
+                if None in image_id_dict:
+                    image_id = image_id_dict[None]
+                else:
+                    assert resources.region in image_id_dict, image_id_dict
+                    image_id = image_id_dict[resources.region]
+                if image_id.startswith('docker:'):
+                    image_id = image_id[len('docker:'):]
+            else:
+                # Select image based on whether we are using GPUs or not.
+                image_id = self.IMAGE_GPU if acc_count > 0 else self.IMAGE_CPU
+                # Get the container image ID from the service catalog.
+                image_id = service_catalog.get_image_id_from_tag(
+                    image_id, clouds='kubernetes')
+            return image_id
+
+        image_id = _get_image_id(resources)
         # TODO(romilb): Create a lightweight image for SSH jump host
         ssh_jump_image = service_catalog.get_image_id_from_tag(
             self.IMAGE_CPU, clouds='kubernetes')

         k8s_acc_label_key = None
-        k8s_acc_label_value = None
+        k8s_acc_label_values = None
         k8s_topology_label_key = None
         k8s_topology_label_value = None
         k8s_resource_key = None
@@ -458,9 +462,9 @@ class Kubernetes(clouds.Cloud):

         # If GPU/TPUs are requested, set node label to match the GPU/TPU type.
         if acc_count > 0 and acc_type is not None:
-            (k8s_acc_label_key, k8s_acc_label_value, k8s_topology_label_key,
+            (k8s_acc_label_key, k8s_acc_label_values, k8s_topology_label_key,
              k8s_topology_label_value) = (
-                 kubernetes_utils.get_accelerator_label_key_value(
+                 kubernetes_utils.get_accelerator_label_key_values(
                      context, acc_type, acc_count))
             if (k8s_acc_label_key ==
                     kubernetes_utils.GKELabelFormatter.TPU_LABEL_KEY):
@@ -540,6 +544,13 @@ class Kubernetes(clouds.Cloud):
             # cpus is <1.
             'num-cpus': str(max(int(cpus), 1)),
         }
+
+        # Get the storage class name for high availability controller's PVC
+        k8s_ha_storage_class_name = skypilot_config.get_nested(
+            ('kubernetes', 'high_availability', 'storage_class_name'),
+            None,
+            override_configs=resources.cluster_config_overrides)
+
         deploy_vars = {
             'instance_type': resources.instance_type,
             'custom_resources': custom_resources,
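
Note on the config lookup above: the storage class for the HA controller's PVC comes from the nested key kubernetes.high_availability.storage_class_name. A minimal sketch of a get_nested-style walk (the real skypilot_config.get_nested also merges per-resource override_configs, which this omits):

    from typing import Any, Optional, Tuple

    def get_nested(config: dict, keys: Tuple[str, ...],
                   default: Optional[Any] = None) -> Any:
        node: Any = config
        for key in keys:
            if not isinstance(node, dict) or key not in node:
                return default
            node = node[key]
        return node

    cfg = {'kubernetes': {'high_availability': {'storage_class_name': 'fast-ssd'}}}
    assert get_nested(cfg, ('kubernetes', 'high_availability',
                            'storage_class_name')) == 'fast-ssd'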
@@ -551,7 +562,7 @@ class Kubernetes(clouds.Cloud):
             'k8s_networking_mode': network_utils.get_networking_mode().value,
             'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME,
             'k8s_acc_label_key': k8s_acc_label_key,
-            'k8s_acc_label_value': k8s_acc_label_value,
+            'k8s_acc_label_values': k8s_acc_label_values,
             'k8s_ssh_jump_name': self.SKY_SSH_JUMP_NAME,
             'k8s_ssh_jump_image': ssh_jump_image,
             'k8s_service_account_name': k8s_service_account_name,
@@ -574,6 +585,18 @@ class Kubernetes(clouds.Cloud):
             'skypilot_ray_port': constants.SKY_REMOTE_RAY_PORT,
             'ray_worker_start_command': instance_setup.ray_worker_start_command(
                 custom_resources, custom_ray_options, no_restart=False),
+            'k8s_high_availability_deployment_volume_mount_name':
+                (kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME
+                ),
+            'k8s_high_availability_deployment_volume_mount_path':
+                (kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH
+                ),
+            'k8s_high_availability_deployment_setup_script_path':
+                (constants.PERSISTENT_SETUP_SCRIPT_PATH),
+            'k8s_high_availability_deployment_run_script_dir':
+                (constants.PERSISTENT_RUN_SCRIPT_DIR),
+            'k8s_high_availability_storage_class_name':
+                (k8s_ha_storage_class_name),
         }

         # Add kubecontext if it is set. It may be None if SkyPilot is running
sky/clouds/lambda_cloud.py CHANGED
@@ -44,6 +44,7 @@ class Lambda(clouds.Cloud):
         clouds.CloudImplementationFeatures.IMAGE_ID: f'Specifying image ID is not supported in {_REPR}.',
         clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: f'Custom disk tiers are not supported in {_REPR}.',
         clouds.CloudImplementationFeatures.HOST_CONTROLLERS: f'Host controllers are not supported in {_REPR}.',
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS: f'High availability controllers are not supported on {_REPR}.',
     }

     PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
sky/clouds/nebius.py CHANGED
@@ -1,5 +1,4 @@
 """ Nebius Cloud. """
-import logging
 import os
 import typing
 from typing import Dict, Iterator, List, Optional, Tuple, Union
@@ -7,6 +6,7 @@ from typing import Dict, Iterator, List, Optional, Tuple, Union
 from sky import clouds
 from sky.adaptors import nebius
 from sky.clouds import service_catalog
+from sky.utils import annotations
 from sky.utils import registry
 from sky.utils import resources_utils

@@ -59,12 +59,10 @@ class Nebius(clouds.Cloud):
             ('Spot is not supported, as Nebius API does not implement spot.'),
         clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
             (f'Migrating disk is currently not supported on {_REPR}.'),
-        clouds.CloudImplementationFeatures.DOCKER_IMAGE:
-            (f'Docker image is currently not supported on {_REPR}. '
-             'You can try running docker command inside the '
-             '`run` section in task.yaml.'),
         clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
             (f'Custom disk tier is currently not supported on {_REPR}.'),
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported on Nebius.'),
     }
     # Nebius maximum instance name length defined as <= 63 as a hostname length
     # 63 - 8 - 5 = 50 characters since
@@ -211,7 +209,8 @@ class Nebius(clouds.Cloud):
         else:
             raise RuntimeError('Unsupported instance type for Nebius cloud:'
                                f' {resources.instance_type}')
-        return {
+
+        resources_vars = {
             'instance_type': resources.instance_type,
             'custom_resources': custom_resources,
             'region': region.name,
@@ -220,6 +219,14 @@ class Nebius(clouds.Cloud):
             'zones': None,
         }

+        if acc_dict is not None:
+            # Nebius cloud's docker runtime information does not contain
+            # 'nvidia-container-runtime', causing no GPU option to be added to
+            # the docker run command. We patch this by adding it here.
+            resources_vars['docker_run_options'] = ['--gpus all']
+
+        return resources_vars
+
     def _get_feasible_launchable_resources(
         self, resources: 'resources_lib.Resources'
     ) -> 'resources_utils.FeasibleResources':
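
Note on docker_run_options above: provisioners splice these options into the docker invocation, so GPU instances on Nebius get '--gpus all' even though the runtime list lacks 'nvidia-container-runtime'. An illustrative sketch of where the option lands (command assembly simplified):

    def docker_run_command(image: str, run_options: list) -> str:
        return ' '.join(['docker', 'run', '-d', *run_options, image])

    print(docker_run_command('my-image:latest', ['--gpus all']))
    # docker run -d --gpus all my-image:latest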
@@ -275,16 +282,16 @@ class Nebius(clouds.Cloud):
                                                      fuzzy_candidate_list, None)

     @classmethod
+    @annotations.lru_cache(scope='request')
     def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
         """Checks if the user has access credentials to
         Nebius's compute service."""
-        logging.debug('Nebius cloud check credentials')
         token_cred_msg = (
             f'{_INDENT_PREFIX}Credentials can be set up by running: \n'
             f'{_INDENT_PREFIX} $ nebius iam get-access-token > {nebius.NEBIUS_IAM_TOKEN_PATH} \n'  # pylint: disable=line-too-long
-            f'{_INDENT_PREFIX} or generate ~/.nebius/credentials.json')
+            f'{_INDENT_PREFIX} or generate ~/.nebius/credentials.json \n')

-        tenant_msg = (f'{_INDENT_PREFIX}Copy your tenat ID from the web console and save it to file \n'  # pylint: disable=line-too-long
+        tenant_msg = (f'{_INDENT_PREFIX} Copy your tenat ID from the web console and save it to file \n'  # pylint: disable=line-too-long
                       f'{_INDENT_PREFIX} $ echo $NEBIUS_TENANT_ID_PATH > {nebius.NEBIUS_TENANT_ID_PATH} \n'  # pylint: disable=line-too-long
                       f'{_INDENT_PREFIX} Or if you have 1 tenant you can run:\n'  # pylint: disable=line-too-long
                       f'{_INDENT_PREFIX} $ nebius --format json iam whoami|jq -r \'.user_profile.tenants[0].tenant_id\' > {nebius.NEBIUS_TENANT_ID_PATH} \n')  # pylint: disable=line-too-long
@@ -301,11 +308,12 @@ class Nebius(clouds.Cloud):
         except nebius.request_error() as e:
             return False, (
                 f'{e.status} \n'  # First line is indented by 4 spaces
-                f'{token_cred_msg}'
+                f'{token_cred_msg} \n'
                 f'{tenant_msg}')
         return True, None

     @classmethod
+    @annotations.lru_cache(scope='request')
     def _check_storage_credentials(cls) -> Tuple[bool, Optional[str]]:
         """Checks if the user has access credentials to Nebius Object Storage.

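Note on the new decorators above: credential checks hit the Nebius API, and several code paths may trigger them within a single API-server request, so @annotations.lru_cache(scope='request') memoizes the result per request. A plain functools.lru_cache shows the memoization half (SkyPilot's wrapper additionally clears per request, which this sketch does not reproduce):

    import functools

    @functools.lru_cache(maxsize=1)
    def check_compute_credentials() -> tuple:
        print('hitting the cloud API once')
        return True, None

    check_compute_credentials()  # prints, then caches
    check_compute_credentials()  # served from cache
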
sky/clouds/oci.py CHANGED
@@ -69,19 +69,22 @@ class OCI(clouds.Cloud):
     def _unsupported_features_for_resources(
         cls, resources: 'resources_lib.Resources'
     ) -> Dict[clouds.CloudImplementationFeatures, str]:
-        features = {
+        unsupported_features = {
             clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
                 (f'Migrating disk is currently not supported on {cls._REPR}.'),
             clouds.CloudImplementationFeatures.DOCKER_IMAGE:
                 (f'Docker image is currently not supported on {cls._REPR}. '
                  'You can try running docker command inside the '
                  '`run` section in task.yaml.'),
+            clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+                ('High availability controllers are not supported on '
+                 f'{cls._REPR}.'),
         }
         if resources.use_spot:
-            features[clouds.CloudImplementationFeatures.STOP] = (
+            unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
                 f'Stopping spot instances is currently not supported on '
                 f'{cls._REPR}.')
-        return features
+        return unsupported_features

     @classmethod
     def max_cluster_name_length(cls) -> Optional[int]:
sky/clouds/paperspace.py CHANGED
@@ -41,6 +41,8 @@ class Paperspace(clouds.Cloud):
         clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
             'Custom disk tiers'
             f' is not supported in {_REPR}.',
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            (f'High availability controllers are not supported in {_REPR}.'),
     }
     _MAX_CLUSTER_NAME_LEN_LIMIT = 120
     _regions: List[clouds.Region] = []
sky/clouds/runpod.py CHANGED
@@ -34,6 +34,8 @@ class RunPod(clouds.Cloud):
         ('Mounting object stores is not supported on RunPod. To read data '
          'from object stores on RunPod, use `mode: COPY` to copy the data '
          'to local disk.'),
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported on RunPod.'),
     }
     _MAX_CLUSTER_NAME_LEN_LIMIT = 120
     _regions: List[clouds.Region] = []
sky/clouds/scp.py CHANGED
@@ -58,6 +58,8 @@ class SCP(clouds.Cloud):
             (f'Custom disk tiers are not supported in {_REPR}.'),
         clouds.CloudImplementationFeatures.OPEN_PORTS:
             (f'Opening ports is currently not supported on {_REPR}.'),
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            (f'High availability controllers are not supported on {_REPR}.'),
     }

     _INDENT_PREFIX = ' '
sky/clouds/service_catalog/constants.py CHANGED
@@ -1,6 +1,6 @@
 """Constants used for service catalog."""
 HOSTED_CATALOG_DIR_URL = 'https://raw.githubusercontent.com/skypilot-org/skypilot-catalog/master/catalogs'  # pylint: disable=line-too-long
-CATALOG_SCHEMA_VERSION = 'v6'
+CATALOG_SCHEMA_VERSION = 'v7'
 CATALOG_DIR = '~/.sky/catalogs'
 ALL_CLOUDS = ('aws', 'azure', 'gcp', 'ibm', 'lambda', 'scp', 'oci',
               'kubernetes', 'runpod', 'vast', 'vsphere', 'cudo', 'fluidstack',
sky/clouds/service_catalog/kubernetes_catalog.py CHANGED
@@ -261,16 +261,16 @@ def _list_accelerators(

             accelerators_available = accelerator_count - allocated_qty

-            # Initialize the entry if it doesn't exist yet
-            if accelerator_name not in total_accelerators_available:
-                total_accelerators_available[accelerator_name] = 0
-
             if accelerators_available >= min_quantity_filter:
                 quantized_availability = min_quantity_filter * (
                     accelerators_available // min_quantity_filter)
-                total_accelerators_available[accelerator_name] = (
-                    total_accelerators_available.get(accelerator_name, 0) +
-                    quantized_availability)
+                if quantized_availability > 0:
+                    # only increment when quantized availability is positive
+                    # to avoid assertion errors checking keyset sizes in
+                    # core.py _realtime_kubernetes_gpu_availability_single
+                    total_accelerators_available[accelerator_name] = (
+                        total_accelerators_available.get(
+                            accelerator_name, 0) + quantized_availability)

     result = []
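
Worked example of the quantization above: availability is rounded down to a multiple of min_quantity_filter, and zero results are now skipped so the aggregate dict only records GPUs that can actually satisfy the filter:

    def quantized_availability(available: int, min_quantity_filter: int) -> int:
        return min_quantity_filter * (available // min_quantity_filter)

    assert quantized_availability(7, 4) == 4   # one usable block of 4
    assert quantized_availability(3, 4) == 0   # now skipped, not recorded as 0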