skypilot-nightly 1.0.0.dev20250426__py3-none-any.whl → 1.0.0.dev20250428__py3-none-any.whl

This diff shows the changes between two publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.
Files changed (59)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +19 -2
  3. sky/backends/cloud_vm_ray_backend.py +33 -8
  4. sky/backends/local_docker_backend.py +1 -2
  5. sky/cli.py +1 -1
  6. sky/client/cli.py +1 -1
  7. sky/clouds/aws.py +12 -6
  8. sky/clouds/azure.py +3 -0
  9. sky/clouds/cloud.py +3 -0
  10. sky/clouds/cudo.py +2 -0
  11. sky/clouds/do.py +3 -0
  12. sky/clouds/fluidstack.py +3 -0
  13. sky/clouds/gcp.py +7 -0
  14. sky/clouds/ibm.py +2 -0
  15. sky/clouds/kubernetes.py +38 -15
  16. sky/clouds/lambda_cloud.py +1 -0
  17. sky/clouds/nebius.py +2 -0
  18. sky/clouds/oci.py +6 -3
  19. sky/clouds/paperspace.py +2 -0
  20. sky/clouds/runpod.py +2 -0
  21. sky/clouds/scp.py +2 -0
  22. sky/clouds/vast.py +2 -0
  23. sky/clouds/vsphere.py +2 -0
  24. sky/dashboard/out/404.html +1 -1
  25. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  26. sky/dashboard/out/clusters/[cluster].html +1 -1
  27. sky/dashboard/out/clusters.html +1 -1
  28. sky/dashboard/out/index.html +1 -1
  29. sky/dashboard/out/jobs/[job].html +1 -1
  30. sky/dashboard/out/jobs.html +1 -1
  31. sky/exceptions.py +6 -0
  32. sky/execution.py +19 -4
  33. sky/global_user_state.py +1 -0
  34. sky/provision/common.py +2 -5
  35. sky/provision/instance_setup.py +1 -1
  36. sky/provision/kubernetes/instance.py +280 -94
  37. sky/provision/kubernetes/network.py +1 -1
  38. sky/provision/kubernetes/utils.py +10 -0
  39. sky/provision/provisioner.py +6 -0
  40. sky/serve/replica_managers.py +51 -5
  41. sky/serve/serve_state.py +41 -0
  42. sky/serve/service.py +108 -63
  43. sky/server/requests/executor.py +4 -4
  44. sky/skylet/constants.py +7 -0
  45. sky/task.py +1 -1
  46. sky/templates/kubernetes-ray.yml.j2 +122 -2
  47. sky/utils/command_runner.py +17 -3
  48. sky/utils/command_runner.pyi +2 -0
  49. sky/utils/controller_utils.py +24 -0
  50. sky/utils/kubernetes/rsync_helper.sh +20 -4
  51. sky/utils/schemas.py +13 -0
  52. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/METADATA +1 -1
  53. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/RECORD +59 -59
  54. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/WHEEL +1 -1
  55. /sky/dashboard/out/_next/static/{WO8lTFPfj-lO3_gDGEiN8 → 2f-jlOWR_G5mOwCF4RcZz}/_buildManifest.js +0 -0
  56. /sky/dashboard/out/_next/static/{WO8lTFPfj-lO3_gDGEiN8 → 2f-jlOWR_G5mOwCF4RcZz}/_ssgManifest.js +0 -0
  57. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/entry_points.txt +0 -0
  58. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/licenses/LICENSE +0 -0
  59. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
  import urllib.request
  
  # Replaced with the current commit when building the wheels.
- _SKYPILOT_COMMIT_SHA = '979c123c54d0ee615cd854a586f88afa9cb3d2ce'
+ _SKYPILOT_COMMIT_SHA = '607eee0a24e50718d783e92081f141f45cac6cda'
  
  
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
  
  
  __commit__ = _get_git_commit()
- __version__ = '1.0.0.dev20250426'
+ __version__ = '1.0.0.dev20250428'
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
  
  
sky/backends/backend_utils.py CHANGED
@@ -179,6 +179,9 @@ _RAY_YAML_KEYS_TO_RESTORE_EXCEPTIONS = [
  ('available_node_types', 'ray.head.default', 'node_config', 'UserData'),
  ('available_node_types', 'ray.head.default', 'node_config',
  'azure_arm_parameters', 'cloudInitSetupCommands'),
+ ('available_node_types', 'ray_head_default', 'node_config', 'pvc_spec'),
+ ('available_node_types', 'ray_head_default', 'node_config',
+ 'deployment_spec'),
  ]
  # These keys are expected to change when provisioning on an existing cluster,
  # but they don't actually represent a change that requires re-provisioning the
@@ -705,6 +708,13 @@ def write_cluster_config(
  is_custom_docker = ('true' if to_provision.extract_docker_image()
  is not None else 'false')
  
+ # Here, if users specify the controller to be high availability, we will
+ # provision a high availability controller. Whether the cloud supports
+ # this feature has been checked by
+ # CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS
+ high_availability_specified = controller_utils.high_availability_specified(
+ cluster_name_on_cloud)
+
  # Use a tmp file path to avoid incomplete YAML file being re-used in the
  # future.
  tmp_yaml_path = yaml_path + '.tmp'
@@ -790,6 +800,9 @@
  'sky_wheel_hash': wheel_hash,
  # Authentication (optional).
  **auth_config,
+
+ # High availability
+ 'high_availability': high_availability_specified,
  }),
  output_path=tmp_yaml_path)
  config_dict['cluster_name'] = cluster_name
@@ -802,8 +815,12 @@
  cluster_config_overrides=to_provision.cluster_config_overrides)
  kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
  yaml_obj = common_utils.read_yaml(tmp_yaml_path)
- pod_config = yaml_obj['available_node_types']['ray_head_default'][
- 'node_config']
+ pod_config: Dict[str, Any] = yaml_obj['available_node_types'][
+ 'ray_head_default']['node_config']
+
+ # Check pod spec only. For high availability controllers, we deploy pvc & deployment for the controller. Read kubernetes-ray.yml.j2 for more details.
+ pod_config.pop('deployment_spec', None)
+ pod_config.pop('pvc_spec', None)
  valid, message = kubernetes_utils.check_pod_config(pod_config)
  if not valid:
  raise exceptions.InvalidCloudConfigs(
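For Kubernetes high availability controllers, the `pvc_spec` and `deployment_spec` entries ride along in `node_config` but are not part of the pod spec, so the hunk above strips them before validation. A minimal sketch of that pop-then-validate pattern; the dict contents and the checker below are illustrative stand-ins, not SkyPilot's actual `kubernetes_utils.check_pod_config`:

```python
from typing import Any, Dict, Tuple

def check_pod_config(pod_config: Dict[str, Any]) -> Tuple[bool, str]:
    # Stand-in validator: reject top-level keys that are not pod fields.
    allowed = {'metadata', 'spec'}
    extra = set(pod_config) - allowed
    if extra:
        return False, f'unknown pod fields: {sorted(extra)}'
    return True, ''

node_config: Dict[str, Any] = {
    'metadata': {'labels': {'parent': 'skypilot'}},
    'spec': {'containers': [{'name': 'ray-node'}]},
    # Present only for high availability controllers (hypothetical values):
    'pvc_spec': {},
    'deployment_spec': {},
}

pod_config = dict(node_config)
pod_config.pop('deployment_spec', None)  # validate the pod spec only
pod_config.pop('pvc_spec', None)
valid, message = check_pod_config(pod_config)
assert valid, message
```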
sky/backends/cloud_vm_ray_backend.py CHANGED
@@ -1583,6 +1583,10 @@ class RetryingVmProvisioner(object):
  except provision_common.StopFailoverError:
  with ux_utils.print_exception_no_traceback():
  raise
+ except exceptions.InconsistentHighAvailabilityError:
+ # No teardown happens for this error.
+ with ux_utils.print_exception_no_traceback():
+ raise
  except Exception as e: # pylint: disable=broad-except
  # NOTE: We try to cleanup the cluster even if the previous
  # cluster does not exist. Also we are fast at
@@ -2032,6 +2036,7 @@ class RetryingVmProvisioner(object):
  # Recheck cluster name as the 'except:' block below may
  # change the cloud assignment.
  common_utils.check_cluster_name_is_valid(cluster_name)
+
  if dryrun:
  cloud_user = None
  else:
@@ -2459,6 +2464,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  'Tried to use cached cluster info, but it\'s missing for '
  f'cluster "{self.cluster_name}"')
  self._update_cluster_info()
+
  assert self.cached_cluster_info is not None, self
  runners = provision_lib.get_command_runners(
  self.cached_cluster_info.provider_name, self.cached_cluster_info,
@@ -2689,6 +2695,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  self._optimize_target) or common.OptimizeTarget.COST
  self._requested_features = kwargs.pop('requested_features',
  self._requested_features)
+ self._dump_final_script = kwargs.pop('dump_final_script', False)
  assert not kwargs, f'Unexpected kwargs: {kwargs}'
  
  def check_resources_fit_cluster(
@@ -3272,18 +3279,30 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  env_vars=setup_envs)
  encoded_script = shlex.quote(setup_script)
  
- def _dump_setup_script(setup_script: str) -> None:
+ def _dump_final_script(
+ setup_script: str,
+ target_dir: str = remote_setup_file_name) -> None:
  with tempfile.NamedTemporaryFile('w', prefix='sky_setup_') as f:
  f.write(setup_script)
  f.flush()
  setup_sh_path = f.name
  runner.rsync(source=setup_sh_path,
- target=remote_setup_file_name,
+ target=target_dir,
  up=True,
  stream_logs=False)
  
+ # Always dump the full setup script to the persistent path first.
+ # In high availability mode, we need to dump the full setup script
+ # to a persistent path BEFORE any other operations. This ensures
+ # that if the pod restarts, it can find and execute the complete
+ # setup script, rather than a reference to a temporary file that
+ # would no longer exist after restart.
+ if self._dump_final_script:
+ _dump_final_script(setup_script,
+ constants.PERSISTENT_SETUP_SCRIPT_PATH)
+
  if detach_setup or _is_command_length_over_limit(encoded_script):
- _dump_setup_script(setup_script)
+ _dump_final_script(setup_script)
  create_script_code = 'true'
  else:
  create_script_code = (f'{{ echo {encoded_script} > '
@@ -3335,7 +3354,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  'Failed to run setup command inline due to '
  'command length limit. Dumping setup script to '
  'file and running it with SSH.')
- _dump_setup_script(setup_script)
+ _dump_final_script(setup_script)
  returncode = _run_setup(setup_cmd)
  
  def error_message() -> str:
@@ -3426,14 +3445,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
  job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
  
- def _dump_code_to_file(codegen: str) -> None:
+ def _dump_code_to_file(codegen: str,
+ target_dir: str = SKY_REMOTE_APP_DIR) -> None:
  runners = handle.get_command_runners()
  head_runner = runners[0]
  with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
  fp.write(codegen)
  fp.flush()
- script_path = os.path.join(SKY_REMOTE_APP_DIR,
- f'sky_job_{job_id}')
+ script_path = os.path.join(target_dir, f'sky_job_{job_id}')
  # We choose to sync code + exec, because the alternative of 'ray
  # submit' may not work as it may use system python (python2) to
  # execute the script. Happens for AWS.
@@ -3442,6 +3461,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  up=True,
  stream_logs=False)
  
+ # Should also be earlier than _is_command_length_over_limit
+ # Same reason as in _setup
+ if self._dump_final_script:
+ _dump_code_to_file(job_submit_cmd,
+ constants.PERSISTENT_RUN_SCRIPT_DIR)
+
  if _is_command_length_over_limit(job_submit_cmd):
  _dump_code_to_file(codegen)
  job_submit_cmd = f'{mkdir_code} && {code}'
@@ -3457,7 +3482,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # We cannot set the managed job to PENDING state in the job template
  # (jobs-controller.yaml.j2), as it may need to wait for the run
  # commands to be scheduled on the job controller in high-load cases.
- job_submit_cmd = job_submit_cmd + ' && ' + managed_job_code
+ job_submit_cmd += ' && ' + managed_job_code
  returncode, stdout, stderr = self.run_on_head(handle,
  job_submit_cmd,
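The hunks above deliberately persist the full script before the command-length check runs. A hedged sketch of that ordering, with an illustrative length threshold and a placeholder directory (the real constant lives in sky/skylet/constants.py and the real check is `_is_command_length_over_limit`, neither shown in this diff):

```python
from typing import Callable

# Illustrative stand-ins, not SkyPilot's actual values.
PERSISTENT_RUN_SCRIPT_DIR = '/tmp/persistent_run_scripts'
_MAX_INLINE_LEN = 100 * 1024

def submit_job(job_submit_cmd: str, dump_final_script: bool,
               dump_code_to_file: Callable[..., None]) -> None:
    # High availability mode: persist the complete script FIRST, so a
    # restarted controller pod can find and replay it even after the
    # temporary file backing the inline path is gone.
    if dump_final_script:
        dump_code_to_file(job_submit_cmd, PERSISTENT_RUN_SCRIPT_DIR)
    # Only afterwards fall back to file + exec for over-long commands.
    if len(job_submit_cmd) > _MAX_INLINE_LEN:
        dump_code_to_file(job_submit_cmd)
```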
sky/backends/local_docker_backend.py CHANGED
@@ -276,7 +276,6 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
  detach_run: bool,
  dryrun: bool = False) -> None:
  """ Launches the container."""
-
  if detach_run:
  raise NotImplementedError('detach_run=True is not supported in '
  'LocalDockerBackend.')
@@ -364,7 +363,7 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
  if k.startswith(_DOCKER_LABEL_PREFIX):
  # Remove 'skymeta_' from key
  metadata[k[len(_DOCKER_LABEL_PREFIX):]] = v
- self.images[c.name] = [c.image, metadata]
+ self.images[c.name] = (c.image, metadata)
  self.containers[c.name] = c
  
  def _execute_task_one_node(self, handle: LocalDockerResourceHandle,
sky/cli.py CHANGED
@@ -162,7 +162,7 @@ def _get_cluster_records_and_set_ssh_config(
  '-o StrictHostKeyChecking=no '
  '-o UserKnownHostsFile=/dev/null '
  '-o IdentitiesOnly=yes '
- '-W %h:%p '
+ '-W \'[%h]:%p\' '
  f'{handle.ssh_user}@127.0.0.1 '
  '-o ProxyCommand='
  # TODO(zhwu): write the template to a temp file, don't use
sky/client/cli.py CHANGED
@@ -162,7 +162,7 @@ def _get_cluster_records_and_set_ssh_config(
  '-o StrictHostKeyChecking=no '
  '-o UserKnownHostsFile=/dev/null '
  '-o IdentitiesOnly=yes '
- '-W %h:%p '
+ '-W \'[%h]:%p\' '
  f'{handle.ssh_user}@127.0.0.1 '
  '-o ProxyCommand='
  # TODO(zhwu): write the template to a temp file, don't use
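Both CLI copies make the same change: the `-W` destination for the SSH forwarding flag is now bracketed. OpenSSH accepts a `[host]:port` destination there, which keeps IPv6 hosts (whose own colons would otherwise collide with the host:port separator) unambiguous. A trivial sketch of the formatting, with `%h`/`%p` left for ssh itself to expand:

```python
# Illustration only; the brackets must survive shell quoting, hence the
# single quotes around the destination in the template above.
def forward_dest(host: str = '%h', port: str = '%p') -> str:
    return f"-W '[{host}]:{port}' "

assert forward_dest() == "-W '[%h]:%p' "
assert forward_dest('::1', '22') == "-W '[::1]:22' "
```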
sky/clouds/aws.py CHANGED
@@ -161,13 +161,19 @@ class AWS(clouds.Cloud):
  def _unsupported_features_for_resources(
  cls, resources: 'resources_lib.Resources'
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
+ unsupported_features = {}
  if resources.use_spot:
- return {
- clouds.CloudImplementationFeatures.STOP:
- ('Stopping spot instances is currently not supported on'
- f' {cls._REPR}.'),
- }
- return {}
+ unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
+ f'Stopping spot instances is currently not supported on {cls._REPR}.'
+ )
+
+ unsupported_features[
+ clouds.CloudImplementationFeatures.
+ HIGH_AVAILABILITY_CONTROLLERS] = (
+ f'High availability controllers are not supported on {cls._REPR}.'
+ )
+
+ return unsupported_features
  
  @classmethod
  def max_cluster_name_length(cls) -> Optional[int]:
sky/clouds/azure.py CHANGED
@@ -90,6 +90,9 @@ class Azure(clouds.Cloud):
  features = {
  clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
  (f'Migrating disk is currently not supported on {cls._REPR}.'),
+ clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS: (
+ f'High availability controllers are not supported on {cls._REPR}.'
+ ),
  }
  if resources.use_spot:
  features[clouds.CloudImplementationFeatures.STOP] = (
sky/clouds/cloud.py CHANGED
@@ -47,6 +47,9 @@ class CloudImplementationFeatures(enum.Enum):
  OPEN_PORTS = 'open_ports'
  STORAGE_MOUNTING = 'storage_mounting'
  HOST_CONTROLLERS = 'host_controllers' # Can run jobs/serve controllers
+ HIGH_AVAILABILITY_CONTROLLERS = ('high_availability_controllers'
+ ) # Controller can auto-restart
+ AUTO_TERMINATE = 'auto_terminate' # Pod/VM can stop or down itself
  AUTOSTOP = 'autostop' # Pod/VM can stop itself
  AUTODOWN = 'autodown' # Pod/VM can down itself
  
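Each cloud advertises unsupported features through `_unsupported_features_for_resources`, and the provisioning path checks requested features against that map (see the write_cluster_config comment above, which relies on HIGH_AVAILABILITY_CONTROLLERS having been checked already). A hedged sketch of that pattern, with a stand-in checker rather than SkyPilot's actual one:

```python
import enum
from typing import Dict, Iterable

class CloudImplementationFeatures(enum.Enum):
    HIGH_AVAILABILITY_CONTROLLERS = 'high_availability_controllers'
    AUTO_TERMINATE = 'auto_terminate'

def check_features(
        requested: Iterable[CloudImplementationFeatures],
        unsupported: Dict[CloudImplementationFeatures, str]) -> None:
    # Surface the cloud-provided reason when a requested feature is
    # in the cloud's unsupported map (stand-in for SkyPilot's check).
    for feature in requested:
        if feature in unsupported:
            raise NotImplementedError(unsupported[feature])

unsupported = {
    CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
        'High availability controllers are not supported on this cloud.',
}
check_features([CloudImplementationFeatures.AUTO_TERMINATE], unsupported)  # ok
```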
sky/clouds/cudo.py CHANGED
@@ -68,6 +68,8 @@ class Cudo(clouds.Cloud):
  'Cudo Compute cannot host a controller as it does not '
  'autostopping, which will leave the controller to run indefinitely.'
  ),
+ clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+ ('High availability controllers are not supported on Cudo.'),
  }
  _MAX_CLUSTER_NAME_LEN_LIMIT = 60
  
sky/clouds/do.py CHANGED
@@ -33,6 +33,9 @@ class DO(clouds.Cloud):
  clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
  'Custom disk tiers'
  f' is not supported in {_REPR}.',
+ clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+ ('High availability controllers are not supported in '
+ f'{_REPR}.'),
  }
  # DO maximum node name length defined as <= 255
  # https://docs.digitalocean.com/reference/api/api-reference/#operation/droplets_create
sky/clouds/fluidstack.py CHANGED
@@ -56,6 +56,9 @@ class Fluidstack(clouds.Cloud):
  clouds.CloudImplementationFeatures.HOST_CONTROLLERS:
  'Host controllers'
  f' are not supported in {_REPR}.',
+ clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+ ('High availability controllers are not supported in '
+ f'{_REPR}.'),
  }
  # Using the latest SkyPilot provisioner API to provision and check status.
  PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
sky/clouds/gcp.py CHANGED
@@ -232,6 +232,13 @@ class GCP(clouds.Cloud):
  unsupported[clouds.CloudImplementationFeatures.SPOT_INSTANCE] = (
  'Managed Instance Group with DWS does not support '
  'spot instances.')
+
+ unsupported[
+ clouds.CloudImplementationFeatures.
+ HIGH_AVAILABILITY_CONTROLLERS] = (
+ f'High availability controllers are not supported on {cls._REPR}.'
+ )
+
  return unsupported
  
  @classmethod
sky/clouds/ibm.py CHANGED
@@ -50,6 +50,8 @@ class IBM(clouds.Cloud):
  ),
  clouds.CloudImplementationFeatures.OPEN_PORTS:
  (f'Opening ports is currently not supported on {cls._REPR}.'),
+ clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+ ('High availability controllers are not supported on IBM.'),
  }
  if resources.use_spot:
  features[clouds.CloudImplementationFeatures.STOP] = (
sky/clouds/kubernetes.py CHANGED
@@ -429,22 +429,26 @@ class Kubernetes(clouds.Cloud):
  acc_count = k.accelerator_count if k.accelerator_count else 0
  acc_type = k.accelerator_type if k.accelerator_type else None
  
- image_id_dict = resources.image_id
- if image_id_dict is not None:
- # Use custom image specified in resources
- if None in image_id_dict:
- image_id = image_id_dict[None]
+ def _get_image_id(resources: 'resources_lib.Resources') -> str:
+ image_id_dict = resources.image_id
+ if image_id_dict is not None:
+ # Use custom image specified in resources
+ if None in image_id_dict:
+ image_id = image_id_dict[None]
+ else:
+ assert resources.region in image_id_dict, image_id_dict
+ image_id = image_id_dict[resources.region]
+ if image_id.startswith('docker:'):
+ image_id = image_id[len('docker:'):]
  else:
- assert resources.region in image_id_dict, image_id_dict
- image_id = image_id_dict[resources.region]
- if image_id.startswith('docker:'):
- image_id = image_id[len('docker:'):]
- else:
- # Select image based on whether we are using GPUs or not.
- image_id = self.IMAGE_GPU if acc_count > 0 else self.IMAGE_CPU
- # Get the container image ID from the service catalog.
- image_id = service_catalog.get_image_id_from_tag(
- image_id, clouds='kubernetes')
+ # Select image based on whether we are using GPUs or not.
+ image_id = self.IMAGE_GPU if acc_count > 0 else self.IMAGE_CPU
+ # Get the container image ID from the service catalog.
+ image_id = service_catalog.get_image_id_from_tag(
+ image_id, clouds='kubernetes')
+ return image_id
+
+ image_id = _get_image_id(resources)
  # TODO(romilb): Create a lightweight image for SSH jump host
  ssh_jump_image = service_catalog.get_image_id_from_tag(
  self.IMAGE_CPU, clouds='kubernetes')
@@ -540,6 +544,13 @@ class Kubernetes(clouds.Cloud):
  # cpus is <1.
  'num-cpus': str(max(int(cpus), 1)),
  }
+
+ # Get the storage class name for high availability controller's PVC
+ k8s_ha_storage_class_name = skypilot_config.get_nested(
+ ('kubernetes', 'high_availability', 'storage_class_name'),
+ None,
+ override_configs=resources.cluster_config_overrides)
+
  deploy_vars = {
  'instance_type': resources.instance_type,
  'custom_resources': custom_resources,
@@ -574,6 +585,18 @@ class Kubernetes(clouds.Cloud):
  'skypilot_ray_port': constants.SKY_REMOTE_RAY_PORT,
  'ray_worker_start_command': instance_setup.ray_worker_start_command(
  custom_resources, custom_ray_options, no_restart=False),
+ 'k8s_high_availability_deployment_volume_mount_name':
+ (kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME
+ ),
+ 'k8s_high_availability_deployment_volume_mount_path':
+ (kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH
+ ),
+ 'k8s_high_availability_deployment_setup_script_path':
+ (constants.PERSISTENT_SETUP_SCRIPT_PATH),
+ 'k8s_high_availability_deployment_run_script_dir':
+ (constants.PERSISTENT_RUN_SCRIPT_DIR),
+ 'k8s_high_availability_storage_class_name':
+ (k8s_ha_storage_class_name),
  }
  
  # Add kubecontext if it is set. It may be None if SkyPilot is running
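The storage class for the high availability controller's PVC is read from a new nested config key, `kubernetes.high_availability.storage_class_name`, with cluster-level overrides applied. A minimal sketch of such a nested lookup over a config dict; `skypilot_config.get_nested`'s real override-merging lives in SkyPilot itself, and 'gp2' is just an example value:

```python
from typing import Any, Tuple

def get_nested(config: dict, keys: Tuple[str, ...], default: Any = None) -> Any:
    # Walk the key path, returning the default if any level is missing.
    cur: Any = config
    for key in keys:
        if not isinstance(cur, dict) or key not in cur:
            return default
        cur = cur[key]
    return cur

config = {'kubernetes': {'high_availability': {'storage_class_name': 'gp2'}}}
assert get_nested(
    config,
    ('kubernetes', 'high_availability', 'storage_class_name')) == 'gp2'
assert get_nested(config, ('kubernetes', 'missing'), None) is None
```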
sky/clouds/lambda_cloud.py CHANGED
@@ -44,6 +44,7 @@ class Lambda(clouds.Cloud):
  clouds.CloudImplementationFeatures.IMAGE_ID: f'Specifying image ID is not supported in {_REPR}.',
  clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: f'Custom disk tiers are not supported in {_REPR}.',
  clouds.CloudImplementationFeatures.HOST_CONTROLLERS: f'Host controllers are not supported in {_REPR}.',
+ clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS: f'High availability controllers are not supported on {_REPR}.',
  }
  
  PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
sky/clouds/nebius.py CHANGED
@@ -65,6 +65,8 @@ class Nebius(clouds.Cloud):
  '`run` section in task.yaml.'),
  clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
  (f'Custom disk tier is currently not supported on {_REPR}.'),
+ clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+ ('High availability controllers are not supported on Nebius.'),
  }
  # Nebius maximum instance name length defined as <= 63 as a hostname length
  # 63 - 8 - 5 = 50 characters since
sky/clouds/oci.py CHANGED
@@ -69,19 +69,22 @@ class OCI(clouds.Cloud):
  def _unsupported_features_for_resources(
  cls, resources: 'resources_lib.Resources'
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
- features = {
+ unsupported_features = {
  clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
  (f'Migrating disk is currently not supported on {cls._REPR}.'),
  clouds.CloudImplementationFeatures.DOCKER_IMAGE:
  (f'Docker image is currently not supported on {cls._REPR}. '
  'You can try running docker command inside the '
  '`run` section in task.yaml.'),
+ clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+ ('High availability controllers are not supported on '
+ f'{cls._REPR}.'),
  }
  if resources.use_spot:
- features[clouds.CloudImplementationFeatures.STOP] = (
+ unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
  f'Stopping spot instances is currently not supported on '
  f'{cls._REPR}.')
- return features
+ return unsupported_features
  
  @classmethod
  def max_cluster_name_length(cls) -> Optional[int]:
sky/clouds/paperspace.py CHANGED
@@ -41,6 +41,8 @@ class Paperspace(clouds.Cloud):
  clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
  'Custom disk tiers'
  f' is not supported in {_REPR}.',
+ clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+ (f'High availability controllers are not supported in {_REPR}.'),
  }
  _MAX_CLUSTER_NAME_LEN_LIMIT = 120
  _regions: List[clouds.Region] = []
sky/clouds/runpod.py CHANGED
@@ -34,6 +34,8 @@ class RunPod(clouds.Cloud):
  ('Mounting object stores is not supported on RunPod. To read data '
  'from object stores on RunPod, use `mode: COPY` to copy the data '
  'to local disk.'),
+ clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+ ('High availability controllers are not supported on RunPod.'),
  }
  _MAX_CLUSTER_NAME_LEN_LIMIT = 120
  _regions: List[clouds.Region] = []
sky/clouds/scp.py CHANGED
@@ -58,6 +58,8 @@ class SCP(clouds.Cloud):
  (f'Custom disk tiers are not supported in {_REPR}.'),
  clouds.CloudImplementationFeatures.OPEN_PORTS:
  (f'Opening ports is currently not supported on {_REPR}.'),
+ clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+ (f'High availability controllers are not supported on {_REPR}.'),
  }
  
  _INDENT_PREFIX = ' '
sky/clouds/vast.py CHANGED
@@ -29,6 +29,8 @@ class Vast(clouds.Cloud):
  ('Opening ports is currently not supported on Vast.'),
  clouds.CloudImplementationFeatures.STORAGE_MOUNTING:
  ('Mounting object stores is not supported on Vast.'),
+ clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+ ('High availability controllers are not supported on Vast.'),
  }
  #
  # Vast doesn't have a max cluster name limit. This number
sky/clouds/vsphere.py CHANGED
@@ -54,6 +54,8 @@ class Vsphere(clouds.Cloud):
  (f'Custom disk tiers are not supported in {_REPR}.'),
  clouds.CloudImplementationFeatures.OPEN_PORTS:
  (f'Opening ports is currently not supported on {_REPR}.'),
+ clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+ (f'High availability controllers are not supported on {_REPR}.'),
  }
  
  _MAX_CLUSTER_NAME_LEN_LIMIT = 80 # The name can't exceeds 80 characters
sky/dashboard/out/404.html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><title>404: This page could not be found</title><meta name="next-head-count" content="3"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-1be831200e60c5c0.js" defer=""></script><script src="/dashboard/_next/static/WO8lTFPfj-lO3_gDGEiN8/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/WO8lTFPfj-lO3_gDGEiN8/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div style="font-family:system-ui,&quot;Segoe UI&quot;,Roboto,Helvetica,Arial,sans-serif,&quot;Apple Color Emoji&quot;,&quot;Segoe UI Emoji&quot;;height:100vh;text-align:center;display:flex;flex-direction:column;align-items:center;justify-content:center"><div style="line-height:48px"><style>body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}</style><h1 class="next-error-h1" style="display:inline-block;margin:0 20px 0 0;padding-right:23px;font-size:24px;font-weight:500;vertical-align:top">404</h1><div style="display:inline-block"><h2 style="font-size:14px;font-weight:400;line-height:28px">This page could not be found<!-- -->.</h2></div></div></div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"WO8lTFPfj-lO3_gDGEiN8","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><title>404: This page could not be found</title><meta name="next-head-count" content="3"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-1be831200e60c5c0.js" defer=""></script><script src="/dashboard/_next/static/2f-jlOWR_G5mOwCF4RcZz/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/2f-jlOWR_G5mOwCF4RcZz/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div style="font-family:system-ui,&quot;Segoe UI&quot;,Roboto,Helvetica,Arial,sans-serif,&quot;Apple Color Emoji&quot;,&quot;Segoe UI Emoji&quot;;height:100vh;text-align:center;display:flex;flex-direction:column;align-items:center;justify-content:center"><div style="line-height:48px"><style>body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}</style><h1 class="next-error-h1" style="display:inline-block;margin:0 20px 0 0;padding-right:23px;font-size:24px;font-weight:500;vertical-align:top">404</h1><div style="display:inline-block"><h2 style="font-size:14px;font-weight:400;line-height:28px">This page could not be found<!-- -->.</h2></div></div></div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"2f-jlOWR_G5mOwCF4RcZz","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/clusters/[cluster]/[job].html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/678-206dddca808e6d16.js" defer=""></script><script src="/dashboard/_next/static/chunks/979-7bf73a4c7cea0f5c.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/clusters/%5Bcluster%5D/%5Bjob%5D-6ac338bc2239cb45.js" defer=""></script><script src="/dashboard/_next/static/WO8lTFPfj-lO3_gDGEiN8/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/WO8lTFPfj-lO3_gDGEiN8/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div>Loading...</div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/clusters/[cluster]/[job]","query":{},"buildId":"WO8lTFPfj-lO3_gDGEiN8","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/678-206dddca808e6d16.js" defer=""></script><script src="/dashboard/_next/static/chunks/979-7bf73a4c7cea0f5c.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/clusters/%5Bcluster%5D/%5Bjob%5D-6ac338bc2239cb45.js" defer=""></script><script src="/dashboard/_next/static/2f-jlOWR_G5mOwCF4RcZz/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/2f-jlOWR_G5mOwCF4RcZz/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div>Loading...</div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/clusters/[cluster]/[job]","query":{},"buildId":"2f-jlOWR_G5mOwCF4RcZz","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/clusters/[cluster].html CHANGED
@@ -1 +1 @@
- <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/678-206dddca808e6d16.js" defer=""></script><script src="/dashboard/_next/static/chunks/312-c3c8845990db8ffc.js" defer=""></script><script src="/dashboard/_next/static/chunks/979-7bf73a4c7cea0f5c.js" defer=""></script><script src="/dashboard/_next/static/chunks/845-9e60713e0c441abc.js" defer=""></script><script src="/dashboard/_next/static/chunks/236-2db3ee3fba33dd9e.js" defer=""></script><script src="/dashboard/_next/static/chunks/37-0a572fe0dbb89c4d.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/clusters/%5Bcluster%5D-f383db7389368ea7.js" defer=""></script><script src="/dashboard/_next/static/WO8lTFPfj-lO3_gDGEiN8/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/WO8lTFPfj-lO3_gDGEiN8/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div>Loading...</div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/clusters/[cluster]","query":{},"buildId":"WO8lTFPfj-lO3_gDGEiN8","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
+ <!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/678-206dddca808e6d16.js" defer=""></script><script src="/dashboard/_next/static/chunks/312-c3c8845990db8ffc.js" defer=""></script><script src="/dashboard/_next/static/chunks/979-7bf73a4c7cea0f5c.js" defer=""></script><script src="/dashboard/_next/static/chunks/845-9e60713e0c441abc.js" defer=""></script><script src="/dashboard/_next/static/chunks/236-2db3ee3fba33dd9e.js" defer=""></script><script src="/dashboard/_next/static/chunks/37-0a572fe0dbb89c4d.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/clusters/%5Bcluster%5D-f383db7389368ea7.js" defer=""></script><script src="/dashboard/_next/static/2f-jlOWR_G5mOwCF4RcZz/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/2f-jlOWR_G5mOwCF4RcZz/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div>Loading...</div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/clusters/[cluster]","query":{},"buildId":"2f-jlOWR_G5mOwCF4RcZz","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>