skypilot-nightly 1.0.0.dev20250426__py3-none-any.whl → 1.0.0.dev20250428__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +19 -2
- sky/backends/cloud_vm_ray_backend.py +33 -8
- sky/backends/local_docker_backend.py +1 -2
- sky/cli.py +1 -1
- sky/client/cli.py +1 -1
- sky/clouds/aws.py +12 -6
- sky/clouds/azure.py +3 -0
- sky/clouds/cloud.py +3 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +3 -0
- sky/clouds/fluidstack.py +3 -0
- sky/clouds/gcp.py +7 -0
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +38 -15
- sky/clouds/lambda_cloud.py +1 -0
- sky/clouds/nebius.py +2 -0
- sky/clouds/oci.py +6 -3
- sky/clouds/paperspace.py +2 -0
- sky/clouds/runpod.py +2 -0
- sky/clouds/scp.py +2 -0
- sky/clouds/vast.py +2 -0
- sky/clouds/vsphere.py +2 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/exceptions.py +6 -0
- sky/execution.py +19 -4
- sky/global_user_state.py +1 -0
- sky/provision/common.py +2 -5
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +280 -94
- sky/provision/kubernetes/network.py +1 -1
- sky/provision/kubernetes/utils.py +10 -0
- sky/provision/provisioner.py +6 -0
- sky/serve/replica_managers.py +51 -5
- sky/serve/serve_state.py +41 -0
- sky/serve/service.py +108 -63
- sky/server/requests/executor.py +4 -4
- sky/skylet/constants.py +7 -0
- sky/task.py +1 -1
- sky/templates/kubernetes-ray.yml.j2 +122 -2
- sky/utils/command_runner.py +17 -3
- sky/utils/command_runner.pyi +2 -0
- sky/utils/controller_utils.py +24 -0
- sky/utils/kubernetes/rsync_helper.sh +20 -4
- sky/utils/schemas.py +13 -0
- {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/RECORD +59 -59
- {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/WHEEL +1 -1
- /sky/dashboard/out/_next/static/{WO8lTFPfj-lO3_gDGEiN8 → 2f-jlOWR_G5mOwCF4RcZz}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{WO8lTFPfj-lO3_gDGEiN8 → 2f-jlOWR_G5mOwCF4RcZz}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request

 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '
+_SKYPILOT_COMMIT_SHA = '607eee0a24e50718d783e92081f141f45cac6cda'


 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():


 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20250426'
+__version__ = '1.0.0.dev20250428'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))

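Note: the two modified lines are the build-time stamps. A quick way to confirm which nightly and commit an installed wheel carries (a minimal check; __version__ and __commit__ are the module-level attributes set above):

    # Print the stamped metadata of the installed wheel.
    import sky

    print(sky.__version__)  # '1.0.0.dev20250428' for this wheel
    print(sky.__commit__)   # '607eee0a24e50718d783e92081f141f45cac6cda'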
sky/backends/backend_utils.py
CHANGED
@@ -179,6 +179,9 @@ _RAY_YAML_KEYS_TO_RESTORE_EXCEPTIONS = [
     ('available_node_types', 'ray.head.default', 'node_config', 'UserData'),
     ('available_node_types', 'ray.head.default', 'node_config',
      'azure_arm_parameters', 'cloudInitSetupCommands'),
+    ('available_node_types', 'ray_head_default', 'node_config', 'pvc_spec'),
+    ('available_node_types', 'ray_head_default', 'node_config',
+     'deployment_spec'),
 ]
 # These keys are expected to change when provisioning on an existing cluster,
 # but they don't actually represent a change that requires re-provisioning the
@@ -705,6 +708,13 @@ def write_cluster_config(
     is_custom_docker = ('true' if to_provision.extract_docker_image()
                         is not None else 'false')

+    # Here, if users specify the controller to be high availability, we will
+    # provision a high availability controller. Whether the cloud supports
+    # this feature has been checked by
+    # CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS
+    high_availability_specified = controller_utils.high_availability_specified(
+        cluster_name_on_cloud)
+
     # Use a tmp file path to avoid incomplete YAML file being re-used in the
     # future.
     tmp_yaml_path = yaml_path + '.tmp'
@@ -790,6 +800,9 @@ def write_cluster_config(
             'sky_wheel_hash': wheel_hash,
             # Authentication (optional).
             **auth_config,
+
+            # High availability
+            'high_availability': high_availability_specified,
         }),
         output_path=tmp_yaml_path)
     config_dict['cluster_name'] = cluster_name
@@ -802,8 +815,12 @@ def write_cluster_config(
             cluster_config_overrides=to_provision.cluster_config_overrides)
         kubernetes_utils.combine_metadata_fields(tmp_yaml_path)
         yaml_obj = common_utils.read_yaml(tmp_yaml_path)
-        pod_config = yaml_obj['available_node_types'][
-            'node_config']
+        pod_config: Dict[str, Any] = yaml_obj['available_node_types'][
+            'ray_head_default']['node_config']
+
+        # Check pod spec only. For high availability controllers, we deploy pvc & deployment for the controller. Read kubernetes-ray.yml.j2 for more details.
+        pod_config.pop('deployment_spec', None)
+        pod_config.pop('pvc_spec', None)
         valid, message = kubernetes_utils.check_pod_config(pod_config)
         if not valid:
             raise exceptions.InvalidCloudConfigs(
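Note: the pop-before-validate step above exists because, for high availability controllers, node_config now carries the PVC and Deployment specs next to the pod fields, while check_pod_config only understands a pod spec. A minimal sketch of that step (the placeholder dicts are illustrative, not real specs):

    # The HA-related specs are stripped from a copy of the node config
    # before the remaining pod spec is validated.
    node_config = {
        'pvc_spec': {'kind': 'PersistentVolumeClaim'},  # placeholder
        'deployment_spec': {'kind': 'Deployment'},      # placeholder
        'kind': 'Pod',                                  # the pod fields
    }

    pod_config = dict(node_config)
    pod_config.pop('deployment_spec', None)
    pod_config.pop('pvc_spec', None)
    # valid, message = kubernetes_utils.check_pod_config(pod_config)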
sky/backends/cloud_vm_ray_backend.py
CHANGED
@@ -1583,6 +1583,10 @@ class RetryingVmProvisioner(object):
             except provision_common.StopFailoverError:
                 with ux_utils.print_exception_no_traceback():
                     raise
+            except exceptions.InconsistentHighAvailabilityError:
+                # No teardown happens for this error.
+                with ux_utils.print_exception_no_traceback():
+                    raise
             except Exception as e:  # pylint: disable=broad-except
                 # NOTE: We try to cleanup the cluster even if the previous
                 # cluster does not exist. Also we are fast at
@@ -2032,6 +2036,7 @@ class RetryingVmProvisioner(object):
             # Recheck cluster name as the 'except:' block below may
             # change the cloud assignment.
             common_utils.check_cluster_name_is_valid(cluster_name)
+
             if dryrun:
                 cloud_user = None
             else:
@@ -2459,6 +2464,7 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
                     'Tried to use cached cluster info, but it\'s missing for '
                     f'cluster "{self.cluster_name}"')
             self._update_cluster_info()
+
         assert self.cached_cluster_info is not None, self
         runners = provision_lib.get_command_runners(
             self.cached_cluster_info.provider_name, self.cached_cluster_info,
@@ -2689,6 +2695,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                   self._optimize_target) or common.OptimizeTarget.COST
         self._requested_features = kwargs.pop('requested_features',
                                               self._requested_features)
+        self._dump_final_script = kwargs.pop('dump_final_script', False)
         assert not kwargs, f'Unexpected kwargs: {kwargs}'

     def check_resources_fit_cluster(
@@ -3272,18 +3279,30 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 env_vars=setup_envs)
             encoded_script = shlex.quote(setup_script)

-            def
+            def _dump_final_script(
+                    setup_script: str,
+                    target_dir: str = remote_setup_file_name) -> None:
                 with tempfile.NamedTemporaryFile('w', prefix='sky_setup_') as f:
                     f.write(setup_script)
                     f.flush()
                     setup_sh_path = f.name
                     runner.rsync(source=setup_sh_path,
-                                 target=
+                                 target=target_dir,
                                  up=True,
                                  stream_logs=False)

+            # Always dump the full setup script to the persistent path first
+            # In high availability mode, we need to dump the full setup script
+            # to a persistent path BEFORE any other operations. This ensures
+            # that if the pod restarts, it can find and execute the complete
+            # setup script, rather than a reference to a temporary file that
+            # would no longer exist after restart.
+            if self._dump_final_script:
+                _dump_final_script(setup_script,
+                                   constants.PERSISTENT_SETUP_SCRIPT_PATH)
+
             if detach_setup or _is_command_length_over_limit(encoded_script):
-
+                _dump_final_script(setup_script)
                 create_script_code = 'true'
             else:
                 create_script_code = (f'{{ echo {encoded_script} > '
@@ -3335,7 +3354,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                         'Failed to run setup command inline due to '
                         'command length limit. Dumping setup script to '
                         'file and running it with SSH.')
-
+                    _dump_final_script(setup_script)
                     returncode = _run_setup(setup_cmd)

             def error_message() -> str:
@@ -3426,14 +3445,14 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
             job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])

-            def _dump_code_to_file(codegen: str
+            def _dump_code_to_file(codegen: str,
+                                   target_dir: str = SKY_REMOTE_APP_DIR) -> None:
                 runners = handle.get_command_runners()
                 head_runner = runners[0]
                 with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
                     fp.write(codegen)
                     fp.flush()
-                    script_path = os.path.join(
-                        f'sky_job_{job_id}')
+                    script_path = os.path.join(target_dir, f'sky_job_{job_id}')
                     # We choose to sync code + exec, because the alternative of 'ray
                     # submit' may not work as it may use system python (python2) to
                     # execute the script. Happens for AWS.
@@ -3442,6 +3461,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                       up=True,
                                       stream_logs=False)

+            # Should also be ealier than _is_command_length_over_limit
+            # Same reason as in _setup
+            if self._dump_final_script:
+                _dump_code_to_file(job_submit_cmd,
+                                   constants.PERSISTENT_RUN_SCRIPT_DIR)
+
             if _is_command_length_over_limit(job_submit_cmd):
                 _dump_code_to_file(codegen)
                 job_submit_cmd = f'{mkdir_code} && {code}'
@@ -3457,7 +3482,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             # We cannot set the managed job to PENDING state in the job template
             # (jobs-controller.yaml.j2), as it may need to wait for the run
             # commands to be scheduled on the job controller in high-load cases.
-            job_submit_cmd
+            job_submit_cmd += ' && ' + managed_job_code

             returncode, stdout, stderr = self.run_on_head(handle,
                                                           job_submit_cmd,
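Note: the new dump_final_script flag drives a persist-before-run pattern: the final setup and job-submit scripts are written to stable paths on the cluster before the command-length fallback logic runs, so a restarted high availability controller pod can replay them. A standalone sketch of the idea (persist_script and sync are illustrative stand-ins for the backend helpers and runner.rsync; the PERSISTENT_* constants live in sky/skylet/constants.py per the file list above):

    import tempfile
    from typing import Callable

    def persist_script(script: str, target: str,
                       sync: Callable[[str, str], None]) -> None:
        # Write the script to a local temp file, then sync it to a stable
        # remote path so it survives a pod restart.
        with tempfile.NamedTemporaryFile('w', prefix='sky_setup_') as f:
            f.write(script)
            f.flush()
            sync(f.name, target)

    # Paraphrasing the hunks above:
    #   if self._dump_final_script:
    #       persist_script(setup_script, constants.PERSISTENT_SETUP_SCRIPT_PATH,
    #                      lambda src, dst: runner.rsync(source=src, target=dst,
    #                                                    up=True, stream_logs=False))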
sky/backends/local_docker_backend.py
CHANGED
@@ -276,7 +276,6 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
                      detach_run: bool,
                      dryrun: bool = False) -> None:
         """ Launches the container."""
-
         if detach_run:
             raise NotImplementedError('detach_run=True is not supported in '
                                       'LocalDockerBackend.')
@@ -364,7 +363,7 @@ class LocalDockerBackend(backends.Backend['LocalDockerResourceHandle']):
             if k.startswith(_DOCKER_LABEL_PREFIX):
                 # Remove 'skymeta_' from key
                 metadata[k[len(_DOCKER_LABEL_PREFIX):]] = v
-        self.images[c.name] =
+        self.images[c.name] = (c.image, metadata)
         self.containers[c.name] = c

     def _execute_task_one_node(self, handle: LocalDockerResourceHandle,
sky/cli.py
CHANGED
@@ -162,7 +162,7 @@ def _get_cluster_records_and_set_ssh_config(
                 '-o StrictHostKeyChecking=no '
                 '-o UserKnownHostsFile=/dev/null '
                 '-o IdentitiesOnly=yes '
-                '-W %h:%p '
+                '-W \'[%h]:%p\' '
                 f'{handle.ssh_user}@127.0.0.1 '
                 '-o ProxyCommand='
                 # TODO(zhwu): write the template to a temp file, don't use
sky/client/cli.py
CHANGED
@@ -162,7 +162,7 @@ def _get_cluster_records_and_set_ssh_config(
                 '-o StrictHostKeyChecking=no '
                 '-o UserKnownHostsFile=/dev/null '
                 '-o IdentitiesOnly=yes '
-                '-W %h:%p '
+                '-W \'[%h]:%p\' '
                 f'{handle.ssh_user}@127.0.0.1 '
                 '-o ProxyCommand='
                 # TODO(zhwu): write the template to a temp file, don't use
sky/clouds/aws.py
CHANGED
@@ -161,13 +161,19 @@ class AWS(clouds.Cloud):
     def _unsupported_features_for_resources(
             cls, resources: 'resources_lib.Resources'
     ) -> Dict[clouds.CloudImplementationFeatures, str]:
+        unsupported_features = {}
         if resources.use_spot:
-
-
-
-
-
-
+            unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
+                f'Stopping spot instances is currently not supported on {cls._REPR}.'
+            )
+
+        unsupported_features[
+            clouds.CloudImplementationFeatures.
+            HIGH_AVAILABILITY_CONTROLLERS] = (
+                f'High availability controllers are not supported on {cls._REPR}.'
+            )
+
+        return unsupported_features

     @classmethod
     def max_cluster_name_length(cls) -> Optional[int]:
sky/clouds/azure.py
CHANGED
@@ -90,6 +90,9 @@ class Azure(clouds.Cloud):
         features = {
             clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
                 (f'Migrating disk is currently not supported on {cls._REPR}.'),
+            clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS: (
+                f'High availability controllers are not supported on {cls._REPR}.'
+            ),
         }
         if resources.use_spot:
             features[clouds.CloudImplementationFeatures.STOP] = (
sky/clouds/cloud.py
CHANGED
@@ -47,6 +47,9 @@ class CloudImplementationFeatures(enum.Enum):
     OPEN_PORTS = 'open_ports'
     STORAGE_MOUNTING = 'storage_mounting'
     HOST_CONTROLLERS = 'host_controllers'  # Can run jobs/serve controllers
+    HIGH_AVAILABILITY_CONTROLLERS = ('high_availability_controllers'
+                                    )  # Controller can auto-restart
+    AUTO_TERMINATE = 'auto_terminate'  # Pod/VM can stop or down itself
     AUTOSTOP = 'autostop'  # Pod/VM can stop itself
     AUTODOWN = 'autodown'  # Pod/VM can down itself

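Note: each cloud reports capabilities it lacks through _unsupported_features_for_resources, and the per-cloud edits that follow all add the new enum member in the same shape. A hypothetical cloud opting out of high availability controllers might look like this (MyCloud is illustrative, not part of the package):

    from typing import Dict

    from sky import clouds

    class MyCloud(clouds.Cloud):  # hypothetical, for illustration only
        _REPR = 'MyCloud'

        @classmethod
        def _unsupported_features_for_resources(
            cls, resources
        ) -> Dict[clouds.CloudImplementationFeatures, str]:
            # Feature -> human-readable reason it is unsupported.
            return {
                clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
                    (f'High availability controllers are not supported on '
                     f'{cls._REPR}.'),
            }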
sky/clouds/cudo.py
CHANGED
@@ -68,6 +68,8 @@ class Cudo(clouds.Cloud):
             'Cudo Compute cannot host a controller as it does not '
             'autostopping, which will leave the controller to run indefinitely.'
         ),
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported on Cudo.'),
     }
     _MAX_CLUSTER_NAME_LEN_LIMIT = 60

sky/clouds/do.py
CHANGED
@@ -33,6 +33,9 @@ class DO(clouds.Cloud):
         clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
             'Custom disk tiers'
             f' is not supported in {_REPR}.',
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported in '
+             f'{_REPR}.'),
     }
     # DO maximum node name length defined as <= 255
     # https://docs.digitalocean.com/reference/api/api-reference/#operation/droplets_create
sky/clouds/fluidstack.py
CHANGED
@@ -56,6 +56,9 @@ class Fluidstack(clouds.Cloud):
         clouds.CloudImplementationFeatures.HOST_CONTROLLERS:
             'Host controllers'
             f' are not supported in {_REPR}.',
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported in '
+             f'{_REPR}.'),
     }
     # Using the latest SkyPilot provisioner API to provision and check status.
     PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
sky/clouds/gcp.py
CHANGED
@@ -232,6 +232,13 @@ class GCP(clouds.Cloud):
             unsupported[clouds.CloudImplementationFeatures.SPOT_INSTANCE] = (
                 'Managed Instance Group with DWS does not support '
                 'spot instances.')
+
+        unsupported[
+            clouds.CloudImplementationFeatures.
+            HIGH_AVAILABILITY_CONTROLLERS] = (
+                f'High availability controllers are not supported on {cls._REPR}.'
+            )
+
         return unsupported

     @classmethod
sky/clouds/ibm.py
CHANGED
@@ -50,6 +50,8 @@ class IBM(clouds.Cloud):
             ),
             clouds.CloudImplementationFeatures.OPEN_PORTS:
                 (f'Opening ports is currently not supported on {cls._REPR}.'),
+            clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+                ('High availability controllers are not supported on IBM.'),
         }
         if resources.use_spot:
             features[clouds.CloudImplementationFeatures.STOP] = (
sky/clouds/kubernetes.py
CHANGED
@@ -429,22 +429,26 @@ class Kubernetes(clouds.Cloud):
         acc_count = k.accelerator_count if k.accelerator_count else 0
         acc_type = k.accelerator_type if k.accelerator_type else None

-
-
-
-
-
+        def _get_image_id(resources: 'resources_lib.Resources') -> str:
+            image_id_dict = resources.image_id
+            if image_id_dict is not None:
+                # Use custom image specified in resources
+                if None in image_id_dict:
+                    image_id = image_id_dict[None]
+                else:
+                    assert resources.region in image_id_dict, image_id_dict
+                    image_id = image_id_dict[resources.region]
+                if image_id.startswith('docker:'):
+                    image_id = image_id[len('docker:'):]
             else:
-
-                image_id =
-
-                image_id =
-
-
-
-
-                image_id = service_catalog.get_image_id_from_tag(
-                    image_id, clouds='kubernetes')
+                # Select image based on whether we are using GPUs or not.
+                image_id = self.IMAGE_GPU if acc_count > 0 else self.IMAGE_CPU
+                # Get the container image ID from the service catalog.
+                image_id = service_catalog.get_image_id_from_tag(
+                    image_id, clouds='kubernetes')
+            return image_id
+
+        image_id = _get_image_id(resources)
         # TODO(romilb): Create a lightweight image for SSH jump host
         ssh_jump_image = service_catalog.get_image_id_from_tag(
             self.IMAGE_CPU, clouds='kubernetes')
@@ -540,6 +544,13 @@ class Kubernetes(clouds.Cloud):
             # cpus is <1.
             'num-cpus': str(max(int(cpus), 1)),
         }
+
+        # Get the storage class name for high availability controller's PVC
+        k8s_ha_storage_class_name = skypilot_config.get_nested(
+            ('kubernetes', 'high_availability', 'storage_class_name'),
+            None,
+            override_configs=resources.cluster_config_overrides)
+
         deploy_vars = {
             'instance_type': resources.instance_type,
             'custom_resources': custom_resources,
@@ -574,6 +585,18 @@ class Kubernetes(clouds.Cloud):
             'skypilot_ray_port': constants.SKY_REMOTE_RAY_PORT,
             'ray_worker_start_command': instance_setup.ray_worker_start_command(
                 custom_resources, custom_ray_options, no_restart=False),
+            'k8s_high_availability_deployment_volume_mount_name':
+                (kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME
+                ),
+            'k8s_high_availability_deployment_volume_mount_path':
+                (kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH
+                ),
+            'k8s_high_availability_deployment_setup_script_path':
+                (constants.PERSISTENT_SETUP_SCRIPT_PATH),
+            'k8s_high_availability_deployment_run_script_dir':
+                (constants.PERSISTENT_RUN_SCRIPT_DIR),
+            'k8s_high_availability_storage_class_name':
+                (k8s_ha_storage_class_name),
         }

         # Add kubecontext if it is set. It may be None if SkyPilot is running
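Note: the storage class for the high availability controller's PVC is resolved from the nested config key kubernetes.high_availability.storage_class_name, falling back to None (the cluster's default StorageClass) when unset. A minimal reproduction of that lookup and the user-facing config it reads (the concrete class name is a made-up example):

    from sky import skypilot_config

    # Returns None when the key is absent from the user's SkyPilot config.
    storage_class = skypilot_config.get_nested(
        ('kubernetes', 'high_availability', 'storage_class_name'), None)

    # Corresponding ~/.sky/config.yaml snippet (value is illustrative):
    #
    # kubernetes:
    #   high_availability:
    #     storage_class_name: fast-ssd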
sky/clouds/lambda_cloud.py
CHANGED
@@ -44,6 +44,7 @@ class Lambda(clouds.Cloud):
         clouds.CloudImplementationFeatures.IMAGE_ID: f'Specifying image ID is not supported in {_REPR}.',
         clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER: f'Custom disk tiers are not supported in {_REPR}.',
         clouds.CloudImplementationFeatures.HOST_CONTROLLERS: f'Host controllers are not supported in {_REPR}.',
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS: f'High availability controllers are not supported on {_REPR}.',
     }

     PROVISIONER_VERSION = clouds.ProvisionerVersion.SKYPILOT
sky/clouds/nebius.py
CHANGED
@@ -65,6 +65,8 @@ class Nebius(clouds.Cloud):
             '`run` section in task.yaml.'),
         clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
             (f'Custom disk tier is currently not supported on {_REPR}.'),
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported on Nebius.'),
     }
     # Nebius maximum instance name length defined as <= 63 as a hostname length
     # 63 - 8 - 5 = 50 characters since
sky/clouds/oci.py
CHANGED
@@ -69,19 +69,22 @@ class OCI(clouds.Cloud):
     def _unsupported_features_for_resources(
             cls, resources: 'resources_lib.Resources'
     ) -> Dict[clouds.CloudImplementationFeatures, str]:
-
+        unsupported_features = {
             clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
                 (f'Migrating disk is currently not supported on {cls._REPR}.'),
             clouds.CloudImplementationFeatures.DOCKER_IMAGE:
                 (f'Docker image is currently not supported on {cls._REPR}. '
                  'You can try running docker command inside the '
                  '`run` section in task.yaml.'),
+            clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+                ('High availability controllers are not supported on '
+                 f'{cls._REPR}.'),
         }
         if resources.use_spot:
-
+            unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
                 f'Stopping spot instances is currently not supported on '
                 f'{cls._REPR}.')
-        return
+        return unsupported_features

     @classmethod
     def max_cluster_name_length(cls) -> Optional[int]:
sky/clouds/paperspace.py
CHANGED
@@ -41,6 +41,8 @@ class Paperspace(clouds.Cloud):
         clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
             'Custom disk tiers'
             f' is not supported in {_REPR}.',
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            (f'High availability controllers are not supported in {_REPR}.'),
     }
     _MAX_CLUSTER_NAME_LEN_LIMIT = 120
     _regions: List[clouds.Region] = []
sky/clouds/runpod.py
CHANGED
@@ -34,6 +34,8 @@ class RunPod(clouds.Cloud):
             ('Mounting object stores is not supported on RunPod. To read data '
              'from object stores on RunPod, use `mode: COPY` to copy the data '
              'to local disk.'),
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported on RunPod.'),
     }
     _MAX_CLUSTER_NAME_LEN_LIMIT = 120
     _regions: List[clouds.Region] = []
sky/clouds/scp.py
CHANGED
@@ -58,6 +58,8 @@ class SCP(clouds.Cloud):
             (f'Custom disk tiers are not supported in {_REPR}.'),
         clouds.CloudImplementationFeatures.OPEN_PORTS:
             (f'Opening ports is currently not supported on {_REPR}.'),
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            (f'High availability controllers are not supported on {_REPR}.'),
     }

     _INDENT_PREFIX = ' '
sky/clouds/vast.py
CHANGED
@@ -29,6 +29,8 @@ class Vast(clouds.Cloud):
             ('Opening ports is currently not supported on Vast.'),
         clouds.CloudImplementationFeatures.STORAGE_MOUNTING:
             ('Mounting object stores is not supported on Vast.'),
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            ('High availability controllers are not supported on Vast.'),
     }
     #
     # Vast doesn't have a max cluster name limit. This number
sky/clouds/vsphere.py
CHANGED
@@ -54,6 +54,8 @@ class Vsphere(clouds.Cloud):
             (f'Custom disk tiers are not supported in {_REPR}.'),
         clouds.CloudImplementationFeatures.OPEN_PORTS:
             (f'Opening ports is currently not supported on {_REPR}.'),
+        clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
+            (f'High availability controllers are not supported on {_REPR}.'),
     }

     _MAX_CLUSTER_NAME_LEN_LIMIT = 80  # The name can't exceeds 80 characters
sky/dashboard/out/404.html
CHANGED
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><title>404: This page could not be found</title><meta name="next-head-count" content="3"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-1be831200e60c5c0.js" defer=""></script><script src="/dashboard/_next/static/
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><title>404: This page could not be found</title><meta name="next-head-count" content="3"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_error-1be831200e60c5c0.js" defer=""></script><script src="/dashboard/_next/static/2f-jlOWR_G5mOwCF4RcZz/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/2f-jlOWR_G5mOwCF4RcZz/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div style="font-family:system-ui,"Segoe UI",Roboto,Helvetica,Arial,sans-serif,"Apple Color Emoji","Segoe UI Emoji";height:100vh;text-align:center;display:flex;flex-direction:column;align-items:center;justify-content:center"><div style="line-height:48px"><style>body{color:#000;background:#fff;margin:0}.next-error-h1{border-right:1px solid rgba(0,0,0,.3)}@media (prefers-color-scheme:dark){body{color:#fff;background:#000}.next-error-h1{border-right:1px solid rgba(255,255,255,.3)}}</style><h1 class="next-error-h1" style="display:inline-block;margin:0 20px 0 0;padding-right:23px;font-size:24px;font-weight:500;vertical-align:top">404</h1><div style="display:inline-block"><h2 style="font-size:14px;font-weight:400;line-height:28px">This page could not be found<!-- -->.</h2></div></div></div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{"statusCode":404}},"page":"/_error","query":{},"buildId":"2f-jlOWR_G5mOwCF4RcZz","assetPrefix":"/dashboard","nextExport":true,"isFallback":false,"gip":true,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/clusters/[cluster]/[job].html
CHANGED
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/678-206dddca808e6d16.js" defer=""></script><script src="/dashboard/_next/static/chunks/979-7bf73a4c7cea0f5c.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/clusters/%5Bcluster%5D/%5Bjob%5D-6ac338bc2239cb45.js" defer=""></script><script src="/dashboard/_next/static/
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/678-206dddca808e6d16.js" defer=""></script><script src="/dashboard/_next/static/chunks/979-7bf73a4c7cea0f5c.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/clusters/%5Bcluster%5D/%5Bjob%5D-6ac338bc2239cb45.js" defer=""></script><script src="/dashboard/_next/static/2f-jlOWR_G5mOwCF4RcZz/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/2f-jlOWR_G5mOwCF4RcZz/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div>Loading...</div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/clusters/[cluster]/[job]","query":{},"buildId":"2f-jlOWR_G5mOwCF4RcZz","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>
sky/dashboard/out/clusters/[cluster].html
CHANGED
@@ -1 +1 @@
-<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/678-206dddca808e6d16.js" defer=""></script><script src="/dashboard/_next/static/chunks/312-c3c8845990db8ffc.js" defer=""></script><script src="/dashboard/_next/static/chunks/979-7bf73a4c7cea0f5c.js" defer=""></script><script src="/dashboard/_next/static/chunks/845-9e60713e0c441abc.js" defer=""></script><script src="/dashboard/_next/static/chunks/236-2db3ee3fba33dd9e.js" defer=""></script><script src="/dashboard/_next/static/chunks/37-0a572fe0dbb89c4d.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/clusters/%5Bcluster%5D-f383db7389368ea7.js" defer=""></script><script src="/dashboard/_next/static/
+<!DOCTYPE html><html><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><meta name="next-head-count" content="2"/><link rel="preload" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" as="style"/><link rel="stylesheet" href="/dashboard/_next/static/css/c6933bbb2ce7f4dd.css" data-n-g=""/><noscript data-n-css=""></noscript><script defer="" nomodule="" src="/dashboard/_next/static/chunks/polyfills-78c92fac7aa8fdd8.js"></script><script src="/dashboard/_next/static/chunks/webpack-830f59b8404e96b8.js" defer=""></script><script src="/dashboard/_next/static/chunks/framework-87d061ee6ed71b28.js" defer=""></script><script src="/dashboard/_next/static/chunks/main-e0e2335212e72357.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js" defer=""></script><script src="/dashboard/_next/static/chunks/678-206dddca808e6d16.js" defer=""></script><script src="/dashboard/_next/static/chunks/312-c3c8845990db8ffc.js" defer=""></script><script src="/dashboard/_next/static/chunks/979-7bf73a4c7cea0f5c.js" defer=""></script><script src="/dashboard/_next/static/chunks/845-9e60713e0c441abc.js" defer=""></script><script src="/dashboard/_next/static/chunks/236-2db3ee3fba33dd9e.js" defer=""></script><script src="/dashboard/_next/static/chunks/37-0a572fe0dbb89c4d.js" defer=""></script><script src="/dashboard/_next/static/chunks/pages/clusters/%5Bcluster%5D-f383db7389368ea7.js" defer=""></script><script src="/dashboard/_next/static/2f-jlOWR_G5mOwCF4RcZz/_buildManifest.js" defer=""></script><script src="/dashboard/_next/static/2f-jlOWR_G5mOwCF4RcZz/_ssgManifest.js" defer=""></script></head><body><div id="__next"><div>Loading...</div></div><script id="__NEXT_DATA__" type="application/json">{"props":{"pageProps":{}},"page":"/clusters/[cluster]","query":{},"buildId":"2f-jlOWR_G5mOwCF4RcZz","assetPrefix":"/dashboard","nextExport":true,"autoExport":true,"isFallback":false,"scriptLoader":[]}</script></body></html>