skypilot-nightly 1.0.0.dev20250411__py3-none-any.whl → 1.0.0.dev20250413__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/oci.py +2 -2
- sky/authentication.py +2 -2
- sky/backends/backend_utils.py +1 -1
- sky/backends/cloud_vm_ray_backend.py +3 -3
- sky/check.py +1 -1
- sky/cli.py +51 -47
- sky/client/cli.py +51 -47
- sky/client/sdk.py +2 -1
- sky/clouds/aws.py +2 -2
- sky/clouds/cloud.py +3 -2
- sky/clouds/kubernetes.py +20 -3
- sky/clouds/nebius.py +2 -4
- sky/clouds/oci.py +2 -2
- sky/clouds/utils/oci_utils.py +1 -1
- sky/core.py +12 -17
- sky/data/mounting_utils.py +34 -10
- sky/exceptions.py +1 -1
- sky/execution.py +5 -4
- sky/provision/instance_setup.py +3 -1
- sky/provision/kubernetes/config.py +41 -36
- sky/provision/kubernetes/instance.py +4 -7
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +54 -0
- sky/provision/kubernetes/network_utils.py +1 -1
- sky/provision/kubernetes/utils.py +51 -35
- sky/server/requests/payloads.py +2 -0
- sky/setup_files/dependencies.py +1 -1
- sky/skylet/constants.py +2 -2
- sky/skypilot_config.py +179 -41
- sky/templates/kubernetes-ray.yml.j2 +66 -25
- sky/templates/websocket_proxy.py +41 -2
- sky/utils/config_utils.py +1 -1
- sky/utils/controller_utils.py +1 -1
- sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
- sky/utils/kubernetes/rsync_helper.sh +26 -11
- {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/METADATA +3 -1
- {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/RECORD +41 -42
- sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml +0 -10
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +0 -68
- {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
|
|
5
5
|
import urllib.request
|
6
6
|
|
7
7
|
# Replaced with the current commit when building the wheels.
|
8
|
-
_SKYPILOT_COMMIT_SHA = '
|
8
|
+
_SKYPILOT_COMMIT_SHA = '197c8dd3ea85d23323477e7d7cf69e8dc1b693c6'
|
9
9
|
|
10
10
|
|
11
11
|
def _get_git_commit():
|
@@ -35,7 +35,7 @@ def _get_git_commit():
|
|
35
35
|
|
36
36
|
|
37
37
|
__commit__ = _get_git_commit()
|
38
|
-
__version__ = '1.0.0.
|
38
|
+
__version__ = '1.0.0.dev20250413'
|
39
39
|
__root_dir__ = os.path.dirname(os.path.abspath(__file__))
|
40
40
|
|
41
41
|
|
sky/adaptors/oci.py
CHANGED
@@ -13,7 +13,7 @@ from sky.clouds.utils import oci_utils
|
|
13
13
|
# effect.
|
14
14
|
logging.getLogger('oci.circuit_breaker').setLevel(logging.WARNING)
|
15
15
|
|
16
|
-
|
16
|
+
OCI_CONFIG_PATH = '~/.oci/config'
|
17
17
|
ENV_VAR_OCI_CONFIG = 'OCI_CONFIG'
|
18
18
|
|
19
19
|
oci = common.LazyImport(
|
@@ -23,7 +23,7 @@ oci = common.LazyImport(
|
|
23
23
|
|
24
24
|
|
25
25
|
def get_config_file() -> str:
|
26
|
-
conf_file_path =
|
26
|
+
conf_file_path = OCI_CONFIG_PATH
|
27
27
|
config_path_via_env_var = os.environ.get(ENV_VAR_OCI_CONFIG)
|
28
28
|
if config_path_via_env_var is not None:
|
29
29
|
conf_file_path = config_path_via_env_var
|
sky/authentication.py
CHANGED
@@ -382,10 +382,10 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
|
|
382
382
|
network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
|
383
383
|
network_mode_str)
|
384
384
|
except ValueError as e:
|
385
|
-
# Add message saying "Please check: ~/.sky/
|
385
|
+
# Add message saying "Please check: ~/.sky/skyconfig.yaml" to the error
|
386
386
|
# message.
|
387
387
|
with ux_utils.print_exception_no_traceback():
|
388
|
-
raise ValueError(str(e) + ' Please check: ~/.sky/
|
388
|
+
raise ValueError(str(e) + ' Please check: ~/.sky/skyconfig.yaml.') \
|
389
389
|
from None
|
390
390
|
_, public_key_path = get_or_generate_keys()
|
391
391
|
|
sky/backends/backend_utils.py
CHANGED
@@ -682,7 +682,7 @@ def write_cluster_config(
|
|
682
682
|
ssh_proxy_command = ssh_proxy_command_config[region_name]
|
683
683
|
logger.debug(f'Using ssh_proxy_command: {ssh_proxy_command!r}')
|
684
684
|
|
685
|
-
# User-supplied global instance tags from ~/.sky/
|
685
|
+
# User-supplied global instance tags from ~/.sky/skyconfig.yaml.
|
686
686
|
labels = skypilot_config.get_nested((str(cloud).lower(), 'labels'), {})
|
687
687
|
# labels is a dict, which is guaranteed by the type check in
|
688
688
|
# schemas.py
|
@@ -1473,7 +1473,7 @@ class RetryingVmProvisioner(object):
|
|
1473
1473
|
f'invalid cloud credentials: '
|
1474
1474
|
f'{common_utils.format_exception(e)}')
|
1475
1475
|
except exceptions.InvalidCloudConfigs as e:
|
1476
|
-
# Failed due to invalid user configs in ~/.sky/
|
1476
|
+
# Failed due to invalid user configs in ~/.sky/skyconfig.yaml.
|
1477
1477
|
logger.warning(f'{common_utils.format_exception(e)}')
|
1478
1478
|
# We should block the entire cloud if the user config is
|
1479
1479
|
# invalid.
|
@@ -2065,10 +2065,10 @@ class RetryingVmProvisioner(object):
|
|
2065
2065
|
(clouds.Kubernetes, clouds.RunPod)) and
|
2066
2066
|
controller_utils.Controllers.from_name(cluster_name)
|
2067
2067
|
is not None):
|
2068
|
-
assert (clouds.CloudImplementationFeatures.
|
2068
|
+
assert (clouds.CloudImplementationFeatures.AUTOSTOP
|
2069
2069
|
in requested_features), requested_features
|
2070
2070
|
requested_features.remove(
|
2071
|
-
clouds.CloudImplementationFeatures.
|
2071
|
+
clouds.CloudImplementationFeatures.AUTOSTOP)
|
2072
2072
|
|
2073
2073
|
# Skip if to_provision.cloud does not support requested features
|
2074
2074
|
to_provision.cloud.check_features_are_supported(
|
sky/check.py
CHANGED
@@ -142,7 +142,7 @@ def check_capabilities(
|
|
142
142
|
if disallowed_cloud_names:
|
143
143
|
disallowed_clouds_hint = (
|
144
144
|
'\nNote: The following clouds were disabled because they were not '
|
145
|
-
'included in allowed_clouds in ~/.sky/
|
145
|
+
'included in allowed_clouds in ~/.sky/skyconfig.yaml: '
|
146
146
|
f'{", ".join([c for c in disallowed_cloud_names])}')
|
147
147
|
if not all_enabled_clouds:
|
148
148
|
echo(
|
sky/cli.py
CHANGED
@@ -35,7 +35,8 @@ import sys
|
|
35
35
|
import textwrap
|
36
36
|
import traceback
|
37
37
|
import typing
|
38
|
-
from typing import Any, Callable, Dict, Generator, List, Optional, Tuple,
|
38
|
+
from typing import (Any, Callable, Dict, Generator, List, Optional, Set, Tuple,
|
39
|
+
Union)
|
39
40
|
|
40
41
|
import click
|
41
42
|
import colorama
|
@@ -134,49 +135,51 @@ def _get_cluster_records_and_set_ssh_config(
|
|
134
135
|
# Update the SSH config for all clusters
|
135
136
|
for record in cluster_records:
|
136
137
|
handle = record['handle']
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
if (handle is not None and handle.cached_external_ips is not None and
|
141
|
-
'credentials' in record):
|
142
|
-
credentials = record['credentials']
|
143
|
-
if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
|
144
|
-
# Replace the proxy command to proxy through the SkyPilot API
|
145
|
-
# server with websocket.
|
146
|
-
key_path = (
|
147
|
-
cluster_utils.SSHConfigHelper.generate_local_key_file(
|
148
|
-
handle.cluster_name, credentials))
|
149
|
-
# Instead of directly use websocket_proxy.py, we add an
|
150
|
-
# additional proxy, so that ssh can use the head pod in the
|
151
|
-
# cluster to jump to worker pods.
|
152
|
-
proxy_command = (
|
153
|
-
f'ssh -tt -i {key_path} '
|
154
|
-
'-o StrictHostKeyChecking=no '
|
155
|
-
'-o UserKnownHostsFile=/dev/null '
|
156
|
-
'-o IdentitiesOnly=yes '
|
157
|
-
'-W %h:%p '
|
158
|
-
f'{handle.ssh_user}@127.0.0.1 '
|
159
|
-
'-o ProxyCommand='
|
160
|
-
# TODO(zhwu): write the template to a temp file, don't use
|
161
|
-
# the one in skypilot repo, to avoid changing the file when
|
162
|
-
# updating skypilot.
|
163
|
-
f'\'{sys.executable} {sky.__root_dir__}/templates/'
|
164
|
-
f'websocket_proxy.py '
|
165
|
-
f'{server_common.get_server_url().split("://")[1]} '
|
166
|
-
f'{handle.cluster_name}\'')
|
167
|
-
credentials['ssh_proxy_command'] = proxy_command
|
168
|
-
cluster_utils.SSHConfigHelper.add_cluster(
|
169
|
-
handle.cluster_name,
|
170
|
-
handle.cached_external_ips,
|
171
|
-
credentials,
|
172
|
-
handle.cached_external_ssh_ports,
|
173
|
-
handle.docker_user,
|
174
|
-
handle.ssh_user,
|
175
|
-
)
|
176
|
-
else:
|
138
|
+
|
139
|
+
if not (handle is not None and handle.cached_external_ips is not None
|
140
|
+
and 'credentials' in record):
|
177
141
|
# If the cluster is not UP or does not have credentials available,
|
178
142
|
# we need to remove the cluster from the SSH config.
|
179
143
|
cluster_utils.SSHConfigHelper.remove_cluster(record['name'])
|
144
|
+
continue
|
145
|
+
|
146
|
+
# During the failover, even though a cluster does not exist, the handle
|
147
|
+
# can still exist in the record, and we check for credentials to avoid
|
148
|
+
# updating the SSH config for non-existent clusters.
|
149
|
+
credentials = record['credentials']
|
150
|
+
if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
|
151
|
+
# Replace the proxy command to proxy through the SkyPilot API
|
152
|
+
# server with websocket.
|
153
|
+
key_path = (cluster_utils.SSHConfigHelper.generate_local_key_file(
|
154
|
+
handle.cluster_name, credentials))
|
155
|
+
# Instead of directly use websocket_proxy.py, we add an
|
156
|
+
# additional proxy, so that ssh can use the head pod in the
|
157
|
+
# cluster to jump to worker pods.
|
158
|
+
proxy_command = (
|
159
|
+
f'ssh -tt -i {key_path} '
|
160
|
+
'-o StrictHostKeyChecking=no '
|
161
|
+
'-o UserKnownHostsFile=/dev/null '
|
162
|
+
'-o IdentitiesOnly=yes '
|
163
|
+
'-W %h:%p '
|
164
|
+
f'{handle.ssh_user}@127.0.0.1 '
|
165
|
+
'-o ProxyCommand='
|
166
|
+
# TODO(zhwu): write the template to a temp file, don't use
|
167
|
+
# the one in skypilot repo, to avoid changing the file when
|
168
|
+
# updating skypilot.
|
169
|
+
f'\'{sys.executable} {sky.__root_dir__}/templates/'
|
170
|
+
f'websocket_proxy.py '
|
171
|
+
f'{server_common.get_server_url()} '
|
172
|
+
f'{handle.cluster_name}\'')
|
173
|
+
credentials['ssh_proxy_command'] = proxy_command
|
174
|
+
|
175
|
+
cluster_utils.SSHConfigHelper.add_cluster(
|
176
|
+
handle.cluster_name,
|
177
|
+
handle.cached_external_ips,
|
178
|
+
credentials,
|
179
|
+
handle.cached_external_ssh_ports,
|
180
|
+
handle.docker_user,
|
181
|
+
handle.ssh_user,
|
182
|
+
)
|
180
183
|
|
181
184
|
# Clean up SSH configs for clusters that do not exist.
|
182
185
|
#
|
@@ -186,14 +189,15 @@ def _get_cluster_records_and_set_ssh_config(
|
|
186
189
|
# removing clusters, because SkyPilot has no idea whether to remove
|
187
190
|
# ssh config of a cluster from another user.
|
188
191
|
clusters_exists = set(record['name'] for record in cluster_records)
|
192
|
+
clusters_to_remove: Set[str] = set()
|
189
193
|
if clusters is not None:
|
190
|
-
|
191
|
-
if cluster not in clusters_exists:
|
192
|
-
cluster_utils.SSHConfigHelper.remove_cluster(cluster)
|
194
|
+
clusters_to_remove = set(clusters) - clusters_exists
|
193
195
|
elif all_users:
|
194
|
-
|
195
|
-
|
196
|
-
|
196
|
+
clusters_to_remove = set(cluster_utils.SSHConfigHelper.
|
197
|
+
list_cluster_names()) - clusters_exists
|
198
|
+
|
199
|
+
for cluster_name in clusters_to_remove:
|
200
|
+
cluster_utils.SSHConfigHelper.remove_cluster(cluster_name)
|
197
201
|
|
198
202
|
return cluster_records
|
199
203
|
|
sky/client/cli.py
CHANGED
@@ -35,7 +35,8 @@ import sys
|
|
35
35
|
import textwrap
|
36
36
|
import traceback
|
37
37
|
import typing
|
38
|
-
from typing import Any, Callable, Dict, Generator, List, Optional, Tuple,
|
38
|
+
from typing import (Any, Callable, Dict, Generator, List, Optional, Set, Tuple,
|
39
|
+
Union)
|
39
40
|
|
40
41
|
import click
|
41
42
|
import colorama
|
@@ -134,49 +135,51 @@ def _get_cluster_records_and_set_ssh_config(
|
|
134
135
|
# Update the SSH config for all clusters
|
135
136
|
for record in cluster_records:
|
136
137
|
handle = record['handle']
|
137
|
-
|
138
|
-
|
139
|
-
|
140
|
-
if (handle is not None and handle.cached_external_ips is not None and
|
141
|
-
'credentials' in record):
|
142
|
-
credentials = record['credentials']
|
143
|
-
if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
|
144
|
-
# Replace the proxy command to proxy through the SkyPilot API
|
145
|
-
# server with websocket.
|
146
|
-
key_path = (
|
147
|
-
cluster_utils.SSHConfigHelper.generate_local_key_file(
|
148
|
-
handle.cluster_name, credentials))
|
149
|
-
# Instead of directly use websocket_proxy.py, we add an
|
150
|
-
# additional proxy, so that ssh can use the head pod in the
|
151
|
-
# cluster to jump to worker pods.
|
152
|
-
proxy_command = (
|
153
|
-
f'ssh -tt -i {key_path} '
|
154
|
-
'-o StrictHostKeyChecking=no '
|
155
|
-
'-o UserKnownHostsFile=/dev/null '
|
156
|
-
'-o IdentitiesOnly=yes '
|
157
|
-
'-W %h:%p '
|
158
|
-
f'{handle.ssh_user}@127.0.0.1 '
|
159
|
-
'-o ProxyCommand='
|
160
|
-
# TODO(zhwu): write the template to a temp file, don't use
|
161
|
-
# the one in skypilot repo, to avoid changing the file when
|
162
|
-
# updating skypilot.
|
163
|
-
f'\'{sys.executable} {sky.__root_dir__}/templates/'
|
164
|
-
f'websocket_proxy.py '
|
165
|
-
f'{server_common.get_server_url().split("://")[1]} '
|
166
|
-
f'{handle.cluster_name}\'')
|
167
|
-
credentials['ssh_proxy_command'] = proxy_command
|
168
|
-
cluster_utils.SSHConfigHelper.add_cluster(
|
169
|
-
handle.cluster_name,
|
170
|
-
handle.cached_external_ips,
|
171
|
-
credentials,
|
172
|
-
handle.cached_external_ssh_ports,
|
173
|
-
handle.docker_user,
|
174
|
-
handle.ssh_user,
|
175
|
-
)
|
176
|
-
else:
|
138
|
+
|
139
|
+
if not (handle is not None and handle.cached_external_ips is not None
|
140
|
+
and 'credentials' in record):
|
177
141
|
# If the cluster is not UP or does not have credentials available,
|
178
142
|
# we need to remove the cluster from the SSH config.
|
179
143
|
cluster_utils.SSHConfigHelper.remove_cluster(record['name'])
|
144
|
+
continue
|
145
|
+
|
146
|
+
# During the failover, even though a cluster does not exist, the handle
|
147
|
+
# can still exist in the record, and we check for credentials to avoid
|
148
|
+
# updating the SSH config for non-existent clusters.
|
149
|
+
credentials = record['credentials']
|
150
|
+
if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
|
151
|
+
# Replace the proxy command to proxy through the SkyPilot API
|
152
|
+
# server with websocket.
|
153
|
+
key_path = (cluster_utils.SSHConfigHelper.generate_local_key_file(
|
154
|
+
handle.cluster_name, credentials))
|
155
|
+
# Instead of directly use websocket_proxy.py, we add an
|
156
|
+
# additional proxy, so that ssh can use the head pod in the
|
157
|
+
# cluster to jump to worker pods.
|
158
|
+
proxy_command = (
|
159
|
+
f'ssh -tt -i {key_path} '
|
160
|
+
'-o StrictHostKeyChecking=no '
|
161
|
+
'-o UserKnownHostsFile=/dev/null '
|
162
|
+
'-o IdentitiesOnly=yes '
|
163
|
+
'-W %h:%p '
|
164
|
+
f'{handle.ssh_user}@127.0.0.1 '
|
165
|
+
'-o ProxyCommand='
|
166
|
+
# TODO(zhwu): write the template to a temp file, don't use
|
167
|
+
# the one in skypilot repo, to avoid changing the file when
|
168
|
+
# updating skypilot.
|
169
|
+
f'\'{sys.executable} {sky.__root_dir__}/templates/'
|
170
|
+
f'websocket_proxy.py '
|
171
|
+
f'{server_common.get_server_url()} '
|
172
|
+
f'{handle.cluster_name}\'')
|
173
|
+
credentials['ssh_proxy_command'] = proxy_command
|
174
|
+
|
175
|
+
cluster_utils.SSHConfigHelper.add_cluster(
|
176
|
+
handle.cluster_name,
|
177
|
+
handle.cached_external_ips,
|
178
|
+
credentials,
|
179
|
+
handle.cached_external_ssh_ports,
|
180
|
+
handle.docker_user,
|
181
|
+
handle.ssh_user,
|
182
|
+
)
|
180
183
|
|
181
184
|
# Clean up SSH configs for clusters that do not exist.
|
182
185
|
#
|
@@ -186,14 +189,15 @@ def _get_cluster_records_and_set_ssh_config(
|
|
186
189
|
# removing clusters, because SkyPilot has no idea whether to remove
|
187
190
|
# ssh config of a cluster from another user.
|
188
191
|
clusters_exists = set(record['name'] for record in cluster_records)
|
192
|
+
clusters_to_remove: Set[str] = set()
|
189
193
|
if clusters is not None:
|
190
|
-
|
191
|
-
if cluster not in clusters_exists:
|
192
|
-
cluster_utils.SSHConfigHelper.remove_cluster(cluster)
|
194
|
+
clusters_to_remove = set(clusters) - clusters_exists
|
193
195
|
elif all_users:
|
194
|
-
|
195
|
-
|
196
|
-
|
196
|
+
clusters_to_remove = set(cluster_utils.SSHConfigHelper.
|
197
|
+
list_cluster_names()) - clusters_exists
|
198
|
+
|
199
|
+
for cluster_name in clusters_to_remove:
|
200
|
+
cluster_utils.SSHConfigHelper.remove_cluster(cluster_name)
|
197
201
|
|
198
202
|
return cluster_records
|
199
203
|
|
sky/client/sdk.py
CHANGED
@@ -1812,7 +1812,8 @@ def api_login(endpoint: Optional[str] = None) -> None:
|
|
1812
1812
|
server_common.check_server_healthy(endpoint)
|
1813
1813
|
|
1814
1814
|
# Set the endpoint in the config file
|
1815
|
-
config_path = pathlib.Path(
|
1815
|
+
config_path = pathlib.Path(
|
1816
|
+
skypilot_config.get_user_config_path()).expanduser()
|
1816
1817
|
with filelock.FileLock(config_path.with_suffix('.lock')):
|
1817
1818
|
if not skypilot_config.loaded():
|
1818
1819
|
config_path.touch()
|
sky/clouds/aws.py
CHANGED
@@ -472,10 +472,10 @@ class AWS(clouds.Cloud):
|
|
472
472
|
with ux_utils.print_exception_no_traceback():
|
473
473
|
logger.warning(
|
474
474
|
f'Skip opening ports {resources.ports} for cluster {cluster_name!r}, '
|
475
|
-
'as `aws.security_group_name` in `~/.sky/
|
475
|
+
'as `aws.security_group_name` in `~/.sky/skyconfig.yaml` is specified as '
|
476
476
|
f' {security_group!r}. Please make sure the specified security group '
|
477
477
|
'has requested ports setup; or, leave out `aws.security_group_name` '
|
478
|
-
'in `~/.sky/
|
478
|
+
'in `~/.sky/skyconfig.yaml`.')
|
479
479
|
|
480
480
|
return {
|
481
481
|
'instance_type': r.instance_type,
|
sky/clouds/cloud.py
CHANGED
@@ -37,7 +37,7 @@ class CloudImplementationFeatures(enum.Enum):
|
|
37
37
|
_cloud_unsupported_features in all clouds to make sure the
|
38
38
|
check_features_are_supported() works as expected.
|
39
39
|
"""
|
40
|
-
STOP = 'stop'
|
40
|
+
STOP = 'stop'
|
41
41
|
MULTI_NODE = 'multi-node'
|
42
42
|
CLONE_DISK_FROM_CLUSTER = 'clone_disk_from_cluster'
|
43
43
|
IMAGE_ID = 'image_id'
|
@@ -47,7 +47,8 @@ class CloudImplementationFeatures(enum.Enum):
|
|
47
47
|
OPEN_PORTS = 'open_ports'
|
48
48
|
STORAGE_MOUNTING = 'storage_mounting'
|
49
49
|
HOST_CONTROLLERS = 'host_controllers' # Can run jobs/serve controllers
|
50
|
-
|
50
|
+
AUTOSTOP = 'autostop' # Pod/VM can stop itself
|
51
|
+
AUTODOWN = 'autodown' # Pod/VM can down itself
|
51
52
|
|
52
53
|
|
53
54
|
# Use str, enum.Enum to allow CloudCapability to be used as a string.
|
sky/clouds/kubernetes.py
CHANGED
@@ -35,6 +35,10 @@ CREDENTIAL_PATH = os.environ.get('KUBECONFIG', DEFAULT_KUBECONFIG_PATH)
|
|
35
35
|
# E.g., FUSE device manager daemonset is run in this namespace.
|
36
36
|
_SKYPILOT_SYSTEM_NAMESPACE = 'skypilot-system'
|
37
37
|
|
38
|
+
# Shared directory to communicate with fusermount-server, refer to
|
39
|
+
# addons/fuse-proxy/README.md for more details.
|
40
|
+
_FUSERMOUNT_SHARED_DIR = '/var/run/fusermount'
|
41
|
+
|
38
42
|
|
39
43
|
@registry.CLOUD_REGISTRY.register(aliases=['k8s'])
|
40
44
|
class Kubernetes(clouds.Cloud):
|
@@ -110,9 +114,13 @@ class Kubernetes(clouds.Cloud):
|
|
110
114
|
# Controllers cannot spin up new pods with exec auth.
|
111
115
|
unsupported_features[
|
112
116
|
clouds.CloudImplementationFeatures.HOST_CONTROLLERS] = message
|
113
|
-
# Pod does not have permissions to
|
117
|
+
# Pod does not have permissions to down itself with exec auth.
|
114
118
|
unsupported_features[
|
115
|
-
clouds.CloudImplementationFeatures.
|
119
|
+
clouds.CloudImplementationFeatures.AUTODOWN] = message
|
120
|
+
unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
|
121
|
+
'Stopping clusters is not supported on Kubernetes.')
|
122
|
+
unsupported_features[clouds.CloudImplementationFeatures.AUTOSTOP] = (
|
123
|
+
'Auto-stop is not supported on Kubernetes.')
|
116
124
|
# Allow spot instances if supported by the cluster
|
117
125
|
try:
|
118
126
|
spot_label_key, _ = kubernetes_utils.get_spot_label(context)
|
@@ -551,8 +559,9 @@ class Kubernetes(clouds.Cloud):
|
|
551
559
|
'k8s_service_account_name': k8s_service_account_name,
|
552
560
|
'k8s_automount_sa_token': k8s_automount_sa_token,
|
553
561
|
'k8s_fuse_device_required': fuse_device_required,
|
554
|
-
# Namespace to run the
|
562
|
+
# Namespace to run the fusermount-server daemonset in
|
555
563
|
'k8s_skypilot_system_namespace': _SKYPILOT_SYSTEM_NAMESPACE,
|
564
|
+
'k8s_fusermount_shared_dir': _FUSERMOUNT_SHARED_DIR,
|
556
565
|
'k8s_spot_label_key': spot_label_key,
|
557
566
|
'k8s_spot_label_value': spot_label_value,
|
558
567
|
'tpu_requested': tpu_requested,
|
@@ -658,6 +667,14 @@ class Kubernetes(clouds.Cloud):
|
|
658
667
|
def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
|
659
668
|
"""Checks if the user has access credentials to
|
660
669
|
Kubernetes."""
|
670
|
+
# Check for port forward dependencies
|
671
|
+
reasons = kubernetes_utils.check_port_forward_mode_dependencies(False)
|
672
|
+
if reasons is not None:
|
673
|
+
formatted = '\n'.join(
|
674
|
+
[reasons[0]] +
|
675
|
+
[f'{cls._INDENT_PREFIX}' + r for r in reasons[1:]])
|
676
|
+
return (False, formatted)
|
677
|
+
|
661
678
|
# Test using python API
|
662
679
|
try:
|
663
680
|
existing_allowed_contexts = cls.existing_allowed_contexts()
|
sky/clouds/nebius.py
CHANGED
@@ -53,10 +53,8 @@ class Nebius(clouds.Cloud):
|
|
53
53
|
"""Nebius GPU Cloud"""
|
54
54
|
_REPR = 'Nebius'
|
55
55
|
_CLOUD_UNSUPPORTED_FEATURES = {
|
56
|
-
clouds.CloudImplementationFeatures.
|
57
|
-
('Autodown
|
58
|
-
# Autostop functionality can be implemented, but currently,
|
59
|
-
# there is only a single flag for both autostop and autodown.
|
56
|
+
clouds.CloudImplementationFeatures.AUTODOWN:
|
57
|
+
('Autodown not supported. Can\'t delete OS disk.'),
|
60
58
|
clouds.CloudImplementationFeatures.SPOT_INSTANCE:
|
61
59
|
('Spot is not supported, as Nebius API does not implement spot.'),
|
62
60
|
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
|
sky/clouds/oci.py
CHANGED
@@ -9,8 +9,8 @@ History:
|
|
9
9
|
file path resolution (by os.path.expanduser) when construct the file
|
10
10
|
mounts. This bug will cause the created workder nodes located in different
|
11
11
|
compartment and VCN than the header node if user specifies compartment_id
|
12
|
-
in the sky config file, because the ~/.sky/
|
13
|
-
remote machine.
|
12
|
+
in the sky config file, because the ~/.sky/skyconfig.yaml is not
|
13
|
+
sync-ed to the remote machine.
|
14
14
|
The workaround is set the sky config file path using ENV before running
|
15
15
|
the sky launch: export SKYPILOT_CONFIG=/home/ubuntu/.sky/config.yaml
|
16
16
|
- Hysun He (hysun.he@oracle.com) @ Oct 12, 2024:
|
sky/clouds/utils/oci_utils.py
CHANGED
@@ -147,7 +147,7 @@ class OCIConfig:
|
|
147
147
|
if config_path_via_env_var is not None:
|
148
148
|
config_path = config_path_via_env_var
|
149
149
|
else:
|
150
|
-
config_path = skypilot_config.
|
150
|
+
config_path = skypilot_config.get_user_config_path()
|
151
151
|
return config_path
|
152
152
|
|
153
153
|
@classmethod
|
sky/core.py
CHANGED
@@ -629,26 +629,21 @@ def autostop(
|
|
629
629
|
raise exceptions.NotSupportedError(
|
630
630
|
f'{operation} cluster {cluster_name!r} with backend '
|
631
631
|
f'{backend.__class__.__name__!r} is not supported.')
|
632
|
-
# Check autostop is implemented for cloud
|
633
632
|
cloud = handle.launched_resources.cloud
|
634
|
-
if
|
635
|
-
try:
|
636
|
-
cloud.check_features_are_supported(
|
637
|
-
handle.launched_resources,
|
638
|
-
{clouds.CloudImplementationFeatures.STOP})
|
639
|
-
except exceptions.NotSupportedError as e:
|
640
|
-
raise exceptions.NotSupportedError(
|
641
|
-
f'{colorama.Fore.YELLOW}Scheduling autostop on cluster '
|
642
|
-
f'{cluster_name!r}...skipped.{colorama.Style.RESET_ALL}\n'
|
643
|
-
f' {_stop_not_supported_message(handle.launched_resources)}.'
|
644
|
-
) from e
|
645
|
-
|
646
|
-
# Check if autodown is required and supported
|
633
|
+
# Check if autostop/autodown is required and supported
|
647
634
|
if not is_cancel:
|
648
635
|
try:
|
649
|
-
|
650
|
-
|
651
|
-
|
636
|
+
if down:
|
637
|
+
cloud.check_features_are_supported(
|
638
|
+
handle.launched_resources,
|
639
|
+
{clouds.CloudImplementationFeatures.AUTODOWN})
|
640
|
+
else:
|
641
|
+
cloud.check_features_are_supported(
|
642
|
+
handle.launched_resources,
|
643
|
+
{clouds.CloudImplementationFeatures.STOP})
|
644
|
+
cloud.check_features_are_supported(
|
645
|
+
handle.launched_resources,
|
646
|
+
{clouds.CloudImplementationFeatures.AUTOSTOP})
|
652
647
|
except exceptions.NotSupportedError as e:
|
653
648
|
raise exceptions.NotSupportedError(
|
654
649
|
f'{colorama.Fore.YELLOW}{operation} on cluster '
|
sky/data/mounting_utils.py
CHANGED
@@ -30,9 +30,17 @@ _BLOBFUSE_CACHE_DIR = ('~/.sky/blobfuse2_cache/'
|
|
30
30
|
# https://github.com/rclone/rclone/releases
|
31
31
|
RCLONE_VERSION = 'v1.68.2'
|
32
32
|
|
33
|
+
# A wrapper for goofys to choose the logging mechanism based on environment.
|
34
|
+
_GOOFYS_WRAPPER = ('$(if [ -S /dev/log ] ; then '
|
35
|
+
'echo "goofys"; '
|
36
|
+
'else '
|
37
|
+
'echo "goofys --log-file $(mktemp -t goofys.XXXX.log)"; '
|
38
|
+
'fi)')
|
39
|
+
|
33
40
|
|
34
41
|
def get_s3_mount_install_cmd() -> str:
|
35
42
|
"""Returns a command to install S3 mount utility goofys."""
|
43
|
+
# TODO(aylei): maintain our goofys fork under skypilot-org
|
36
44
|
install_cmd = ('ARCH=$(uname -m) && '
|
37
45
|
'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
|
38
46
|
' echo "goofys is not supported on $ARCH" && '
|
@@ -40,8 +48,8 @@ def get_s3_mount_install_cmd() -> str:
|
|
40
48
|
'else '
|
41
49
|
' ARCH_SUFFIX="amd64"; '
|
42
50
|
'fi && '
|
43
|
-
'sudo wget -nc https://github.com/
|
44
|
-
'releases/download/0.24.0-
|
51
|
+
'sudo wget -nc https://github.com/aylei/goofys/'
|
52
|
+
'releases/download/0.24.0-aylei-upstream/goofys '
|
45
53
|
'-O /usr/local/bin/goofys && '
|
46
54
|
'sudo chmod 755 /usr/local/bin/goofys')
|
47
55
|
return install_cmd
|
@@ -56,7 +64,7 @@ def get_s3_mount_cmd(bucket_name: str,
|
|
56
64
|
_bucket_sub_path = ''
|
57
65
|
else:
|
58
66
|
_bucket_sub_path = f':{_bucket_sub_path}'
|
59
|
-
mount_cmd = ('
|
67
|
+
mount_cmd = (f'{_GOOFYS_WRAPPER} -o allow_other '
|
60
68
|
f'--stat-cache-ttl {_STAT_CACHE_TTL} '
|
61
69
|
f'--type-cache-ttl {_TYPE_CACHE_TTL} '
|
62
70
|
f'{bucket_name}{_bucket_sub_path} {mount_path}')
|
@@ -73,7 +81,8 @@ def get_nebius_mount_cmd(nebius_profile_name: str,
|
|
73
81
|
_bucket_sub_path = ''
|
74
82
|
else:
|
75
83
|
_bucket_sub_path = f':{_bucket_sub_path}'
|
76
|
-
mount_cmd = (f'AWS_PROFILE={nebius_profile_name}
|
84
|
+
mount_cmd = (f'AWS_PROFILE={nebius_profile_name} {_GOOFYS_WRAPPER} '
|
85
|
+
'-o allow_other '
|
77
86
|
f'--stat-cache-ttl {_STAT_CACHE_TTL} '
|
78
87
|
f'--type-cache-ttl {_TYPE_CACHE_TTL} '
|
79
88
|
f'--endpoint {endpoint_url} '
|
@@ -185,14 +194,28 @@ def get_az_mount_cmd(container_name: str,
|
|
185
194
|
bucket_sub_path_arg = ''
|
186
195
|
else:
|
187
196
|
bucket_sub_path_arg = f'--subdirectory={_bucket_sub_path}/ '
|
197
|
+
mount_options = '-o allow_other -o default_permissions'
|
188
198
|
# TODO(zpoint): clear old cache that has been created in the previous boot.
|
199
|
+
blobfuse2_cmd = ('blobfuse2 --no-symlinks -o umask=022 '
|
200
|
+
f'--tmp-path {cache_path}_$({remote_boot_time_cmd}) '
|
201
|
+
f'{bucket_sub_path_arg}'
|
202
|
+
f'--container-name {container_name}')
|
203
|
+
# 1. Set -o nonempty to bypass empty directory check of blobfuse2 when using
|
204
|
+
# fusermount-wrapper, since the mount is delegated to fusermount and
|
205
|
+
# blobfuse2 only get the mounted fd.
|
206
|
+
# 2. {} is the mount point placeholder that will be replaced with the
|
207
|
+
# mounted fd by fusermount-wrapper.
|
208
|
+
wrapped = (f'fusermount-wrapper -m {mount_path} {mount_options} '
|
209
|
+
f'-- {blobfuse2_cmd} -o nonempty {{}}')
|
210
|
+
original = f'{blobfuse2_cmd} {mount_options} {mount_path}'
|
211
|
+
# If fusermount-wrapper is available, use it to wrap the blobfuse2 command
|
212
|
+
# to avoid requiring root privilege.
|
213
|
+
# TODO(aylei): feeling hacky, refactor this.
|
214
|
+
get_mount_cmd = ('command -v fusermount-wrapper >/dev/null 2>&1 && '
|
215
|
+
f'echo "{wrapped}" || echo "{original}"')
|
189
216
|
mount_cmd = (f'AZURE_STORAGE_ACCOUNT={storage_account_name} '
|
190
217
|
f'{key_env_var} '
|
191
|
-
f'
|
192
|
-
'-o umask=022 -o default_permissions '
|
193
|
-
f'--tmp-path {cache_path}_$({remote_boot_time_cmd}) '
|
194
|
-
f'{bucket_sub_path_arg}'
|
195
|
-
f'--container-name {container_name}')
|
218
|
+
f'$({get_mount_cmd})')
|
196
219
|
return mount_cmd
|
197
220
|
|
198
221
|
|
@@ -209,7 +232,8 @@ def get_r2_mount_cmd(r2_credentials_path: str,
|
|
209
232
|
else:
|
210
233
|
_bucket_sub_path = f':{_bucket_sub_path}'
|
211
234
|
mount_cmd = (f'AWS_SHARED_CREDENTIALS_FILE={r2_credentials_path} '
|
212
|
-
f'AWS_PROFILE={r2_profile_name}
|
235
|
+
f'AWS_PROFILE={r2_profile_name} {_GOOFYS_WRAPPER} '
|
236
|
+
'-o allow_other '
|
213
237
|
f'--stat-cache-ttl {_STAT_CACHE_TTL} '
|
214
238
|
f'--type-cache-ttl {_TYPE_CACHE_TTL} '
|
215
239
|
f'--endpoint {endpoint_url} '
|
sky/exceptions.py
CHANGED
@@ -265,7 +265,7 @@ class CommandError(SkyPilotExcludeArgsBaseException):
|
|
265
265
|
# Chunk the command to avoid overflow.
|
266
266
|
command = command[:100] + '...'
|
267
267
|
message = (f'Command {command} failed with return code '
|
268
|
-
f'{returncode}.\n{error_msg}')
|
268
|
+
f'{returncode}.\n{error_msg}\n{detailed_reason}')
|
269
269
|
super().__init__(message)
|
270
270
|
|
271
271
|
|
sky/execution.py
CHANGED
@@ -237,11 +237,12 @@ def _execute(
|
|
237
237
|
if Stage.DOWN in stages:
|
238
238
|
stages.remove(Stage.DOWN)
|
239
239
|
if idle_minutes_to_autostop >= 0:
|
240
|
-
|
241
|
-
clouds.CloudImplementationFeatures.AUTO_TERMINATE)
|
242
|
-
if not down:
|
240
|
+
if down:
|
243
241
|
requested_features.add(
|
244
|
-
clouds.CloudImplementationFeatures.
|
242
|
+
clouds.CloudImplementationFeatures.AUTODOWN)
|
243
|
+
else:
|
244
|
+
requested_features.add(
|
245
|
+
clouds.CloudImplementationFeatures.AUTOSTOP)
|
245
246
|
# NOTE: in general we may not have sufficiently specified info
|
246
247
|
# (cloud/resource) to check STOP_SPOT_INSTANCE here. This is checked in
|
247
248
|
# the backend.
|
sky/provision/instance_setup.py
CHANGED
@@ -44,7 +44,9 @@ _DUMP_RAY_PORTS = (
|
|
44
44
|
|
45
45
|
_RAY_PORT_COMMAND = (
|
46
46
|
f'RAY_PORT=$({constants.SKY_PYTHON_CMD} -c '
|
47
|
-
'"from sky
|
47
|
+
'"from sky import sky_logging\n'
|
48
|
+
'with sky_logging.silent(): '
|
49
|
+
'from sky.skylet import job_lib; print(job_lib.get_ray_port())" '
|
48
50
|
'2> /dev/null || echo 6379);'
|
49
51
|
f'{constants.SKY_PYTHON_CMD} -c "from sky.utils import message_utils; '
|
50
52
|
'print(message_utils.encode_payload({\'ray_port\': $RAY_PORT}))"')
|