skypilot-nightly 1.0.0.dev20250410__py3-none-any.whl → 1.0.0.dev20250412__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/oci.py +2 -2
- sky/authentication.py +2 -2
- sky/backends/backend_utils.py +1 -1
- sky/backends/cloud_vm_ray_backend.py +3 -3
- sky/check.py +1 -1
- sky/cli.py +51 -47
- sky/client/cli.py +51 -47
- sky/client/common.py +4 -2
- sky/client/sdk.py +60 -27
- sky/clouds/aws.py +2 -2
- sky/clouds/cloud.py +3 -2
- sky/clouds/kubernetes.py +20 -3
- sky/clouds/nebius.py +2 -4
- sky/clouds/oci.py +2 -2
- sky/clouds/utils/oci_utils.py +1 -1
- sky/core.py +12 -17
- sky/data/mounting_utils.py +34 -10
- sky/exceptions.py +1 -1
- sky/execution.py +5 -4
- sky/jobs/client/sdk.py +5 -0
- sky/optimizer.py +1 -2
- sky/provision/instance_setup.py +3 -1
- sky/provision/kubernetes/config.py +41 -36
- sky/provision/kubernetes/instance.py +4 -7
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +54 -0
- sky/provision/kubernetes/network_utils.py +1 -1
- sky/provision/kubernetes/utils.py +51 -35
- sky/serve/client/sdk.py +6 -0
- sky/server/common.py +16 -1
- sky/server/constants.py +5 -0
- sky/server/requests/payloads.py +2 -0
- sky/setup_files/dependencies.py +1 -1
- sky/skylet/constants.py +2 -2
- sky/skypilot_config.py +197 -70
- sky/templates/kubernetes-ray.yml.j2 +66 -25
- sky/templates/websocket_proxy.py +41 -2
- sky/utils/config_utils.py +1 -1
- sky/utils/controller_utils.py +1 -1
- sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
- sky/utils/kubernetes/rsync_helper.sh +26 -11
- {skypilot_nightly-1.0.0.dev20250410.dist-info → skypilot_nightly-1.0.0.dev20250412.dist-info}/METADATA +3 -1
- {skypilot_nightly-1.0.0.dev20250410.dist-info → skypilot_nightly-1.0.0.dev20250412.dist-info}/RECORD +47 -48
- sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml +0 -10
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +0 -68
- {skypilot_nightly-1.0.0.dev20250410.dist-info → skypilot_nightly-1.0.0.dev20250412.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250410.dist-info → skypilot_nightly-1.0.0.dev20250412.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250410.dist-info → skypilot_nightly-1.0.0.dev20250412.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250410.dist-info → skypilot_nightly-1.0.0.dev20250412.dist-info}/top_level.txt +0 -0
sky/clouds/aws.py
CHANGED
@@ -472,10 +472,10 @@ class AWS(clouds.Cloud):
|
|
472
472
|
with ux_utils.print_exception_no_traceback():
|
473
473
|
logger.warning(
|
474
474
|
f'Skip opening ports {resources.ports} for cluster {cluster_name!r}, '
|
475
|
-
'as `aws.security_group_name` in `~/.sky/
|
475
|
+
'as `aws.security_group_name` in `~/.sky/skyconfig.yaml` is specified as '
|
476
476
|
f' {security_group!r}. Please make sure the specified security group '
|
477
477
|
'has requested ports setup; or, leave out `aws.security_group_name` '
|
478
|
-
'in `~/.sky/
|
478
|
+
'in `~/.sky/skyconfig.yaml`.')
|
479
479
|
|
480
480
|
return {
|
481
481
|
'instance_type': r.instance_type,
|
sky/clouds/cloud.py
CHANGED
@@ -37,7 +37,7 @@ class CloudImplementationFeatures(enum.Enum):
|
|
37
37
|
_cloud_unsupported_features in all clouds to make sure the
|
38
38
|
check_features_are_supported() works as expected.
|
39
39
|
"""
|
40
|
-
STOP = 'stop'
|
40
|
+
STOP = 'stop'
|
41
41
|
MULTI_NODE = 'multi-node'
|
42
42
|
CLONE_DISK_FROM_CLUSTER = 'clone_disk_from_cluster'
|
43
43
|
IMAGE_ID = 'image_id'
|
@@ -47,7 +47,8 @@ class CloudImplementationFeatures(enum.Enum):
|
|
47
47
|
OPEN_PORTS = 'open_ports'
|
48
48
|
STORAGE_MOUNTING = 'storage_mounting'
|
49
49
|
HOST_CONTROLLERS = 'host_controllers' # Can run jobs/serve controllers
|
50
|
-
|
50
|
+
AUTOSTOP = 'autostop' # Pod/VM can stop itself
|
51
|
+
AUTODOWN = 'autodown' # Pod/VM can down itself
|
51
52
|
|
52
53
|
|
53
54
|
# Use str, enum.Enum to allow CloudCapability to be used as a string.
|
sky/clouds/kubernetes.py
CHANGED
@@ -35,6 +35,10 @@ CREDENTIAL_PATH = os.environ.get('KUBECONFIG', DEFAULT_KUBECONFIG_PATH)
|
|
35
35
|
# E.g., FUSE device manager daemonset is run in this namespace.
|
36
36
|
_SKYPILOT_SYSTEM_NAMESPACE = 'skypilot-system'
|
37
37
|
|
38
|
+
# Shared directory to communicate with fusermount-server, refer to
|
39
|
+
# addons/fuse-proxy/README.md for more details.
|
40
|
+
_FUSERMOUNT_SHARED_DIR = '/var/run/fusermount'
|
41
|
+
|
38
42
|
|
39
43
|
@registry.CLOUD_REGISTRY.register(aliases=['k8s'])
|
40
44
|
class Kubernetes(clouds.Cloud):
|
@@ -110,9 +114,13 @@ class Kubernetes(clouds.Cloud):
|
|
110
114
|
# Controllers cannot spin up new pods with exec auth.
|
111
115
|
unsupported_features[
|
112
116
|
clouds.CloudImplementationFeatures.HOST_CONTROLLERS] = message
|
113
|
-
# Pod does not have permissions to
|
117
|
+
# Pod does not have permissions to down itself with exec auth.
|
114
118
|
unsupported_features[
|
115
|
-
clouds.CloudImplementationFeatures.
|
119
|
+
clouds.CloudImplementationFeatures.AUTODOWN] = message
|
120
|
+
unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
|
121
|
+
'Stopping clusters is not supported on Kubernetes.')
|
122
|
+
unsupported_features[clouds.CloudImplementationFeatures.AUTOSTOP] = (
|
123
|
+
'Auto-stop is not supported on Kubernetes.')
|
116
124
|
# Allow spot instances if supported by the cluster
|
117
125
|
try:
|
118
126
|
spot_label_key, _ = kubernetes_utils.get_spot_label(context)
|
@@ -551,8 +559,9 @@ class Kubernetes(clouds.Cloud):
|
|
551
559
|
'k8s_service_account_name': k8s_service_account_name,
|
552
560
|
'k8s_automount_sa_token': k8s_automount_sa_token,
|
553
561
|
'k8s_fuse_device_required': fuse_device_required,
|
554
|
-
# Namespace to run the
|
562
|
+
# Namespace to run the fusermount-server daemonset in
|
555
563
|
'k8s_skypilot_system_namespace': _SKYPILOT_SYSTEM_NAMESPACE,
|
564
|
+
'k8s_fusermount_shared_dir': _FUSERMOUNT_SHARED_DIR,
|
556
565
|
'k8s_spot_label_key': spot_label_key,
|
557
566
|
'k8s_spot_label_value': spot_label_value,
|
558
567
|
'tpu_requested': tpu_requested,
|
@@ -658,6 +667,14 @@ class Kubernetes(clouds.Cloud):
|
|
658
667
|
def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
|
659
668
|
"""Checks if the user has access credentials to
|
660
669
|
Kubernetes."""
|
670
|
+
# Check for port forward dependencies
|
671
|
+
reasons = kubernetes_utils.check_port_forward_mode_dependencies(False)
|
672
|
+
if reasons is not None:
|
673
|
+
formatted = '\n'.join(
|
674
|
+
[reasons[0]] +
|
675
|
+
[f'{cls._INDENT_PREFIX}' + r for r in reasons[1:]])
|
676
|
+
return (False, formatted)
|
677
|
+
|
661
678
|
# Test using python API
|
662
679
|
try:
|
663
680
|
existing_allowed_contexts = cls.existing_allowed_contexts()
|
sky/clouds/nebius.py
CHANGED
@@ -53,10 +53,8 @@ class Nebius(clouds.Cloud):
|
|
53
53
|
"""Nebius GPU Cloud"""
|
54
54
|
_REPR = 'Nebius'
|
55
55
|
_CLOUD_UNSUPPORTED_FEATURES = {
|
56
|
-
clouds.CloudImplementationFeatures.
|
57
|
-
('Autodown
|
58
|
-
# Autostop functionality can be implemented, but currently,
|
59
|
-
# there is only a single flag for both autostop and autodown.
|
56
|
+
clouds.CloudImplementationFeatures.AUTODOWN:
|
57
|
+
('Autodown not supported. Can\'t delete OS disk.'),
|
60
58
|
clouds.CloudImplementationFeatures.SPOT_INSTANCE:
|
61
59
|
('Spot is not supported, as Nebius API does not implement spot.'),
|
62
60
|
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
|
sky/clouds/oci.py
CHANGED
@@ -9,8 +9,8 @@ History:
|
|
9
9
|
file path resolution (by os.path.expanduser) when construct the file
|
10
10
|
mounts. This bug will cause the created workder nodes located in different
|
11
11
|
compartment and VCN than the header node if user specifies compartment_id
|
12
|
-
in the sky config file, because the ~/.sky/
|
13
|
-
remote machine.
|
12
|
+
in the sky config file, because the ~/.sky/skyconfig.yaml is not
|
13
|
+
sync-ed to the remote machine.
|
14
14
|
The workaround is set the sky config file path using ENV before running
|
15
15
|
the sky launch: export SKYPILOT_CONFIG=/home/ubuntu/.sky/config.yaml
|
16
16
|
- Hysun He (hysun.he@oracle.com) @ Oct 12, 2024:
|
sky/clouds/utils/oci_utils.py
CHANGED
@@ -147,7 +147,7 @@ class OCIConfig:
|
|
147
147
|
if config_path_via_env_var is not None:
|
148
148
|
config_path = config_path_via_env_var
|
149
149
|
else:
|
150
|
-
config_path = skypilot_config.
|
150
|
+
config_path = skypilot_config.get_user_config_path()
|
151
151
|
return config_path
|
152
152
|
|
153
153
|
@classmethod
|
sky/core.py
CHANGED
@@ -629,26 +629,21 @@ def autostop(
|
|
629
629
|
raise exceptions.NotSupportedError(
|
630
630
|
f'{operation} cluster {cluster_name!r} with backend '
|
631
631
|
f'{backend.__class__.__name__!r} is not supported.')
|
632
|
-
# Check autostop is implemented for cloud
|
633
632
|
cloud = handle.launched_resources.cloud
|
634
|
-
if
|
635
|
-
try:
|
636
|
-
cloud.check_features_are_supported(
|
637
|
-
handle.launched_resources,
|
638
|
-
{clouds.CloudImplementationFeatures.STOP})
|
639
|
-
except exceptions.NotSupportedError as e:
|
640
|
-
raise exceptions.NotSupportedError(
|
641
|
-
f'{colorama.Fore.YELLOW}Scheduling autostop on cluster '
|
642
|
-
f'{cluster_name!r}...skipped.{colorama.Style.RESET_ALL}\n'
|
643
|
-
f' {_stop_not_supported_message(handle.launched_resources)}.'
|
644
|
-
) from e
|
645
|
-
|
646
|
-
# Check if autodown is required and supported
|
633
|
+
# Check if autostop/autodown is required and supported
|
647
634
|
if not is_cancel:
|
648
635
|
try:
|
649
|
-
|
650
|
-
|
651
|
-
|
636
|
+
if down:
|
637
|
+
cloud.check_features_are_supported(
|
638
|
+
handle.launched_resources,
|
639
|
+
{clouds.CloudImplementationFeatures.AUTODOWN})
|
640
|
+
else:
|
641
|
+
cloud.check_features_are_supported(
|
642
|
+
handle.launched_resources,
|
643
|
+
{clouds.CloudImplementationFeatures.STOP})
|
644
|
+
cloud.check_features_are_supported(
|
645
|
+
handle.launched_resources,
|
646
|
+
{clouds.CloudImplementationFeatures.AUTOSTOP})
|
652
647
|
except exceptions.NotSupportedError as e:
|
653
648
|
raise exceptions.NotSupportedError(
|
654
649
|
f'{colorama.Fore.YELLOW}{operation} on cluster '
|
sky/data/mounting_utils.py
CHANGED
@@ -30,9 +30,17 @@ _BLOBFUSE_CACHE_DIR = ('~/.sky/blobfuse2_cache/'
|
|
30
30
|
# https://github.com/rclone/rclone/releases
|
31
31
|
RCLONE_VERSION = 'v1.68.2'
|
32
32
|
|
33
|
+
# A wrapper for goofys to choose the logging mechanism based on environment.
|
34
|
+
_GOOFYS_WRAPPER = ('$(if [ -S /dev/log ] ; then '
|
35
|
+
'echo "goofys"; '
|
36
|
+
'else '
|
37
|
+
'echo "goofys --log-file $(mktemp -t goofys.XXXX.log)"; '
|
38
|
+
'fi)')
|
39
|
+
|
33
40
|
|
34
41
|
def get_s3_mount_install_cmd() -> str:
|
35
42
|
"""Returns a command to install S3 mount utility goofys."""
|
43
|
+
# TODO(aylei): maintain our goofys fork under skypilot-org
|
36
44
|
install_cmd = ('ARCH=$(uname -m) && '
|
37
45
|
'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
|
38
46
|
' echo "goofys is not supported on $ARCH" && '
|
@@ -40,8 +48,8 @@ def get_s3_mount_install_cmd() -> str:
|
|
40
48
|
'else '
|
41
49
|
' ARCH_SUFFIX="amd64"; '
|
42
50
|
'fi && '
|
43
|
-
'sudo wget -nc https://github.com/
|
44
|
-
'releases/download/0.24.0-
|
51
|
+
'sudo wget -nc https://github.com/aylei/goofys/'
|
52
|
+
'releases/download/0.24.0-aylei-upstream/goofys '
|
45
53
|
'-O /usr/local/bin/goofys && '
|
46
54
|
'sudo chmod 755 /usr/local/bin/goofys')
|
47
55
|
return install_cmd
|
@@ -56,7 +64,7 @@ def get_s3_mount_cmd(bucket_name: str,
|
|
56
64
|
_bucket_sub_path = ''
|
57
65
|
else:
|
58
66
|
_bucket_sub_path = f':{_bucket_sub_path}'
|
59
|
-
mount_cmd = ('
|
67
|
+
mount_cmd = (f'{_GOOFYS_WRAPPER} -o allow_other '
|
60
68
|
f'--stat-cache-ttl {_STAT_CACHE_TTL} '
|
61
69
|
f'--type-cache-ttl {_TYPE_CACHE_TTL} '
|
62
70
|
f'{bucket_name}{_bucket_sub_path} {mount_path}')
|
@@ -73,7 +81,8 @@ def get_nebius_mount_cmd(nebius_profile_name: str,
|
|
73
81
|
_bucket_sub_path = ''
|
74
82
|
else:
|
75
83
|
_bucket_sub_path = f':{_bucket_sub_path}'
|
76
|
-
mount_cmd = (f'AWS_PROFILE={nebius_profile_name}
|
84
|
+
mount_cmd = (f'AWS_PROFILE={nebius_profile_name} {_GOOFYS_WRAPPER} '
|
85
|
+
'-o allow_other '
|
77
86
|
f'--stat-cache-ttl {_STAT_CACHE_TTL} '
|
78
87
|
f'--type-cache-ttl {_TYPE_CACHE_TTL} '
|
79
88
|
f'--endpoint {endpoint_url} '
|
@@ -185,14 +194,28 @@ def get_az_mount_cmd(container_name: str,
|
|
185
194
|
bucket_sub_path_arg = ''
|
186
195
|
else:
|
187
196
|
bucket_sub_path_arg = f'--subdirectory={_bucket_sub_path}/ '
|
197
|
+
mount_options = '-o allow_other -o default_permissions'
|
188
198
|
# TODO(zpoint): clear old cache that has been created in the previous boot.
|
199
|
+
blobfuse2_cmd = ('blobfuse2 --no-symlinks -o umask=022 '
|
200
|
+
f'--tmp-path {cache_path}_$({remote_boot_time_cmd}) '
|
201
|
+
f'{bucket_sub_path_arg}'
|
202
|
+
f'--container-name {container_name}')
|
203
|
+
# 1. Set -o nonempty to bypass empty directory check of blobfuse2 when using
|
204
|
+
# fusermount-wrapper, since the mount is delegated to fusermount and
|
205
|
+
# blobfuse2 only get the mounted fd.
|
206
|
+
# 2. {} is the mount point placeholder that will be replaced with the
|
207
|
+
# mounted fd by fusermount-wrapper.
|
208
|
+
wrapped = (f'fusermount-wrapper -m {mount_path} {mount_options} '
|
209
|
+
f'-- {blobfuse2_cmd} -o nonempty {{}}')
|
210
|
+
original = f'{blobfuse2_cmd} {mount_options} {mount_path}'
|
211
|
+
# If fusermount-wrapper is available, use it to wrap the blobfuse2 command
|
212
|
+
# to avoid requiring root privilege.
|
213
|
+
# TODO(aylei): feeling hacky, refactor this.
|
214
|
+
get_mount_cmd = ('command -v fusermount-wrapper >/dev/null 2>&1 && '
|
215
|
+
f'echo "{wrapped}" || echo "{original}"')
|
189
216
|
mount_cmd = (f'AZURE_STORAGE_ACCOUNT={storage_account_name} '
|
190
217
|
f'{key_env_var} '
|
191
|
-
f'
|
192
|
-
'-o umask=022 -o default_permissions '
|
193
|
-
f'--tmp-path {cache_path}_$({remote_boot_time_cmd}) '
|
194
|
-
f'{bucket_sub_path_arg}'
|
195
|
-
f'--container-name {container_name}')
|
218
|
+
f'$({get_mount_cmd})')
|
196
219
|
return mount_cmd
|
197
220
|
|
198
221
|
|
@@ -209,7 +232,8 @@ def get_r2_mount_cmd(r2_credentials_path: str,
|
|
209
232
|
else:
|
210
233
|
_bucket_sub_path = f':{_bucket_sub_path}'
|
211
234
|
mount_cmd = (f'AWS_SHARED_CREDENTIALS_FILE={r2_credentials_path} '
|
212
|
-
f'AWS_PROFILE={r2_profile_name}
|
235
|
+
f'AWS_PROFILE={r2_profile_name} {_GOOFYS_WRAPPER} '
|
236
|
+
'-o allow_other '
|
213
237
|
f'--stat-cache-ttl {_STAT_CACHE_TTL} '
|
214
238
|
f'--type-cache-ttl {_TYPE_CACHE_TTL} '
|
215
239
|
f'--endpoint {endpoint_url} '
|
sky/exceptions.py
CHANGED
@@ -265,7 +265,7 @@ class CommandError(SkyPilotExcludeArgsBaseException):
|
|
265
265
|
# Chunk the command to avoid overflow.
|
266
266
|
command = command[:100] + '...'
|
267
267
|
message = (f'Command {command} failed with return code '
|
268
|
-
f'{returncode}.\n{error_msg}')
|
268
|
+
f'{returncode}.\n{error_msg}\n{detailed_reason}')
|
269
269
|
super().__init__(message)
|
270
270
|
|
271
271
|
|
sky/execution.py
CHANGED
@@ -237,11 +237,12 @@ def _execute(
|
|
237
237
|
if Stage.DOWN in stages:
|
238
238
|
stages.remove(Stage.DOWN)
|
239
239
|
if idle_minutes_to_autostop >= 0:
|
240
|
-
|
241
|
-
clouds.CloudImplementationFeatures.AUTO_TERMINATE)
|
242
|
-
if not down:
|
240
|
+
if down:
|
243
241
|
requested_features.add(
|
244
|
-
clouds.CloudImplementationFeatures.
|
242
|
+
clouds.CloudImplementationFeatures.AUTODOWN)
|
243
|
+
else:
|
244
|
+
requested_features.add(
|
245
|
+
clouds.CloudImplementationFeatures.AUTOSTOP)
|
245
246
|
# NOTE: in general we may not have sufficiently specified info
|
246
247
|
# (cloud/resource) to check STOP_SPOT_INSTANCE here. This is checked in
|
247
248
|
# the backend.
|
sky/jobs/client/sdk.py
CHANGED
@@ -82,6 +82,7 @@ def launch(
|
|
82
82
|
f'{server_common.get_server_url()}/jobs/launch',
|
83
83
|
json=json.loads(body.model_dump_json()),
|
84
84
|
timeout=(5, None),
|
85
|
+
cookies=server_common.get_api_cookie_jar(),
|
85
86
|
)
|
86
87
|
return server_common.get_request_id(response)
|
87
88
|
|
@@ -138,6 +139,7 @@ def queue(refresh: bool,
|
|
138
139
|
f'{server_common.get_server_url()}/jobs/queue',
|
139
140
|
json=json.loads(body.model_dump_json()),
|
140
141
|
timeout=(5, None),
|
142
|
+
cookies=server_common.get_api_cookie_jar(),
|
141
143
|
)
|
142
144
|
return server_common.get_request_id(response=response)
|
143
145
|
|
@@ -177,6 +179,7 @@ def cancel(
|
|
177
179
|
f'{server_common.get_server_url()}/jobs/cancel',
|
178
180
|
json=json.loads(body.model_dump_json()),
|
179
181
|
timeout=(5, None),
|
182
|
+
cookies=server_common.get_api_cookie_jar(),
|
180
183
|
)
|
181
184
|
return server_common.get_request_id(response=response)
|
182
185
|
|
@@ -224,6 +227,7 @@ def tail_logs(name: Optional[str] = None,
|
|
224
227
|
json=json.loads(body.model_dump_json()),
|
225
228
|
stream=True,
|
226
229
|
timeout=(5, None),
|
230
|
+
cookies=server_common.get_api_cookie_jar(),
|
227
231
|
)
|
228
232
|
request_id = server_common.get_request_id(response)
|
229
233
|
return sdk.stream_response(request_id, response, output_stream)
|
@@ -267,6 +271,7 @@ def download_logs(
|
|
267
271
|
f'{server_common.get_server_url()}/jobs/download_logs',
|
268
272
|
json=json.loads(body.model_dump_json()),
|
269
273
|
timeout=(5, None),
|
274
|
+
cookies=server_common.get_api_cookie_jar(),
|
270
275
|
)
|
271
276
|
job_id_remote_path_dict = sdk.stream_and_get(
|
272
277
|
server_common.get_request_id(response))
|
sky/optimizer.py
CHANGED
@@ -6,6 +6,7 @@ import typing
|
|
6
6
|
from typing import Any, Dict, Iterable, List, Optional, Set, Tuple
|
7
7
|
|
8
8
|
import colorama
|
9
|
+
import numpy as np
|
9
10
|
import prettytable
|
10
11
|
|
11
12
|
from sky import check as sky_check
|
@@ -28,12 +29,10 @@ from sky.utils import ux_utils
|
|
28
29
|
|
29
30
|
if typing.TYPE_CHECKING:
|
30
31
|
import networkx as nx
|
31
|
-
import numpy as np
|
32
32
|
|
33
33
|
from sky import dag as dag_lib
|
34
34
|
else:
|
35
35
|
nx = adaptors_common.LazyImport('networkx')
|
36
|
-
np = adaptors_common.LazyImport('numpy')
|
37
36
|
|
38
37
|
logger = sky_logging.init_logger(__name__)
|
39
38
|
|
sky/provision/instance_setup.py
CHANGED
@@ -44,7 +44,9 @@ _DUMP_RAY_PORTS = (
|
|
44
44
|
|
45
45
|
_RAY_PORT_COMMAND = (
|
46
46
|
f'RAY_PORT=$({constants.SKY_PYTHON_CMD} -c '
|
47
|
-
'"from sky
|
47
|
+
'"from sky import sky_logging\n'
|
48
|
+
'with sky_logging.silent(): '
|
49
|
+
'from sky.skylet import job_lib; print(job_lib.get_ray_port())" '
|
48
50
|
'2> /dev/null || echo 6379);'
|
49
51
|
f'{constants.SKY_PYTHON_CMD} -c "from sky.utils import message_utils; '
|
50
52
|
'print(message_utils.encode_payload({\'ray_port\': $RAY_PORT}))"')
|
@@ -43,7 +43,7 @@ def bootstrap_instances(
|
|
43
43
|
if (requested_service_account ==
|
44
44
|
kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME):
|
45
45
|
# If the user has requested a different service account (via pod_config
|
46
|
-
# in ~/.sky/
|
46
|
+
# in ~/.sky/skyconfig.yaml), we assume they have already set up the
|
47
47
|
# necessary roles and role bindings.
|
48
48
|
# If not, set up the roles and bindings for skypilot-service-account
|
49
49
|
# here.
|
@@ -561,69 +561,74 @@ def _configure_skypilot_system_namespace(
|
|
561
561
|
|
562
562
|
|
563
563
|
def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None:
|
564
|
-
"""Creates
|
564
|
+
"""Creates the privileged daemonset required for FUSE mounting.
|
565
565
|
|
566
566
|
FUSE mounting in Kubernetes without privileged containers requires us to
|
567
|
-
run a
|
568
|
-
|
569
|
-
|
567
|
+
run a privileged daemonset which accepts fusermount requests via unix
|
568
|
+
domain socket and perform the mount/unmount operations on the host /dev/fuse
|
569
|
+
device.
|
570
570
|
|
571
571
|
We create this daemonset in the skypilot_system_namespace, which is
|
572
|
-
configurable in the provider config. This allows the
|
573
|
-
|
572
|
+
configurable in the provider config. This allows the daemonset to be
|
573
|
+
shared across multiple tenants. The default namespace is
|
574
574
|
'skypilot-system' (populated in clouds.Kubernetes).
|
575
|
+
|
576
|
+
For legacy smarter-device-manager daemonset, we keep it as is since it may
|
577
|
+
still be used by other tenants.
|
575
578
|
"""
|
576
579
|
|
577
|
-
logger.info(
|
580
|
+
logger.info(
|
581
|
+
'_configure_fuse_mounting: Setting up fusermount-server daemonset.')
|
578
582
|
|
579
|
-
|
583
|
+
fuse_proxy_namespace = provider_config['skypilot_system_namespace']
|
580
584
|
context = kubernetes_utils.get_context_from_config(provider_config)
|
581
585
|
|
582
|
-
# Read the
|
586
|
+
# Read the YAMLs from the manifests directory
|
583
587
|
root_dir = os.path.dirname(os.path.dirname(__file__))
|
584
588
|
|
585
|
-
# Load and create the ConfigMap
|
586
|
-
logger.info('_configure_fuse_mounting: Creating configmap.')
|
587
|
-
config_map_path = os.path.join(
|
588
|
-
root_dir, 'kubernetes/manifests/smarter-device-manager-configmap.yaml')
|
589
|
-
with open(config_map_path, 'r', encoding='utf-8') as file:
|
590
|
-
config_map = yaml.safe_load(file)
|
591
|
-
kubernetes_utils.merge_custom_metadata(config_map['metadata'])
|
592
|
-
try:
|
593
|
-
kubernetes.core_api(context).create_namespaced_config_map(
|
594
|
-
fuse_device_manager_namespace, config_map)
|
595
|
-
except kubernetes.api_exception() as e:
|
596
|
-
if e.status == 409:
|
597
|
-
logger.info('_configure_fuse_mounting: ConfigMap already exists '
|
598
|
-
f'in namespace {fuse_device_manager_namespace!r}')
|
599
|
-
else:
|
600
|
-
raise
|
601
|
-
else:
|
602
|
-
logger.info('_configure_fuse_mounting: ConfigMap created '
|
603
|
-
f'in namespace {fuse_device_manager_namespace!r}')
|
604
|
-
|
605
589
|
# Load and create the DaemonSet
|
590
|
+
# TODO(aylei): support customize and upgrade the fusermount-server image
|
606
591
|
logger.info('_configure_fuse_mounting: Creating daemonset.')
|
607
592
|
daemonset_path = os.path.join(
|
608
|
-
root_dir, 'kubernetes/manifests/
|
593
|
+
root_dir, 'kubernetes/manifests/fusermount-server-daemonset.yaml')
|
609
594
|
with open(daemonset_path, 'r', encoding='utf-8') as file:
|
610
595
|
daemonset = yaml.safe_load(file)
|
611
596
|
kubernetes_utils.merge_custom_metadata(daemonset['metadata'])
|
612
597
|
try:
|
613
598
|
kubernetes.apps_api(context).create_namespaced_daemon_set(
|
614
|
-
|
599
|
+
fuse_proxy_namespace, daemonset)
|
615
600
|
except kubernetes.api_exception() as e:
|
616
601
|
if e.status == 409:
|
617
602
|
logger.info('_configure_fuse_mounting: DaemonSet already exists '
|
618
|
-
f'in namespace {
|
603
|
+
f'in namespace {fuse_proxy_namespace!r}')
|
604
|
+
existing_ds = kubernetes.apps_api(
|
605
|
+
context).read_namespaced_daemon_set(
|
606
|
+
daemonset['metadata']['name'], fuse_proxy_namespace)
|
607
|
+
ds_image = daemonset['spec']['template']['spec']['containers'][0][
|
608
|
+
'image']
|
609
|
+
if existing_ds.spec.template.spec.containers[0].image != ds_image:
|
610
|
+
logger.info(
|
611
|
+
'_configure_fuse_mounting: Updating DaemonSet image.')
|
612
|
+
kubernetes.apps_api(context).patch_namespaced_daemon_set(
|
613
|
+
daemonset['metadata']['name'], fuse_proxy_namespace,
|
614
|
+
daemonset)
|
615
|
+
elif e.status == 403 or e.status == 401:
|
616
|
+
logger.error('SkyPilot does not have permission to create '
|
617
|
+
'fusermount-server DaemonSet in namespace '
|
618
|
+
f'{fuse_proxy_namespace!r}, Error: {e.reason}. '
|
619
|
+
'Please check the permissions of the SkyPilot service '
|
620
|
+
'account or contact your cluster admin to create the '
|
621
|
+
'DaemonSet manually. '
|
622
|
+
'Reference: https://docs.skypilot.co/reference/kubernetes/kubernetes-setup.html#kubernetes-setup-fuse') # pylint: disable=line-too-long
|
623
|
+
raise
|
619
624
|
else:
|
620
625
|
raise
|
621
626
|
else:
|
622
627
|
logger.info('_configure_fuse_mounting: DaemonSet created '
|
623
|
-
f'in namespace {
|
628
|
+
f'in namespace {fuse_proxy_namespace!r}')
|
624
629
|
|
625
|
-
logger.info('
|
626
|
-
f'in namespace {
|
630
|
+
logger.info('fusermount-server daemonset setup complete '
|
631
|
+
f'in namespace {fuse_proxy_namespace!r}')
|
627
632
|
|
628
633
|
|
629
634
|
def _configure_services(namespace: str, context: Optional[str],
|
@@ -162,12 +162,9 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
|
|
162
162
|
raise config_lib.KubernetesError(
|
163
163
|
_lack_resource_msg('memory', pod,
|
164
164
|
details=event_message))
|
165
|
-
|
166
|
-
|
167
|
-
|
168
|
-
' Try restarting your FUSE pods by running '
|
169
|
-
'`kubectl delete pods -n skypilot-system -l name=smarter-device-manager`.' # pylint: disable=line-too-long
|
170
|
-
f' Full error: {event_message}')
|
165
|
+
# TODO(aylei): after switching from smarter-device-manager to
|
166
|
+
# fusermount-server, we need a new way to check whether the
|
167
|
+
# fusermount-server daemonset is ready.
|
171
168
|
gpu_lf_keys = [
|
172
169
|
key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
|
173
170
|
for key in lf.get_label_keys()
|
@@ -723,7 +720,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
723
720
|
f'{common_utils.format_exception(e)}'
|
724
721
|
'Continuing without using nvidia RuntimeClass.\n'
|
725
722
|
'If you are on a K3s cluster, manually '
|
726
|
-
'override runtimeClassName in ~/.sky/
|
723
|
+
'override runtimeClassName in ~/.sky/skyconfig.yaml. '
|
727
724
|
'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html') # pylint: disable=line-too-long
|
728
725
|
|
729
726
|
needs_gpus = False
|
@@ -0,0 +1,54 @@
|
|
1
|
+
apiVersion: apps/v1
|
2
|
+
kind: DaemonSet
|
3
|
+
metadata:
|
4
|
+
name: fusermount-server
|
5
|
+
labels:
|
6
|
+
app: fusermount-server
|
7
|
+
role: agent
|
8
|
+
parent: skypilot
|
9
|
+
spec:
|
10
|
+
selector:
|
11
|
+
matchLabels:
|
12
|
+
app: fusermount-server
|
13
|
+
template:
|
14
|
+
metadata:
|
15
|
+
labels:
|
16
|
+
app: fusermount-server
|
17
|
+
spec:
|
18
|
+
# Add tolerations to run on all nodes
|
19
|
+
tolerations:
|
20
|
+
- operator: Exists
|
21
|
+
effect: NoSchedule
|
22
|
+
- operator: Exists
|
23
|
+
effect: NoExecute
|
24
|
+
containers:
|
25
|
+
- name: server
|
26
|
+
# TODO(aylei): version strategy of our addon images
|
27
|
+
image: berkeleyskypilot/fusermount-server:latest
|
28
|
+
securityContext:
|
29
|
+
privileged: true
|
30
|
+
volumeMounts:
|
31
|
+
- name: shared-dir
|
32
|
+
mountPath: /var/run/fusermount
|
33
|
+
env:
|
34
|
+
- name: FUSERMOUNT_SHARED_DIR
|
35
|
+
value: /var/run/fusermount
|
36
|
+
resources:
|
37
|
+
requests:
|
38
|
+
cpu: 50m
|
39
|
+
memory: 50Mi
|
40
|
+
livenessProbe:
|
41
|
+
exec:
|
42
|
+
command:
|
43
|
+
- /bin/sh
|
44
|
+
- -c
|
45
|
+
- "test -S /var/run/fusermount/server.sock"
|
46
|
+
initialDelaySeconds: 10
|
47
|
+
periodSeconds: 5
|
48
|
+
timeoutSeconds: 2
|
49
|
+
failureThreshold: 10
|
50
|
+
volumes:
|
51
|
+
- name: shared-dir
|
52
|
+
hostPath:
|
53
|
+
path: /var/run/fusermount
|
54
|
+
type: DirectoryOrCreate
|
@@ -66,7 +66,7 @@ def get_networking_mode(
|
|
66
66
|
except ValueError as e:
|
67
67
|
with ux_utils.print_exception_no_traceback():
|
68
68
|
raise ValueError(str(e) +
|
69
|
-
' Please check: ~/.sky/
|
69
|
+
' Please check: ~/.sky/skyconfig.yaml.') from None
|
70
70
|
return networking_mode
|
71
71
|
|
72
72
|
|