skypilot-nightly 1.0.0.dev20250411__py3-none-any.whl → 1.0.0.dev20250413__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/oci.py +2 -2
- sky/authentication.py +2 -2
- sky/backends/backend_utils.py +1 -1
- sky/backends/cloud_vm_ray_backend.py +3 -3
- sky/check.py +1 -1
- sky/cli.py +51 -47
- sky/client/cli.py +51 -47
- sky/client/sdk.py +2 -1
- sky/clouds/aws.py +2 -2
- sky/clouds/cloud.py +3 -2
- sky/clouds/kubernetes.py +20 -3
- sky/clouds/nebius.py +2 -4
- sky/clouds/oci.py +2 -2
- sky/clouds/utils/oci_utils.py +1 -1
- sky/core.py +12 -17
- sky/data/mounting_utils.py +34 -10
- sky/exceptions.py +1 -1
- sky/execution.py +5 -4
- sky/provision/instance_setup.py +3 -1
- sky/provision/kubernetes/config.py +41 -36
- sky/provision/kubernetes/instance.py +4 -7
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +54 -0
- sky/provision/kubernetes/network_utils.py +1 -1
- sky/provision/kubernetes/utils.py +51 -35
- sky/server/requests/payloads.py +2 -0
- sky/setup_files/dependencies.py +1 -1
- sky/skylet/constants.py +2 -2
- sky/skypilot_config.py +179 -41
- sky/templates/kubernetes-ray.yml.j2 +66 -25
- sky/templates/websocket_proxy.py +41 -2
- sky/utils/config_utils.py +1 -1
- sky/utils/controller_utils.py +1 -1
- sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
- sky/utils/kubernetes/rsync_helper.sh +26 -11
- {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/METADATA +3 -1
- {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/RECORD +41 -42
- sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml +0 -10
- sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +0 -68
- {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/top_level.txt +0 -0
sky/skypilot_config.py
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
"""Immutable user configurations (EXPERIMENTAL).
|
2
2
|
|
3
|
-
On module import, we attempt to parse the config located at
|
4
|
-
(default: ~/.sky/
|
3
|
+
On module import, we attempt to parse the config located at _USER_CONFIG_PATH
|
4
|
+
(default: ~/.sky/skyconfig.yaml). Caller can then use
|
5
5
|
|
6
6
|
>> skypilot_config.loaded()
|
7
7
|
|
@@ -35,14 +35,14 @@ Consider the following config contents:
|
|
35
35
|
|
36
36
|
then:
|
37
37
|
|
38
|
-
# Assuming ~/.sky/
|
38
|
+
# Assuming ~/.sky/skyconfig.yaml exists and can be loaded:
|
39
39
|
skypilot_config.loaded() # ==> True
|
40
40
|
|
41
41
|
skypilot_config.get_nested(('a', 'nested'), None) # ==> 1
|
42
42
|
skypilot_config.get_nested(('a', 'nonexist'), None) # ==> None
|
43
43
|
skypilot_config.get_nested(('a',), None) # ==> {'nested': 1}
|
44
44
|
|
45
|
-
# If ~/.sky/
|
45
|
+
# If ~/.sky/skyconfig.yaml doesn't exist or failed to be loaded:
|
46
46
|
skypilot_config.loaded() # ==> False
|
47
47
|
skypilot_config.get_nested(('a', 'nested'), None) # ==> None
|
48
48
|
skypilot_config.get_nested(('a', 'nonexist'), None) # ==> None
|
@@ -71,22 +71,38 @@ else:
|
|
71
71
|
|
72
72
|
logger = sky_logging.init_logger(__name__)
|
73
73
|
|
74
|
-
# The config
|
74
|
+
# The config is generated as described below:
|
75
75
|
#
|
76
|
-
# (
|
77
|
-
# path
|
78
|
-
#
|
76
|
+
# (*) (Used internally) If env var {ENV_VAR_SKYPILOT_CONFIG} exists, use its
|
77
|
+
# path as the config file. Do not use any other config files.
|
78
|
+
# This behavior is subject to change and should not be relied on by users.
|
79
|
+
# Else,
|
80
|
+
# (1) If env var {ENV_VAR_USER_CONFIG} exists, use its path as the user
|
81
|
+
# config file. Else, use the default path {_USER_CONFIG_PATH}.
|
82
|
+
# (2) If env var {ENV_VAR_PROJECT_CONFIG} exists, use its path as the project
|
83
|
+
# config file. Else, use the default path {_PROJECT_CONFIG_PATH}.
|
84
|
+
# (3) Override any config keys in (1) with the ones in (2).
|
85
|
+
# (4) Validate the final config.
|
79
86
|
#
|
80
|
-
#
|
81
|
-
#
|
87
|
+
# (*) is used internally to implement the behavior of the jobs controller.
|
88
|
+
# It is not intended to be used by end users.
|
89
|
+
# (1) and (2) are used by end users to set non-default user and project config
|
90
|
+
# files on clients.
|
82
91
|
|
83
92
|
# (Used internally) An env var holding the path to the local config file. This
|
84
93
|
# is only used by jobs controller tasks to ensure recoveries of the same job
|
85
94
|
# use the same config file.
|
86
95
|
ENV_VAR_SKYPILOT_CONFIG = f'{constants.SKYPILOT_ENV_VAR_PREFIX}CONFIG'
|
87
96
|
|
88
|
-
#
|
89
|
-
|
97
|
+
# (Used by users) Environment variables for setting non-default user and
|
98
|
+
# project config files on clients.
|
99
|
+
ENV_VAR_USER_CONFIG = f'{constants.SKYPILOT_ENV_VAR_PREFIX}USER_CONFIG'
|
100
|
+
ENV_VAR_PROJECT_CONFIG = f'{constants.SKYPILOT_ENV_VAR_PREFIX}PROJECT_CONFIG'
|
101
|
+
|
102
|
+
# Path to the local config files.
|
103
|
+
_LEGACY_USER_CONFIG_PATH = '~/.sky/config.yaml'
|
104
|
+
_USER_CONFIG_PATH = '~/.sky/skyconfig.yaml'
|
105
|
+
_PROJECT_CONFIG_PATH = 'skyconfig.yaml'
|
90
106
|
|
91
107
|
# The loaded config.
|
92
108
|
_dict = config_utils.Config()
|
@@ -94,6 +110,23 @@ _loaded_config_path: Optional[str] = None
|
|
94
110
|
_config_overridden: bool = False
|
95
111
|
|
96
112
|
|
113
|
+
# This function exists solely to maintain backward compatibility with the
|
114
|
+
# legacy user config file located at ~/.sky/config.yaml.
|
115
|
+
def get_user_config_path() -> str:
    """Pick the user-level config path, honoring the legacy location.

    The legacy path (~/.sky/config.yaml) is chosen only when that file is
    present on disk and the new file (~/.sky/skyconfig.yaml) is absent. In
    every other case the new path is chosen.

    Returns:
        The path constant of the chosen file, with '~' left unexpanded.
    """
    # If the new file is there, it always wins, whatever the legacy state is.
    if os.path.exists(os.path.expanduser(_USER_CONFIG_PATH)):
        return _USER_CONFIG_PATH
    # New file is absent: fall back to the legacy file only if it exists.
    legacy_present = os.path.exists(
        os.path.expanduser(_LEGACY_USER_CONFIG_PATH))
    return _LEGACY_USER_CONFIG_PATH if legacy_present else _USER_CONFIG_PATH
|
128
|
+
|
129
|
+
|
97
130
|
def get_nested(keys: Tuple[str, ...],
|
98
131
|
default_value: Any,
|
99
132
|
override_configs: Optional[Dict[str, Any]] = None) -> Any:
|
@@ -137,44 +170,149 @@ def to_dict() -> config_utils.Config:
|
|
137
170
|
return copy.deepcopy(_dict)
|
138
171
|
|
139
172
|
|
173
|
+
def _get_config_file_path(envvar: str) -> Optional[str]:
|
174
|
+
config_path_via_env_var = os.environ.get(envvar)
|
175
|
+
if config_path_via_env_var is not None:
|
176
|
+
return os.path.expanduser(config_path_via_env_var)
|
177
|
+
return None
|
178
|
+
|
179
|
+
|
180
|
+
def _validate_config(config: Dict[str, Any], config_path: str) -> None:
    """Check `config` against the SkyPilot config schema.

    Args:
        config: Parsed config contents to validate.
        config_path: Path the config was read from. It is only used to build
            the prefix of the validation error message.
    """
    error_prefix = (
        f'Invalid config YAML ({config_path}). See: '
        'https://docs.skypilot.co/en/latest/reference/config.html. '  # pylint: disable=line-too-long
        'Error: ')
    config_schema = schemas.get_config_schema()
    common_utils.validate_schema(config,
                                 config_schema,
                                 error_prefix,
                                 skip_none=False)
|
189
|
+
|
190
|
+
|
191
|
+
def _overlay_skypilot_config(
        original_config: Optional[config_utils.Config],
        override_configs: Optional[config_utils.Config]) -> config_utils.Config:
    """Return `original_config` with `override_configs` layered on top.

    Args:
        original_config: The base config. None is treated as an empty config.
        override_configs: The config whose values take part in the override.

    Returns:
        The result of `Config.get_nested` on the base config for the empty
        key tuple, with no allow/deny restriction on override keys.
    """
    base = (config_utils.Config()
            if original_config is None else original_config)
    # NOTE(review): presumably an empty key tuple makes get_nested return the
    # whole (merged) config rather than a sub-tree -- confirm in config_utils.
    return base.get_nested(keys=tuple(),
                           default_value=None,
                           override_configs=override_configs,
                           allowed_override_keys=None,
                           disallowed_override_keys=None)
|
203
|
+
|
204
|
+
|
140
205
|
def _reload_config() -> None:
    """Reload the global config from whichever source applies.

    When the env var named by ENV_VAR_SKYPILOT_CONFIG is set, the config is
    loaded from that single file. Otherwise the hierarchical (user + project)
    loading is used.
    """
    internal_config_path = os.environ.get(ENV_VAR_SKYPILOT_CONFIG)
    if internal_config_path is None:
        _reload_config_hierarchical()
    else:
        # {ENV_VAR_SKYPILOT_CONFIG} is used internally.
        # When this environment variable is set, the config loading
        # behavior is not defined in the public interface.
        # SkyPilot reserves the right to change the config loading behavior
        # at any time when this environment variable is set.
        _reload_config_from_internal_file(internal_config_path)
|
217
|
+
|
218
|
+
|
219
|
+
def _parse_config_file(config_path: str) -> config_utils.Config:
    """Read the YAML config at `config_path` and validate it.

    Args:
        config_path: Path of the YAML file to read.

    Returns:
        The parsed config. When the file is not valid YAML the error is
        logged (not raised) and an empty config is returned.
    """
    config = config_utils.Config()
    try:
        config_dict = common_utils.read_yaml(config_path)
        config = config_utils.Config.from_dict(config_dict)
        logger.debug(
            f'Config loaded from {config_path}:\n{pprint.pformat(config)}')
    except yaml.YAMLError as e:
        # The exception must be interpolated into the message itself. Passing
        # it as an extra positional argument makes logging treat it as a
        # %-format argument; with no placeholder in the message, logging
        # reports a formatting error and the YAML error is never shown.
        logger.error(f'Error in loading config file ({config_path}): {e}')
        # Nothing was parsed, so there is nothing to validate, and claiming
        # that the syntax check passed would be misleading.
        return config
    if config:
        _validate_config(config, config_path)

    logger.debug(f'Config syntax check passed for path: {config_path}')
    return config
|
233
|
+
|
234
|
+
|
235
|
+
def _reload_config_from_internal_file(internal_config_path: str) -> None:
    """Load the global config from `internal_config_path` only.

    Args:
        internal_config_path: Path of the config file; a leading '~' is
            expanded.

    Raises:
        FileNotFoundError: If the expanded path does not exist.
    """
    global _dict, _loaded_config_path
    # Start from a clean slate so that a failure below cannot leave values
    # from a previous load behind.
    _dict = config_utils.Config()
    _loaded_config_path = None

    config_path = os.path.expanduser(internal_config_path)
    config_file_missing = not os.path.exists(config_path)
    if config_file_missing:
        with ux_utils.print_exception_no_traceback():
            missing_file_message = (
                'Config file specified by env var '
                f'{ENV_VAR_SKYPILOT_CONFIG} ({config_path!r}) does not '
                'exist. Please double check the path or unset the env var: '
                f'unset {ENV_VAR_SKYPILOT_CONFIG}')
            raise FileNotFoundError(missing_file_message)
    logger.debug(f'Using config path: {config_path}')
    parsed_config = _parse_config_file(config_path)
    _dict = parsed_config
    _loaded_config_path = config_path
|
252
|
+
|
253
|
+
|
254
|
+
def _reload_config_hierarchical() -> None:
    """Rebuild the global config from the user and project config files.

    Steps:
      1. User config: the path in the env var named by ENV_VAR_USER_CONFIG
         if set, else the path returned by get_user_config_path().
      2. Project config: the path in the env var named by
         ENV_VAR_PROJECT_CONFIG if set, else _PROJECT_CONFIG_PATH.
      3. Each file that exists is parsed and validated; the project config
         is overlaid on top of the user config.

    A default path that does not exist is silently skipped; a path given via
    an env var must exist.

    Raises:
        FileNotFoundError: If either env var points to a path that does not
            exist.
    """
    global _dict, _loaded_config_path
    # Reset the global variables, to avoid using stale values. The loaded
    # path is only ever set by the internal single-file loader, so clear it
    # here instead of keeping a path left over from an earlier internal load.
    _dict = config_utils.Config()
    _loaded_config_path = None

    # find the user config file
    user_config_path = _get_config_file_path(ENV_VAR_USER_CONFIG)
    if user_config_path:
        logger.debug('using user config file specified by '
                     f'{ENV_VAR_USER_CONFIG}: {user_config_path}')
        user_config_path = os.path.expanduser(user_config_path)
        if not os.path.exists(user_config_path):
            with ux_utils.print_exception_no_traceback():
                raise FileNotFoundError(
                    'Config file specified by env var '
                    f'{ENV_VAR_USER_CONFIG} ({user_config_path!r}) '
                    'does not exist. Please double check the path or unset the '
                    f'env var: unset {ENV_VAR_USER_CONFIG}')
    else:
        user_config_path = get_user_config_path()
        logger.debug(f'using default user config file: {user_config_path}')
        user_config_path = os.path.expanduser(user_config_path)

    # Configs in increasing priority: later entries override earlier ones.
    overrides = []

    # find the project config file
    project_config_path = _get_config_file_path(ENV_VAR_PROJECT_CONFIG)
    if project_config_path:
        logger.debug('using project config file specified by '
                     f'{ENV_VAR_PROJECT_CONFIG}: {project_config_path}')
        project_config_path = os.path.expanduser(project_config_path)
        if not os.path.exists(project_config_path):
            with ux_utils.print_exception_no_traceback():
                raise FileNotFoundError(
                    'Config file specified by env var '
                    f'{ENV_VAR_PROJECT_CONFIG} ({project_config_path!r}) '
                    'does not exist. Please double check the path or unset the '
                    f'env var: unset {ENV_VAR_PROJECT_CONFIG}')
    else:
        logger.debug(
            f'using default project config file: {_PROJECT_CONFIG_PATH}')
        project_config_path = _PROJECT_CONFIG_PATH
        project_config_path = os.path.expanduser(project_config_path)

    # load the user config file
    if os.path.exists(user_config_path):
        user_config = _parse_config_file(user_config_path)
        _validate_config(user_config, user_config_path)
        overrides.append(user_config)

    # load the project config file; appended last so it takes priority
    if os.path.exists(project_config_path):
        project_config = _parse_config_file(project_config_path)
        _validate_config(project_config, project_config_path)
        overrides.append(project_config)

    # layer the configs on top of each other based on priority
    overlaid_client_config: config_utils.Config = config_utils.Config()
    for override in overrides:
        overlaid_client_config = _overlay_skypilot_config(
            original_config=overlaid_client_config, override_configs=override)
    logger.debug(f'final config: {overlaid_client_config}')
    _dict = overlaid_client_config
|
178
316
|
|
179
317
|
|
180
318
|
def loaded_config_path() -> Optional[str]:
|
@@ -216,7 +354,7 @@ def override_skypilot_config(
|
|
216
354
|
common_utils.validate_schema(
|
217
355
|
config,
|
218
356
|
schemas.get_config_schema(),
|
219
|
-
|
357
|
+
'Invalid config. See: '
|
220
358
|
'https://docs.skypilot.co/en/latest/reference/config.html. ' # pylint: disable=line-too-long
|
221
359
|
'Error: ',
|
222
360
|
skip_none=False)
|
@@ -267,11 +267,6 @@ available_node_types:
|
|
267
267
|
{%- for label_key, label_value in labels.items() %}
|
268
268
|
{{ label_key }}: {{ label_value|tojson }}
|
269
269
|
{%- endfor %}
|
270
|
-
{% if k8s_fuse_device_required %}
|
271
|
-
annotations:
|
272
|
-
# Required for FUSE mounting to access /dev/fuse
|
273
|
-
container.apparmor.security.beta.kubernetes.io/ray-node: unconfined
|
274
|
-
{% endif %}
|
275
270
|
spec:
|
276
271
|
# serviceAccountName: skypilot-service-account
|
277
272
|
serviceAccountName: {{k8s_service_account_name}}
|
@@ -310,9 +305,12 @@ available_node_types:
|
|
310
305
|
- name: dshm
|
311
306
|
emptyDir:
|
312
307
|
medium: Memory
|
313
|
-
|
308
|
+
{% if k8s_fuse_device_required %}
|
309
|
+
- name: fusermount-shared-dir
|
314
310
|
hostPath:
|
315
|
-
path:
|
311
|
+
path: {{k8s_fusermount_shared_dir}}
|
312
|
+
type: DirectoryOrCreate
|
313
|
+
{% endif %}
|
316
314
|
containers:
|
317
315
|
- name: ray-node
|
318
316
|
imagePullPolicy: IfNotPresent
|
@@ -326,6 +324,10 @@ available_node_types:
|
|
326
324
|
- name: {{ key }}
|
327
325
|
value: {{ value }}
|
328
326
|
{% endfor %}
|
327
|
+
{% if k8s_fuse_device_required %}
|
328
|
+
- name: FUSERMOUNT_SHARED_DIR
|
329
|
+
value: {{k8s_fusermount_shared_dir}}
|
330
|
+
{% endif %}
|
329
331
|
# Do not change this command - it keeps the pod alive until it is
|
330
332
|
# explicitly killed.
|
331
333
|
command: ["/bin/bash", "-c", "--"]
|
@@ -350,11 +352,14 @@ available_node_types:
|
|
350
352
|
(
|
351
353
|
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > /tmp/apt-update.log 2>&1 || \
|
352
354
|
echo "Warning: apt-get update failed. Continuing anyway..." >> /tmp/apt-update.log
|
353
|
-
|
355
|
+
# Install both fuse2 and fuse3 for compatibility for all possible fuse adapters in advance,
|
356
|
+
# so that both fusermount and fusermount3 can be masked before enabling SSH access.
|
357
|
+
PACKAGES="rsync curl wget netcat gcc patch pciutils fuse fuse3 openssh-server";
|
354
358
|
|
355
359
|
# Separate packages into two groups: packages that are installed first
|
356
|
-
# so that curl and
|
360
|
+
# so that curl, rsync and wget are available sooner to unblock the following
|
357
361
|
# conda installation and rsync.
|
362
|
+
# Also, we install fuse first to avoid conflicts with fuse3.
|
358
363
|
set -e
|
359
364
|
INSTALL_FIRST="";
|
360
365
|
MISSING_PACKAGES="";
|
@@ -364,7 +369,7 @@ available_node_types:
|
|
364
369
|
INSTALL_FIRST="$INSTALL_FIRST netcat-openbsd";
|
365
370
|
fi
|
366
371
|
elif ! dpkg -l | grep -q "^ii $pkg "; then
|
367
|
-
if [ "$pkg" == "curl" ] || [ "$pkg" == "rsync" ]; then
|
372
|
+
if [ "$pkg" == "curl" ] || [ "$pkg" == "rsync" ] || [ "$pkg" == "fuse" ] || [ "$pkg" == "wget" ]; then
|
368
373
|
INSTALL_FIRST="$INSTALL_FIRST $pkg";
|
369
374
|
else
|
370
375
|
MISSING_PACKAGES="$MISSING_PACKAGES $pkg";
|
@@ -382,7 +387,52 @@ available_node_types:
|
|
382
387
|
echo "Installing missing packages: $MISSING_PACKAGES";
|
383
388
|
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $MISSING_PACKAGES;
|
384
389
|
fi;
|
385
|
-
|
390
|
+
|
391
|
+
{% if k8s_fuse_device_required %}
|
392
|
+
set -e
|
393
|
+
# Mask fusermount binary before enabling SSH access
|
394
|
+
FUSERMOUNT_PATH=$(which fusermount)
|
395
|
+
if [ -z "$FUSERMOUNT_PATH" ]; then
|
396
|
+
echo "Error: fusermount binary not found"
|
397
|
+
exit 1
|
398
|
+
fi
|
399
|
+
$(prefix_cmd) cp -p "$FUSERMOUNT_PATH" "${FUSERMOUNT_PATH}-original"
|
400
|
+
$(prefix_cmd) ln -sf {{k8s_fusermount_shared_dir}}/fusermount-shim "$FUSERMOUNT_PATH"
|
401
|
+
FUSERMOUNT3_PATH=$(which fusermount3)
|
402
|
+
if [ -z "$FUSERMOUNT3_PATH" ]; then
|
403
|
+
FUSERMOUNT3_PATH="${FUSERMOUNT_PATH}3"
|
404
|
+
fi
|
405
|
+
# Also mask fusermount3 for rclone and blobfuse2 (for unmount operation)
|
406
|
+
$(prefix_cmd) ln -sf "$FUSERMOUNT_PATH" "$FUSERMOUNT3_PATH"
|
407
|
+
# Add fusermount-wrapper to handle adapters that use libfuse directly, e.g. blobfuse2
|
408
|
+
$(prefix_cmd) ln -sf {{k8s_fusermount_shared_dir}}/fusermount-wrapper /bin/fusermount-wrapper
|
409
|
+
# Wait for the server to setup the fusermount shim binary in case:
|
410
|
+
# 1. The server daemonset was just deployed and is still starting up.
|
411
|
+
# 2. The node was just started and the server Pod is still starting up.
|
412
|
+
wait_for_fusermount() {
|
413
|
+
local timeout=60
|
414
|
+
local start_time=$(date +%s)
|
415
|
+
while ! command -v fusermount >/dev/null 2>&1; do
|
416
|
+
current_time=$(date +%s)
|
417
|
+
elapsed=$((current_time - start_time))
|
418
|
+
if [ $elapsed -ge $timeout ]; then
|
419
|
+
echo "Error: fusermount not ready after $timeout seconds"
|
420
|
+
exit 1
|
421
|
+
fi
|
422
|
+
sleep 1
|
423
|
+
done
|
424
|
+
}
|
425
|
+
wait_for_fusermount
|
426
|
+
# Some distributions may mount hostPath with noexec, copy the binary in this case.
|
427
|
+
if ! fusermount -V; then
|
428
|
+
echo "fusermount -V failed, copying fusermount-shim directly"
|
429
|
+
$(prefix_cmd) rm -f "$FUSERMOUNT_PATH"
|
430
|
+
$(prefix_cmd) cp -p {{k8s_fusermount_shared_dir}}/fusermount-shim "$FUSERMOUNT_PATH"
|
431
|
+
$(prefix_cmd) rm -f /bin/fusermount-wrapper
|
432
|
+
$(prefix_cmd) cp -p {{k8s_fusermount_shared_dir}}/fusermount-wrapper /bin/fusermount-wrapper
|
433
|
+
fi
|
434
|
+
{% endif %}
|
435
|
+
|
386
436
|
$(prefix_cmd) mkdir -p /var/run/sshd;
|
387
437
|
$(prefix_cmd) sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config;
|
388
438
|
$(prefix_cmd) sed "s@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g" -i /etc/pam.d/sshd;
|
@@ -394,6 +444,7 @@ available_node_types:
|
|
394
444
|
$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys;
|
395
445
|
$(prefix_cmd) service ssh restart;
|
396
446
|
$(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
|
447
|
+
|
397
448
|
) > /tmp/${STEPS[0]}.log 2>&1 || {
|
398
449
|
echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed
|
399
450
|
cat /tmp/${STEPS[0]}.log
|
@@ -539,10 +590,8 @@ available_node_types:
|
|
539
590
|
- mountPath: /dev/shm
|
540
591
|
name: dshm
|
541
592
|
{% if k8s_fuse_device_required %}
|
542
|
-
|
543
|
-
|
544
|
-
add:
|
545
|
-
- "SYS_ADMIN"
|
593
|
+
- name: fusermount-shared-dir
|
594
|
+
mountPath: {{k8s_fusermount_shared_dir}}
|
546
595
|
{% endif %}
|
547
596
|
resources:
|
548
597
|
requests:
|
@@ -556,20 +605,12 @@ available_node_types:
|
|
556
605
|
# https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work
|
557
606
|
{{k8s_resource_key}}: {{accelerator_count}}
|
558
607
|
{% endif %}
|
559
|
-
|
560
|
-
# Kubernetes resource exposed by the fuse device manager
|
561
|
-
# https://gitlab.com/arm-research/smarter/smarter-device-manager
|
562
|
-
smarter-devices/fuse: "1"
|
563
|
-
{% endif %}
|
564
|
-
{% if k8s_resource_key is not none or k8s_fuse_device_required %}
|
608
|
+
{% if k8s_resource_key is not none %}
|
565
609
|
limits:
|
566
610
|
# Limits need to be defined for GPU/TPU requests
|
567
611
|
{% if k8s_resource_key is not none %}
|
568
612
|
{{k8s_resource_key}}: {{accelerator_count}}
|
569
613
|
{% endif %}
|
570
|
-
{% if k8s_fuse_device_required %}
|
571
|
-
smarter-devices/fuse: "1"
|
572
|
-
{% endif %}
|
573
614
|
{% endif %}
|
574
615
|
|
575
616
|
setup_commands:
|
@@ -578,7 +619,7 @@ setup_commands:
|
|
578
619
|
# Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
|
579
620
|
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
|
580
621
|
# Line 'mkdir -p ..': disable host key check
|
581
|
-
# Line '
|
622
|
+
# Line '[-f /etc/fuse.conf] ..': enable `-o allow_other` option for `goofys`
|
582
623
|
# Line 'for step in ..': check if any failure indicator exists for the setup done in pod args and print the error message. This is only a best effort, as the
|
583
624
|
# commands in pod args are asynchronous and we cannot guarantee the failure indicators are created before the setup commands finish.
|
584
625
|
- |
|
sky/templates/websocket_proxy.py
CHANGED
@@ -1,17 +1,46 @@
|
|
1
1
|
#!/usr/bin/env python3
|
2
|
+
# /// script
|
3
|
+
# dependencies = [
|
4
|
+
# "websockets>=14.0",
|
5
|
+
# ]
|
6
|
+
# ///
|
2
7
|
"""Starting a websocket with SkyPilot API server to proxy SSH to a k8s pod.
|
3
8
|
|
4
9
|
This script is useful for users who do not have local Kubernetes credentials.
|
5
10
|
"""
|
6
11
|
import asyncio
|
12
|
+
from http.cookiejar import MozillaCookieJar
|
7
13
|
import os
|
8
14
|
import sys
|
15
|
+
from typing import Dict
|
16
|
+
from urllib.request import Request
|
9
17
|
|
10
18
|
import websockets
|
19
|
+
from websockets.asyncio.client import connect
|
20
|
+
|
21
|
+
|
22
|
+
def _get_cookie_header(url: str) -> Dict[str, str]:
|
23
|
+
"""Extract Cookie header value from a cookie jar for a specific URL"""
|
24
|
+
cookie_path = os.environ.get('SKYPILOT_API_COOKIE_FILE')
|
25
|
+
if cookie_path is None:
|
26
|
+
return {}
|
27
|
+
|
28
|
+
request = Request(url)
|
29
|
+
cookie_jar = MozillaCookieJar(os.path.expanduser(cookie_path))
|
30
|
+
cookie_jar.load(ignore_discard=True, ignore_expires=True)
|
31
|
+
cookie_jar.add_cookie_header(request)
|
32
|
+
cookie_header = request.get_header('Cookie')
|
33
|
+
# if cookie file is empty, return empty dict
|
34
|
+
if cookie_header is None:
|
35
|
+
return {}
|
36
|
+
return {'Cookie': cookie_header}
|
11
37
|
|
12
38
|
|
13
39
|
async def main(url: str) -> None:
|
14
|
-
|
40
|
+
cookie_header = _get_cookie_header(url)
|
41
|
+
async with connect(url,
|
42
|
+
ping_interval=None,
|
43
|
+
additional_headers=cookie_header) as websocket:
|
15
44
|
if os.isatty(sys.stdin.fileno()):
|
16
45
|
# pylint: disable=import-outside-toplevel
|
17
46
|
import termios
|
@@ -59,6 +88,16 @@ async def websocket_to_stdout(websocket):
|
|
59
88
|
|
60
89
|
if __name__ == '__main__':
    # Usage: websocket_proxy.py <api-server-url> <cluster-name>
    server_url = sys.argv[1].strip('/')
    if '://' not in server_url:
        # Keep backward compatibility for legacy server URLs without protocol
        # TODO(aylei): Remove this after 0.10.0
        server_url = f'http://{server_url}'

    # Split on the first '://' only: a URL that contains another '://'
    # further on (e.g. inside a query string) would otherwise produce more
    # than two parts and make the unpacking raise ValueError.
    server_proto, server_fqdn = server_url.split('://', 1)
    # https servers are reached over a TLS websocket; everything else uses
    # a plain websocket.
    websocket_proto = 'wss' if server_proto == 'https' else 'ws'
    server_url = f'{websocket_proto}://{server_fqdn}'
    websocket_url = (f'{server_url}/kubernetes-pod-ssh-proxy'
                     f'?cluster_name={sys.argv[2]}')
    asyncio.run(main(websocket_url))
|
sky/utils/config_utils.py
CHANGED
sky/utils/controller_utils.py
CHANGED
@@ -46,7 +46,7 @@ logger = sky_logging.init_logger(__name__)
|
|
46
46
|
# controller resources spec.
|
47
47
|
CONTROLLER_RESOURCES_NOT_VALID_MESSAGE = (
|
48
48
|
'{controller_type} controller resources is not valid, please check '
|
49
|
-
'~/.sky/
|
49
|
+
'~/.sky/skyconfig.yaml file and make sure '
|
50
50
|
'{controller_type}.controller.resources is a valid resources spec. '
|
51
51
|
'Details:\n {err}')
|
52
52
|
|
@@ -328,9 +328,9 @@ cp kubeconfig ~/.kube/config
|
|
328
328
|
# Verify that you can access the cluster
|
329
329
|
kubectl get pods
|
330
330
|
|
331
|
-
Also add this to your ~/.sky/
|
331
|
+
Also add this to your ~/.sky/skyconfig.yaml to use the new service account:
|
332
332
|
|
333
|
-
# ~/.sky/
|
333
|
+
# ~/.sky/skyconfig.yaml
|
334
334
|
kubernetes:
|
335
335
|
remote_identity: ${SKYPILOT_SA}
|
336
336
|
"
|
@@ -1,20 +1,35 @@
|
|
1
|
-
#
|
2
|
-
# We need to split the pod@namespace+context into pod, namespace and context
|
1
|
+
# We need to determine the pod, namespace and context from the args
|
3
2
|
# For backward compatibility, we use + as the separator between namespace and context and add handling when context is not provided
|
4
|
-
|
5
|
-
pod
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
3
|
+
if [ "$1" = "-l" ]; then
|
4
|
+
# -l pod namespace+context ...
|
5
|
+
# used by normal rsync
|
6
|
+
shift
|
7
|
+
pod=$1
|
8
|
+
shift
|
9
|
+
encoded_namespace_context=$1
|
10
|
+
shift
|
11
|
+
echo "pod: $pod" >&2
|
12
|
+
# Revert the encoded namespace+context to the original string.
|
13
|
+
namespace_context=$(echo "$encoded_namespace_context" | sed 's|%40|@|g' | sed 's|%3A|:|g' | sed 's|%2B|+|g' | sed 's|%2F|/|g')
|
14
|
+
echo "namespace_context: $namespace_context" >&2
|
15
|
+
else
|
16
|
+
# pod@namespace+context ...
|
17
|
+
# used by openrsync
|
18
|
+
encoded_pod_namespace_context=$1
|
19
|
+
shift
|
20
|
+
pod_namespace_context=$(echo "$encoded_pod_namespace_context" | sed 's|%40|@|g' | sed 's|%3A|:|g' | sed 's|%2B|+|g' | sed 's|%2F|/|g')
|
21
|
+
echo "pod_namespace_context: $pod_namespace_context" >&2
|
22
|
+
pod=$(echo $pod_namespace_context | cut -d@ -f1)
|
23
|
+
echo "pod: $pod" >&2
|
24
|
+
namespace_context=$(echo $pod_namespace_context | cut -d@ -f2-)
|
25
|
+
echo "namespace_context: $namespace_context" >&2
|
26
|
+
fi
|
12
27
|
namespace=$(echo $namespace_context | cut -d+ -f1)
|
13
28
|
echo "namespace: $namespace" >&2
|
14
29
|
context=$(echo $namespace_context | grep '+' >/dev/null && echo $namespace_context | cut -d+ -f2- || echo "")
|
15
30
|
echo "context: $context" >&2
|
16
31
|
context_lower=$(echo "$context" | tr '[:upper:]' '[:lower:]')
|
17
|
-
|
32
|
+
|
18
33
|
if [ -z "$context" ] || [ "$context_lower" = "none" ]; then
|
19
34
|
# If context is none, it means we are using incluster auth. In this case,
|
20
35
|
# we need to set KUBECONFIG to /dev/null to avoid using kubeconfig file.
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: skypilot-nightly
|
3
|
-
Version: 1.0.0.
|
3
|
+
Version: 1.0.0.dev20250413
|
4
4
|
Summary: SkyPilot: An intercloud broker for the clouds
|
5
5
|
Author: SkyPilot Team
|
6
6
|
License: Apache 2.0
|
@@ -88,6 +88,7 @@ Requires-Dist: oci; extra == "oci"
|
|
88
88
|
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "oci"
|
89
89
|
Provides-Extra: kubernetes
|
90
90
|
Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "kubernetes"
|
91
|
+
Requires-Dist: websockets; extra == "kubernetes"
|
91
92
|
Provides-Extra: remote
|
92
93
|
Requires-Dist: grpcio!=1.48.0,>=1.32.0; python_version < "3.10" and extra == "remote"
|
93
94
|
Requires-Dist: grpcio!=1.48.0,>=1.42.0; python_version >= "3.10" and extra == "remote"
|
@@ -145,6 +146,7 @@ Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
|
|
145
146
|
Requires-Dist: oci; extra == "all"
|
146
147
|
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
|
147
148
|
Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
|
149
|
+
Requires-Dist: websockets; extra == "all"
|
148
150
|
Requires-Dist: grpcio!=1.48.0,>=1.32.0; python_version < "3.10" and extra == "all"
|
149
151
|
Requires-Dist: grpcio!=1.48.0,>=1.42.0; python_version >= "3.10" and extra == "all"
|
150
152
|
Requires-Dist: protobuf!=3.19.5,>=3.15.3; extra == "all"
|