skypilot-nightly 1.0.0.dev20250411__py3-none-any.whl → 1.0.0.dev20250413__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. sky/__init__.py +2 -2
  2. sky/adaptors/oci.py +2 -2
  3. sky/authentication.py +2 -2
  4. sky/backends/backend_utils.py +1 -1
  5. sky/backends/cloud_vm_ray_backend.py +3 -3
  6. sky/check.py +1 -1
  7. sky/cli.py +51 -47
  8. sky/client/cli.py +51 -47
  9. sky/client/sdk.py +2 -1
  10. sky/clouds/aws.py +2 -2
  11. sky/clouds/cloud.py +3 -2
  12. sky/clouds/kubernetes.py +20 -3
  13. sky/clouds/nebius.py +2 -4
  14. sky/clouds/oci.py +2 -2
  15. sky/clouds/utils/oci_utils.py +1 -1
  16. sky/core.py +12 -17
  17. sky/data/mounting_utils.py +34 -10
  18. sky/exceptions.py +1 -1
  19. sky/execution.py +5 -4
  20. sky/provision/instance_setup.py +3 -1
  21. sky/provision/kubernetes/config.py +41 -36
  22. sky/provision/kubernetes/instance.py +4 -7
  23. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +54 -0
  24. sky/provision/kubernetes/network_utils.py +1 -1
  25. sky/provision/kubernetes/utils.py +51 -35
  26. sky/server/requests/payloads.py +2 -0
  27. sky/setup_files/dependencies.py +1 -1
  28. sky/skylet/constants.py +2 -2
  29. sky/skypilot_config.py +179 -41
  30. sky/templates/kubernetes-ray.yml.j2 +66 -25
  31. sky/templates/websocket_proxy.py +41 -2
  32. sky/utils/config_utils.py +1 -1
  33. sky/utils/controller_utils.py +1 -1
  34. sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
  35. sky/utils/kubernetes/rsync_helper.sh +26 -11
  36. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/METADATA +3 -1
  37. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/RECORD +41 -42
  38. sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml +0 -10
  39. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +0 -68
  40. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/WHEEL +0 -0
  41. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/entry_points.txt +0 -0
  42. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/licenses/LICENSE +0 -0
  43. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/top_level.txt +0 -0
sky/skypilot_config.py CHANGED
@@ -1,7 +1,7 @@
1
1
  """Immutable user configurations (EXPERIMENTAL).
2
2
 
3
- On module import, we attempt to parse the config located at CONFIG_PATH
4
- (default: ~/.sky/config.yaml). Caller can then use
3
+ On module import, we attempt to parse the config located at _USER_CONFIG_PATH
4
+ (default: ~/.sky/skyconfig.yaml). Caller can then use
5
5
 
6
6
  >> skypilot_config.loaded()
7
7
 
@@ -35,14 +35,14 @@ Consider the following config contents:
35
35
 
36
36
  then:
37
37
 
38
- # Assuming ~/.sky/config.yaml exists and can be loaded:
38
+ # Assuming ~/.sky/skyconfig.yaml exists and can be loaded:
39
39
  skypilot_config.loaded() # ==> True
40
40
 
41
41
  skypilot_config.get_nested(('a', 'nested'), None) # ==> 1
42
42
  skypilot_config.get_nested(('a', 'nonexist'), None) # ==> None
43
43
  skypilot_config.get_nested(('a',), None) # ==> {'nested': 1}
44
44
 
45
- # If ~/.sky/config.yaml doesn't exist or failed to be loaded:
45
+ # If ~/.sky/skyconfig.yaml doesn't exist or failed to be loaded:
46
46
  skypilot_config.loaded() # ==> False
47
47
  skypilot_config.get_nested(('a', 'nested'), None) # ==> None
48
48
  skypilot_config.get_nested(('a', 'nonexist'), None) # ==> None
@@ -71,22 +71,38 @@ else:
71
71
 
72
72
  logger = sky_logging.init_logger(__name__)
73
73
 
74
- # The config path is discovered in this order:
74
+ # The config is generated as described below:
75
75
  #
76
- # (1) (Used internally) If env var {ENV_VAR_SKYPILOT_CONFIG} exists, use its
77
- # path;
78
- # (2) If file {CONFIG_PATH} exists, use this file.
76
+ # (*) (Used internally) If env var {ENV_VAR_SKYPILOT_CONFIG} exists, use its
77
+ # path as the config file. Do not use any other config files.
78
+ # This behavior is subject to change and should not be relied on by users.
79
+ # Else,
80
+ # (1) If env var {ENV_VAR_USER_CONFIG} exists, use its path as the user
81
+ # config file. Else, use the default path {_USER_CONFIG_PATH}.
82
+ # (2) If env var {ENV_VAR_PROJECT_CONFIG} exists, use its path as the project
83
+ # config file. Else, use the default path {_PROJECT_CONFIG_PATH}.
84
+ # (3) Override any config keys in (1) with the ones in (2).
85
+ # (4) Validate the final config.
79
86
  #
80
- # If the path discovered by (1) fails to load, we do not attempt to go to step
81
- # 2 in the list.
87
+ # (*) is used internally to implement the behavior of the jobs controller.
88
+ # It is not intended to be used by end users.
89
+ # (1) and (2) are used by end users to set non-default user and project config
90
+ # files on clients.
82
91
 
83
92
  # (Used internally) An env var holding the path to the local config file. This
84
93
  # is only used by jobs controller tasks to ensure recoveries of the same job
85
94
  # use the same config file.
86
95
  ENV_VAR_SKYPILOT_CONFIG = f'{constants.SKYPILOT_ENV_VAR_PREFIX}CONFIG'
87
96
 
88
- # Path to the local config file.
89
- CONFIG_PATH = '~/.sky/config.yaml'
97
+ # (Used by users) Environment variables for setting non-default user and
98
+ # project config files on clients.
99
+ ENV_VAR_USER_CONFIG = f'{constants.SKYPILOT_ENV_VAR_PREFIX}USER_CONFIG'
100
+ ENV_VAR_PROJECT_CONFIG = f'{constants.SKYPILOT_ENV_VAR_PREFIX}PROJECT_CONFIG'
101
+
102
+ # Path to the local config files.
103
+ _LEGACY_USER_CONFIG_PATH = '~/.sky/config.yaml'
104
+ _USER_CONFIG_PATH = '~/.sky/skyconfig.yaml'
105
+ _PROJECT_CONFIG_PATH = 'skyconfig.yaml'
90
106
 
91
107
  # The loaded config.
92
108
  _dict = config_utils.Config()
@@ -94,6 +110,23 @@ _loaded_config_path: Optional[str] = None
94
110
  _config_overridden: bool = False
95
111
 
96
112
 
113
+ # This function exists solely to maintain backward compatibility with the
114
+ # legacy user config file located at ~/.sky/config.yaml.
115
+ def get_user_config_path() -> str:
116
+ """Returns the path to the user config file.
117
+
118
+ If only the legacy user config file exists, return
119
+ the legacy user config path.
120
+ Otherwise, return the new user config path.
121
+ """
122
+ user_config_path = os.path.expanduser(_USER_CONFIG_PATH)
123
+ legacy_user_config_path = os.path.expanduser(_LEGACY_USER_CONFIG_PATH)
124
+ if (os.path.exists(legacy_user_config_path) and
125
+ not os.path.exists(user_config_path)):
126
+ return _LEGACY_USER_CONFIG_PATH
127
+ return _USER_CONFIG_PATH
128
+
129
+
97
130
  def get_nested(keys: Tuple[str, ...],
98
131
  default_value: Any,
99
132
  override_configs: Optional[Dict[str, Any]] = None) -> Any:
@@ -137,44 +170,149 @@ def to_dict() -> config_utils.Config:
137
170
  return copy.deepcopy(_dict)
138
171
 
139
172
 
173
+ def _get_config_file_path(envvar: str) -> Optional[str]:
174
+ config_path_via_env_var = os.environ.get(envvar)
175
+ if config_path_via_env_var is not None:
176
+ return os.path.expanduser(config_path_via_env_var)
177
+ return None
178
+
179
+
180
+ def _validate_config(config: Dict[str, Any], config_path: str) -> None:
181
+ """Validates the config."""
182
+ common_utils.validate_schema(
183
+ config,
184
+ schemas.get_config_schema(),
185
+ f'Invalid config YAML ({config_path}). See: '
186
+ 'https://docs.skypilot.co/en/latest/reference/config.html. ' # pylint: disable=line-too-long
187
+ 'Error: ',
188
+ skip_none=False)
189
+
190
+
191
+ def _overlay_skypilot_config(
192
+ original_config: Optional[config_utils.Config],
193
+ override_configs: Optional[config_utils.Config]) -> config_utils.Config:
194
+ """Overlays the override configs on the original configs."""
195
+ if original_config is None:
196
+ original_config = config_utils.Config()
197
+ config = original_config.get_nested(keys=tuple(),
198
+ default_value=None,
199
+ override_configs=override_configs,
200
+ allowed_override_keys=None,
201
+ disallowed_override_keys=None)
202
+ return config
203
+
204
+
140
205
  def _reload_config() -> None:
206
+ internal_config_path = os.environ.get(ENV_VAR_SKYPILOT_CONFIG)
207
+ if internal_config_path is not None:
208
+ # {ENV_VAR_SKYPILOT_CONFIG} is used internally.
209
+ # When this environment variable is set, the config loading
210
+ # behavior is not defined in the public interface.
211
+ # SkyPilot reserves the right to change the config loading behavior
212
+ # at any time when this environment variable is set.
213
+ _reload_config_from_internal_file(internal_config_path)
214
+ return
215
+
216
+ _reload_config_hierarchical()
217
+
218
+
219
+ def _parse_config_file(config_path: str) -> config_utils.Config:
220
+ config = config_utils.Config()
221
+ try:
222
+ config_dict = common_utils.read_yaml(config_path)
223
+ config = config_utils.Config.from_dict(config_dict)
224
+ logger.debug(
225
+ f'Config loaded from {config_path}:\n{pprint.pformat(config)}')
226
+ except yaml.YAMLError as e:
227
+ logger.error(f'Error in loading config file ({config_path}):', e)
228
+ if config:
229
+ _validate_config(config, config_path)
230
+
231
+ logger.debug(f'Config syntax check passed for path: {config_path}')
232
+ return config
233
+
234
+
235
+ def _reload_config_from_internal_file(internal_config_path: str) -> None:
141
236
  global _dict, _loaded_config_path
142
237
  # Reset the global variables, to avoid using stale values.
143
238
  _dict = config_utils.Config()
144
239
  _loaded_config_path = None
145
240
 
146
- config_path_via_env_var = os.environ.get(ENV_VAR_SKYPILOT_CONFIG)
147
- if config_path_via_env_var is not None:
148
- config_path = os.path.expanduser(config_path_via_env_var)
149
- if not os.path.exists(config_path):
241
+ config_path = os.path.expanduser(internal_config_path)
242
+ if not os.path.exists(config_path):
243
+ with ux_utils.print_exception_no_traceback():
244
+ raise FileNotFoundError(
245
+ 'Config file specified by env var '
246
+ f'{ENV_VAR_SKYPILOT_CONFIG} ({config_path!r}) does not '
247
+ 'exist. Please double check the path or unset the env var: '
248
+ f'unset {ENV_VAR_SKYPILOT_CONFIG}')
249
+ logger.debug(f'Using config path: {config_path}')
250
+ _dict = _parse_config_file(config_path)
251
+ _loaded_config_path = config_path
252
+
253
+
254
+ def _reload_config_hierarchical() -> None:
255
+ global _dict
256
+ # Reset the global variables, to avoid using stale values.
257
+ _dict = config_utils.Config()
258
+
259
+ # find the user config file
260
+ user_config_path = _get_config_file_path(ENV_VAR_USER_CONFIG)
261
+ if user_config_path:
262
+ logger.debug('using user config file specified by '
263
+ f'{ENV_VAR_USER_CONFIG}: {user_config_path}')
264
+ user_config_path = os.path.expanduser(user_config_path)
265
+ if not os.path.exists(user_config_path):
266
+ with ux_utils.print_exception_no_traceback():
267
+ raise FileNotFoundError(
268
+ 'Config file specified by env var '
269
+ f'{ENV_VAR_USER_CONFIG} ({user_config_path!r}) '
270
+ 'does not exist. Please double check the path or unset the '
271
+ f'env var: unset {ENV_VAR_USER_CONFIG}')
272
+ else:
273
+ user_config_path = get_user_config_path()
274
+ logger.debug(f'using default user config file: {user_config_path}')
275
+ user_config_path = os.path.expanduser(user_config_path)
276
+
277
+ overrides = []
278
+
279
+ # find the project config file
280
+ project_config_path = _get_config_file_path(ENV_VAR_PROJECT_CONFIG)
281
+ if project_config_path:
282
+ logger.debug('using project config file specified by '
283
+ f'{ENV_VAR_PROJECT_CONFIG}: {project_config_path}')
284
+ project_config_path = os.path.expanduser(project_config_path)
285
+ if not os.path.exists(project_config_path):
150
286
  with ux_utils.print_exception_no_traceback():
151
287
  raise FileNotFoundError(
152
288
  'Config file specified by env var '
153
- f'{ENV_VAR_SKYPILOT_CONFIG} ({config_path!r}) does not '
154
- 'exist. Please double check the path or unset the env var: '
155
- f'unset {ENV_VAR_SKYPILOT_CONFIG}')
289
+ f'{ENV_VAR_PROJECT_CONFIG} ({project_config_path!r}) '
290
+ 'does not exist. Please double check the path or unset the '
291
+ f'env var: unset {ENV_VAR_PROJECT_CONFIG}')
156
292
  else:
157
- config_path = CONFIG_PATH
158
- config_path = os.path.expanduser(config_path)
159
- if os.path.exists(config_path):
160
- logger.debug(f'Using config path: {config_path}')
161
- try:
162
- config = common_utils.read_yaml(config_path)
163
- _dict = config_utils.Config.from_dict(config)
164
- _loaded_config_path = config_path
165
- logger.debug(f'Config loaded:\n{pprint.pformat(_dict)}')
166
- except yaml.YAMLError as e:
167
- logger.error(f'Error in loading config file ({config_path}):', e)
168
- if _dict:
169
- common_utils.validate_schema(
170
- _dict,
171
- schemas.get_config_schema(),
172
- f'Invalid config YAML ({config_path}). See: '
173
- 'https://docs.skypilot.co/en/latest/reference/config.html. ' # pylint: disable=line-too-long
174
- 'Error: ',
175
- skip_none=False)
176
-
177
- logger.debug('Config syntax check passed.')
293
+ logger.debug(
294
+ f'using default project config file: {_PROJECT_CONFIG_PATH}')
295
+ project_config_path = _PROJECT_CONFIG_PATH
296
+ project_config_path = os.path.expanduser(project_config_path)
297
+
298
+ # load the user config file
299
+ if os.path.exists(user_config_path):
300
+ user_config = _parse_config_file(user_config_path)
301
+ _validate_config(user_config, user_config_path)
302
+ overrides.append(user_config)
303
+
304
+ if os.path.exists(project_config_path):
305
+ project_config = _parse_config_file(project_config_path)
306
+ _validate_config(project_config, project_config_path)
307
+ overrides.append(project_config)
308
+
309
+ # layer the configs on top of each other based on priority
310
+ overlaid_client_config: config_utils.Config = config_utils.Config()
311
+ for override in overrides:
312
+ overlaid_client_config = _overlay_skypilot_config(
313
+ original_config=overlaid_client_config, override_configs=override)
314
+ logger.debug(f'final config: {overlaid_client_config}')
315
+ _dict = overlaid_client_config
178
316
 
179
317
 
180
318
  def loaded_config_path() -> Optional[str]:
@@ -216,7 +354,7 @@ def override_skypilot_config(
216
354
  common_utils.validate_schema(
217
355
  config,
218
356
  schemas.get_config_schema(),
219
- f'Invalid config {config}. See: '
357
+ 'Invalid config. See: '
220
358
  'https://docs.skypilot.co/en/latest/reference/config.html. ' # pylint: disable=line-too-long
221
359
  'Error: ',
222
360
  skip_none=False)
@@ -267,11 +267,6 @@ available_node_types:
267
267
  {%- for label_key, label_value in labels.items() %}
268
268
  {{ label_key }}: {{ label_value|tojson }}
269
269
  {%- endfor %}
270
- {% if k8s_fuse_device_required %}
271
- annotations:
272
- # Required for FUSE mounting to access /dev/fuse
273
- container.apparmor.security.beta.kubernetes.io/ray-node: unconfined
274
- {% endif %}
275
270
  spec:
276
271
  # serviceAccountName: skypilot-service-account
277
272
  serviceAccountName: {{k8s_service_account_name}}
@@ -310,9 +305,12 @@ available_node_types:
310
305
  - name: dshm
311
306
  emptyDir:
312
307
  medium: Memory
313
- - name: dev-fuse # Required for fuse mounting
308
+ {% if k8s_fuse_device_required %}
309
+ - name: fusermount-shared-dir
314
310
  hostPath:
315
- path: /dev/fuse
311
+ path: {{k8s_fusermount_shared_dir}}
312
+ type: DirectoryOrCreate
313
+ {% endif %}
316
314
  containers:
317
315
  - name: ray-node
318
316
  imagePullPolicy: IfNotPresent
@@ -326,6 +324,10 @@ available_node_types:
326
324
  - name: {{ key }}
327
325
  value: {{ value }}
328
326
  {% endfor %}
327
+ {% if k8s_fuse_device_required %}
328
+ - name: FUSERMOUNT_SHARED_DIR
329
+ value: {{k8s_fusermount_shared_dir}}
330
+ {% endif %}
329
331
  # Do not change this command - it keeps the pod alive until it is
330
332
  # explicitly killed.
331
333
  command: ["/bin/bash", "-c", "--"]
@@ -350,11 +352,14 @@ available_node_types:
350
352
  (
351
353
  DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > /tmp/apt-update.log 2>&1 || \
352
354
  echo "Warning: apt-get update failed. Continuing anyway..." >> /tmp/apt-update.log
353
- PACKAGES="rsync curl netcat gcc patch pciutils fuse openssh-server";
355
+ # Install both fuse2 and fuse3 for compatibility for all possible fuse adapters in advance,
356
+ # so that both fusermount and fusermount3 can be masked before enabling SSH access.
357
+ PACKAGES="rsync curl wget netcat gcc patch pciutils fuse fuse3 openssh-server";
354
358
 
355
359
  # Separate packages into two groups: packages that are installed first
356
- # so that curl and rsync are available sooner to unblock the following
360
+ # so that curl, rsync and wget are available sooner to unblock the following
357
361
  # conda installation and rsync.
362
+ # Also, we install fuse first to avoid conflicts with fuse3.
358
363
  set -e
359
364
  INSTALL_FIRST="";
360
365
  MISSING_PACKAGES="";
@@ -364,7 +369,7 @@ available_node_types:
364
369
  INSTALL_FIRST="$INSTALL_FIRST netcat-openbsd";
365
370
  fi
366
371
  elif ! dpkg -l | grep -q "^ii $pkg "; then
367
- if [ "$pkg" == "curl" ] || [ "$pkg" == "rsync" ]; then
372
+ if [ "$pkg" == "curl" ] || [ "$pkg" == "rsync" ] || [ "$pkg" == "fuse" ] || [ "$pkg" == "wget" ]; then
368
373
  INSTALL_FIRST="$INSTALL_FIRST $pkg";
369
374
  else
370
375
  MISSING_PACKAGES="$MISSING_PACKAGES $pkg";
@@ -382,7 +387,52 @@ available_node_types:
382
387
  echo "Installing missing packages: $MISSING_PACKAGES";
383
388
  DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $MISSING_PACKAGES;
384
389
  fi;
385
-
390
+
391
+ {% if k8s_fuse_device_required %}
392
+ set -e
393
+ # Mask fusermount binary before enabling SSH access
394
+ FUSERMOUNT_PATH=$(which fusermount)
395
+ if [ -z "$FUSERMOUNT_PATH" ]; then
396
+ echo "Error: fusermount binary not found"
397
+ exit 1
398
+ fi
399
+ $(prefix_cmd) cp -p "$FUSERMOUNT_PATH" "${FUSERMOUNT_PATH}-original"
400
+ $(prefix_cmd) ln -sf {{k8s_fusermount_shared_dir}}/fusermount-shim "$FUSERMOUNT_PATH"
401
+ FUSERMOUNT3_PATH=$(which fusermount3)
402
+ if [ -z "$FUSERMOUNT3_PATH" ]; then
403
+ FUSERMOUNT3_PATH="${FUSERMOUNT_PATH}3"
404
+ fi
405
+ # Also mask fusermount3 for rclone and blobfuse2 (for unmount operation)
406
+ $(prefix_cmd) ln -sf "$FUSERMOUNT_PATH" "$FUSERMOUNT3_PATH"
407
+ # Add fusermount-wrapper to handle adapters that use libfuse directly, e.g. blobfuse2
408
+ $(prefix_cmd) ln -sf {{k8s_fusermount_shared_dir}}/fusermount-wrapper /bin/fusermount-wrapper
409
+ # Wait for the server to set up the fusermount shim binary in case:
410
+ # 1. The server daemonset was just deployed and is still starting up.
411
+ # 2. The node was just started and the server Pod is still starting up.
412
+ wait_for_fusermount() {
413
+ local timeout=60
414
+ local start_time=$(date +%s)
415
+ while ! command -v fusermount >/dev/null 2>&1; do
416
+ current_time=$(date +%s)
417
+ elapsed=$((current_time - start_time))
418
+ if [ $elapsed -ge $timeout ]; then
419
+ echo "Error: fusermount not ready after $timeout seconds"
420
+ exit 1
421
+ fi
422
+ sleep 1
423
+ done
424
+ }
425
+ wait_for_fusermount
426
+ # Some distributions may mount hostPath with noexec, copy the binary in this case.
427
+ if ! fusermount -V; then
428
+ echo "fusermount -V failed, copying fusermount-shim directly"
429
+ $(prefix_cmd) rm -f "$FUSERMOUNT_PATH"
430
+ $(prefix_cmd) cp -p {{k8s_fusermount_shared_dir}}/fusermount-shim "$FUSERMOUNT_PATH"
431
+ $(prefix_cmd) rm -f /bin/fusermount-wrapper
432
+ $(prefix_cmd) cp -p {{k8s_fusermount_shared_dir}}/fusermount-wrapper /bin/fusermount-wrapper
433
+ fi
434
+ {% endif %}
435
+
386
436
  $(prefix_cmd) mkdir -p /var/run/sshd;
387
437
  $(prefix_cmd) sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config;
388
438
  $(prefix_cmd) sed "s@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g" -i /etc/pam.d/sshd;
@@ -394,6 +444,7 @@ available_node_types:
394
444
  $(prefix_cmd) chmod 644 ~/.ssh/authorized_keys;
395
445
  $(prefix_cmd) service ssh restart;
396
446
  $(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
447
+
397
448
  ) > /tmp/${STEPS[0]}.log 2>&1 || {
398
449
  echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed
399
450
  cat /tmp/${STEPS[0]}.log
@@ -539,10 +590,8 @@ available_node_types:
539
590
  - mountPath: /dev/shm
540
591
  name: dshm
541
592
  {% if k8s_fuse_device_required %}
542
- securityContext:
543
- capabilities:
544
- add:
545
- - "SYS_ADMIN"
593
+ - name: fusermount-shared-dir
594
+ mountPath: {{k8s_fusermount_shared_dir}}
546
595
  {% endif %}
547
596
  resources:
548
597
  requests:
@@ -556,20 +605,12 @@ available_node_types:
556
605
  # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work
557
606
  {{k8s_resource_key}}: {{accelerator_count}}
558
607
  {% endif %}
559
- {% if k8s_fuse_device_required %}
560
- # Kubernetes resource exposed by the fuse device manager
561
- # https://gitlab.com/arm-research/smarter/smarter-device-manager
562
- smarter-devices/fuse: "1"
563
- {% endif %}
564
- {% if k8s_resource_key is not none or k8s_fuse_device_required %}
608
+ {% if k8s_resource_key is not none %}
565
609
  limits:
566
610
  # Limits need to be defined for GPU/TPU requests
567
611
  {% if k8s_resource_key is not none %}
568
612
  {{k8s_resource_key}}: {{accelerator_count}}
569
613
  {% endif %}
570
- {% if k8s_fuse_device_required %}
571
- smarter-devices/fuse: "1"
572
- {% endif %}
573
614
  {% endif %}
574
615
 
575
616
  setup_commands:
@@ -578,7 +619,7 @@ setup_commands:
578
619
  # Line 'sudo bash ..': set the ulimit as suggested by ray docs for performance. https://docs.ray.io/en/latest/cluster/vms/user-guides/large-cluster-best-practices.html#system-configuration
579
620
  # Line 'sudo grep ..': set the number of threads per process to unlimited to avoid `ray job submit` getting stuck when the number of running ray jobs increases.
580
621
  # Line 'mkdir -p ..': disable host key check
581
- # Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
622
+ # Line '[-f /etc/fuse.conf] ..': enable `-o allow_other` option for `goofys`
582
623
  # Line 'for step in ..': check if any failure indicator exists for the setup done in pod args and print the error message. This is only a best effort, as the
583
624
  # commands in pod args are asynchronous and we cannot guarantee the failure indicators are created before the setup commands finish.
584
625
  - |
@@ -1,17 +1,46 @@
1
1
  #!/usr/bin/env python3
2
+ # /// script
3
+ # dependencies = [
4
+ # "websockets>=14.0",
5
+ # ]
6
+ # ///
2
7
  """Starting a websocket with SkyPilot API server to proxy SSH to a k8s pod.
3
8
 
4
9
  This script is useful for users who do not have local Kubernetes credentials.
5
10
  """
6
11
  import asyncio
12
+ from http.cookiejar import MozillaCookieJar
7
13
  import os
8
14
  import sys
15
+ from typing import Dict
16
+ from urllib.request import Request
9
17
 
10
18
  import websockets
19
+ from websockets.asyncio.client import connect
20
+
21
+
22
+ def _get_cookie_header(url: str) -> Dict[str, str]:
23
+ """Extract Cookie header value from a cookie jar for a specific URL"""
24
+ cookie_path = os.environ.get('SKYPILOT_API_COOKIE_FILE')
25
+ if cookie_path is None:
26
+ return {}
27
+
28
+ request = Request(url)
29
+ cookie_jar = MozillaCookieJar(os.path.expanduser(cookie_path))
30
+ cookie_jar.load(ignore_discard=True, ignore_expires=True)
31
+ cookie_jar.add_cookie_header(request)
32
+ cookie_header = request.get_header('Cookie')
33
+ # if cookie file is empty, return empty dict
34
+ if cookie_header is None:
35
+ return {}
36
+ return {'Cookie': cookie_header}
11
37
 
12
38
 
13
39
  async def main(url: str) -> None:
14
- async with websockets.connect(url, ping_interval=None) as websocket:
40
+ cookie_header = _get_cookie_header(url)
41
+ async with connect(url,
42
+ ping_interval=None,
43
+ additional_headers=cookie_header) as websocket:
15
44
  if os.isatty(sys.stdin.fileno()):
16
45
  # pylint: disable=import-outside-toplevel
17
46
  import termios
@@ -59,6 +88,16 @@ async def websocket_to_stdout(websocket):
59
88
 
60
89
  if __name__ == '__main__':
61
90
  server_url = sys.argv[1].strip('/')
62
- websocket_url = (f'ws://{server_url}/kubernetes-pod-ssh-proxy'
91
+ if '://' not in server_url:
92
+ # Keep backward compatibility for legacy server URLs without protocol
93
+ # TODO(aylei): Remove this after 0.10.0
94
+ server_url = f'http://{server_url}'
95
+
96
+ server_proto, server_fqdn = server_url.split('://')
97
+ websocket_proto = 'ws'
98
+ if server_proto == 'https':
99
+ websocket_proto = 'wss'
100
+ server_url = f'{websocket_proto}://{server_fqdn}'
101
+ websocket_url = (f'{server_url}/kubernetes-pod-ssh-proxy'
63
102
  f'?cluster_name={sys.argv[2]}')
64
103
  asyncio.run(main(websocket_url))
sky/utils/config_utils.py CHANGED
@@ -146,7 +146,7 @@ def _get_nested(configs: Optional[Dict[str, Any]],
146
146
  curr = value
147
147
  else:
148
148
  return default_value
149
- logger.debug(f'User config: {".".join(keys)} -> {curr}')
149
+ logger.debug(f'Config: {".".join(keys)} -> {curr}')
150
150
  return curr
151
151
 
152
152
 
@@ -46,7 +46,7 @@ logger = sky_logging.init_logger(__name__)
46
46
  # controller resources spec.
47
47
  CONTROLLER_RESOURCES_NOT_VALID_MESSAGE = (
48
48
  '{controller_type} controller resources is not valid, please check '
49
- '~/.sky/config.yaml file and make sure '
49
+ '~/.sky/skyconfig.yaml file and make sure '
50
50
  '{controller_type}.controller.resources is a valid resources spec. '
51
51
  'Details:\n {err}')
52
52
 
@@ -328,9 +328,9 @@ cp kubeconfig ~/.kube/config
328
328
  # Verify that you can access the cluster
329
329
  kubectl get pods
330
330
 
331
- Also add this to your ~/.sky/config.yaml to use the new service account:
331
+ Also add this to your ~/.sky/skyconfig.yaml to use the new service account:
332
332
 
333
- # ~/.sky/config.yaml
333
+ # ~/.sky/skyconfig.yaml
334
334
  kubernetes:
335
335
  remote_identity: ${SKYPILOT_SA}
336
336
  "
@@ -1,20 +1,35 @@
1
- # When using pod@namespace+context, rsync passes args as: {us} -l pod namespace+context
2
- # We need to split the pod@namespace+context into pod, namespace and context
1
+ # We need to determine the pod, namespace and context from the args
3
2
  # For backward compatibility, we use + as the separator between namespace and context and add handling when context is not provided
4
- shift
5
- pod=$1
6
- shift
7
- echo "pod: $pod" >&2
8
- encoded_namespace_context=$1
9
- # Revert the encoded namespace+context to the original string.
10
- namespace_context=$(echo "$encoded_namespace_context" | sed 's|%40|@|g' | sed 's|%3A|:|g' | sed 's|%2B|+|g' | sed 's|%2F|/|g')
11
- echo "namespace_context: $namespace_context" >&2
3
+ if [ "$1" = "-l" ]; then
4
+ # -l pod namespace+context ...
5
+ # used by normal rsync
6
+ shift
7
+ pod=$1
8
+ shift
9
+ encoded_namespace_context=$1
10
+ shift
11
+ echo "pod: $pod" >&2
12
+ # Revert the encoded namespace+context to the original string.
13
+ namespace_context=$(echo "$encoded_namespace_context" | sed 's|%40|@|g' | sed 's|%3A|:|g' | sed 's|%2B|+|g' | sed 's|%2F|/|g')
14
+ echo "namespace_context: $namespace_context" >&2
15
+ else
16
+ # pod@namespace+context ...
17
+ # used by openrsync
18
+ encoded_pod_namespace_context=$1
19
+ shift
20
+ pod_namespace_context=$(echo "$encoded_pod_namespace_context" | sed 's|%40|@|g' | sed 's|%3A|:|g' | sed 's|%2B|+|g' | sed 's|%2F|/|g')
21
+ echo "pod_namespace_context: $pod_namespace_context" >&2
22
+ pod=$(echo $pod_namespace_context | cut -d@ -f1)
23
+ echo "pod: $pod" >&2
24
+ namespace_context=$(echo $pod_namespace_context | cut -d@ -f2-)
25
+ echo "namespace_context: $namespace_context" >&2
26
+ fi
12
27
  namespace=$(echo $namespace_context | cut -d+ -f1)
13
28
  echo "namespace: $namespace" >&2
14
29
  context=$(echo $namespace_context | grep '+' >/dev/null && echo $namespace_context | cut -d+ -f2- || echo "")
15
30
  echo "context: $context" >&2
16
31
  context_lower=$(echo "$context" | tr '[:upper:]' '[:lower:]')
17
- shift
32
+
18
33
  if [ -z "$context" ] || [ "$context_lower" = "none" ]; then
19
34
  # If context is none, it means we are using incluster auth. In this case,
20
35
  # we need to set KUBECONFIG to /dev/null to avoid using the kubeconfig file.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: skypilot-nightly
3
- Version: 1.0.0.dev20250411
3
+ Version: 1.0.0.dev20250413
4
4
  Summary: SkyPilot: An intercloud broker for the clouds
5
5
  Author: SkyPilot Team
6
6
  License: Apache 2.0
@@ -88,6 +88,7 @@ Requires-Dist: oci; extra == "oci"
88
88
  Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "oci"
89
89
  Provides-Extra: kubernetes
90
90
  Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "kubernetes"
91
+ Requires-Dist: websockets; extra == "kubernetes"
91
92
  Provides-Extra: remote
92
93
  Requires-Dist: grpcio!=1.48.0,>=1.32.0; python_version < "3.10" and extra == "remote"
93
94
  Requires-Dist: grpcio!=1.48.0,>=1.42.0; python_version >= "3.10" and extra == "remote"
@@ -145,6 +146,7 @@ Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
145
146
  Requires-Dist: oci; extra == "all"
146
147
  Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
147
148
  Requires-Dist: kubernetes!=32.0.0,>=20.0.0; extra == "all"
149
+ Requires-Dist: websockets; extra == "all"
148
150
  Requires-Dist: grpcio!=1.48.0,>=1.32.0; python_version < "3.10" and extra == "all"
149
151
  Requires-Dist: grpcio!=1.48.0,>=1.42.0; python_version >= "3.10" and extra == "all"
150
152
  Requires-Dist: protobuf!=3.19.5,>=3.15.3; extra == "all"