skypilot_nightly-1.0.0.dev20250410-py3-none-any.whl → skypilot_nightly-1.0.0.dev20250412-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/oci.py +2 -2
  3. sky/authentication.py +2 -2
  4. sky/backends/backend_utils.py +1 -1
  5. sky/backends/cloud_vm_ray_backend.py +3 -3
  6. sky/check.py +1 -1
  7. sky/cli.py +51 -47
  8. sky/client/cli.py +51 -47
  9. sky/client/common.py +4 -2
  10. sky/client/sdk.py +60 -27
  11. sky/clouds/aws.py +2 -2
  12. sky/clouds/cloud.py +3 -2
  13. sky/clouds/kubernetes.py +20 -3
  14. sky/clouds/nebius.py +2 -4
  15. sky/clouds/oci.py +2 -2
  16. sky/clouds/utils/oci_utils.py +1 -1
  17. sky/core.py +12 -17
  18. sky/data/mounting_utils.py +34 -10
  19. sky/exceptions.py +1 -1
  20. sky/execution.py +5 -4
  21. sky/jobs/client/sdk.py +5 -0
  22. sky/optimizer.py +1 -2
  23. sky/provision/instance_setup.py +3 -1
  24. sky/provision/kubernetes/config.py +41 -36
  25. sky/provision/kubernetes/instance.py +4 -7
  26. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +54 -0
  27. sky/provision/kubernetes/network_utils.py +1 -1
  28. sky/provision/kubernetes/utils.py +51 -35
  29. sky/serve/client/sdk.py +6 -0
  30. sky/server/common.py +16 -1
  31. sky/server/constants.py +5 -0
  32. sky/server/requests/payloads.py +2 -0
  33. sky/setup_files/dependencies.py +1 -1
  34. sky/skylet/constants.py +2 -2
  35. sky/skypilot_config.py +197 -70
  36. sky/templates/kubernetes-ray.yml.j2 +66 -25
  37. sky/templates/websocket_proxy.py +41 -2
  38. sky/utils/config_utils.py +1 -1
  39. sky/utils/controller_utils.py +1 -1
  40. sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
  41. sky/utils/kubernetes/rsync_helper.sh +26 -11
  42. {skypilot_nightly-1.0.0.dev20250410.dist-info → skypilot_nightly-1.0.0.dev20250412.dist-info}/METADATA +3 -1
  43. {skypilot_nightly-1.0.0.dev20250410.dist-info → skypilot_nightly-1.0.0.dev20250412.dist-info}/RECORD +47 -48
  44. sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml +0 -10
  45. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +0 -68
  46. {skypilot_nightly-1.0.0.dev20250410.dist-info → skypilot_nightly-1.0.0.dev20250412.dist-info}/WHEEL +0 -0
  47. {skypilot_nightly-1.0.0.dev20250410.dist-info → skypilot_nightly-1.0.0.dev20250412.dist-info}/entry_points.txt +0 -0
  48. {skypilot_nightly-1.0.0.dev20250410.dist-info → skypilot_nightly-1.0.0.dev20250412.dist-info}/licenses/LICENSE +0 -0
  49. {skypilot_nightly-1.0.0.dev20250410.dist-info → skypilot_nightly-1.0.0.dev20250412.dist-info}/top_level.txt +0 -0
sky/clouds/aws.py CHANGED
@@ -472,10 +472,10 @@ class AWS(clouds.Cloud):
  with ux_utils.print_exception_no_traceback():
  logger.warning(
  f'Skip opening ports {resources.ports} for cluster {cluster_name!r}, '
- 'as `aws.security_group_name` in `~/.sky/config.yaml` is specified as '
+ 'as `aws.security_group_name` in `~/.sky/skyconfig.yaml` is specified as '
  f' {security_group!r}. Please make sure the specified security group '
  'has requested ports setup; or, leave out `aws.security_group_name` '
- 'in `~/.sky/config.yaml`.')
+ 'in `~/.sky/skyconfig.yaml`.')

  return {
  'instance_type': r.instance_type,
sky/clouds/cloud.py CHANGED
@@ -37,7 +37,7 @@ class CloudImplementationFeatures(enum.Enum):
  _cloud_unsupported_features in all clouds to make sure the
  check_features_are_supported() works as expected.
  """
- STOP = 'stop'  # Includes both stop and autostop.
+ STOP = 'stop'
  MULTI_NODE = 'multi-node'
  CLONE_DISK_FROM_CLUSTER = 'clone_disk_from_cluster'
  IMAGE_ID = 'image_id'
@@ -47,7 +47,8 @@ class CloudImplementationFeatures(enum.Enum):
  OPEN_PORTS = 'open_ports'
  STORAGE_MOUNTING = 'storage_mounting'
  HOST_CONTROLLERS = 'host_controllers'  # Can run jobs/serve controllers
- AUTO_TERMINATE = 'auto_terminate'  # Pod/VM can stop or down itself
+ AUTOSTOP = 'autostop'  # Pod/VM can stop itself
+ AUTODOWN = 'autodown'  # Pod/VM can down itself


  # Use str, enum.Enum to allow CloudCapability to be used as a string.
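Note: this release splits the single AUTO_TERMINATE flag into separate AUTOSTOP and AUTODOWN flags, so a cloud can declare support for each independently. A minimal sketch of how a cloud could use the split, assuming a hypothetical `MyCloud` (the Nebius hunk below is the real in-tree example):

    from sky import clouds

    class MyCloud(clouds.Cloud):  # hypothetical cloud, for illustration only
        _CLOUD_UNSUPPORTED_FEATURES = {
            # Autodown can now be rejected while stop/autostop stay supported.
            clouds.CloudImplementationFeatures.AUTODOWN:
                ('Autodown not supported on MyCloud.'),
        }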
sky/clouds/kubernetes.py CHANGED
@@ -35,6 +35,10 @@ CREDENTIAL_PATH = os.environ.get('KUBECONFIG', DEFAULT_KUBECONFIG_PATH)
  # E.g., FUSE device manager daemonset is run in this namespace.
  _SKYPILOT_SYSTEM_NAMESPACE = 'skypilot-system'

+ # Shared directory to communicate with fusermount-server, refer to
+ # addons/fuse-proxy/README.md for more details.
+ _FUSERMOUNT_SHARED_DIR = '/var/run/fusermount'
+

  @registry.CLOUD_REGISTRY.register(aliases=['k8s'])
  class Kubernetes(clouds.Cloud):
@@ -110,9 +114,13 @@ class Kubernetes(clouds.Cloud):
  # Controllers cannot spin up new pods with exec auth.
  unsupported_features[
  clouds.CloudImplementationFeatures.HOST_CONTROLLERS] = message
- # Pod does not have permissions to terminate itself with exec auth.
+ # Pod does not have permissions to down itself with exec auth.
  unsupported_features[
- clouds.CloudImplementationFeatures.AUTO_TERMINATE] = message
+ clouds.CloudImplementationFeatures.AUTODOWN] = message
+ unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
+ 'Stopping clusters is not supported on Kubernetes.')
+ unsupported_features[clouds.CloudImplementationFeatures.AUTOSTOP] = (
+ 'Auto-stop is not supported on Kubernetes.')
  # Allow spot instances if supported by the cluster
  try:
  spot_label_key, _ = kubernetes_utils.get_spot_label(context)
@@ -551,8 +559,9 @@ class Kubernetes(clouds.Cloud):
  'k8s_service_account_name': k8s_service_account_name,
  'k8s_automount_sa_token': k8s_automount_sa_token,
  'k8s_fuse_device_required': fuse_device_required,
- # Namespace to run the FUSE device manager in
+ # Namespace to run the fusermount-server daemonset in
  'k8s_skypilot_system_namespace': _SKYPILOT_SYSTEM_NAMESPACE,
+ 'k8s_fusermount_shared_dir': _FUSERMOUNT_SHARED_DIR,
  'k8s_spot_label_key': spot_label_key,
  'k8s_spot_label_value': spot_label_value,
  'tpu_requested': tpu_requested,
@@ -658,6 +667,14 @@ class Kubernetes(clouds.Cloud):
  def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
  """Checks if the user has access credentials to
  Kubernetes."""
+ # Check for port forward dependencies
+ reasons = kubernetes_utils.check_port_forward_mode_dependencies(False)
+ if reasons is not None:
+ formatted = '\n'.join(
+ [reasons[0]] +
+ [f'{cls._INDENT_PREFIX}' + r for r in reasons[1:]])
+ return (False, formatted)
+
  # Test using python API
  try:
  existing_allowed_contexts = cls.existing_allowed_contexts()
sky/clouds/nebius.py CHANGED
@@ -53,10 +53,8 @@ class Nebius(clouds.Cloud):
  """Nebius GPU Cloud"""
  _REPR = 'Nebius'
  _CLOUD_UNSUPPORTED_FEATURES = {
- clouds.CloudImplementationFeatures.AUTO_TERMINATE:
- ('Autodown and Autostop not supported. Can\'t delete disk.'),
- # Autostop functionality can be implemented, but currently,
- # there is only a single flag for both autostop and autodown.
+ clouds.CloudImplementationFeatures.AUTODOWN:
+ ('Autodown not supported. Can\'t delete OS disk.'),
  clouds.CloudImplementationFeatures.SPOT_INSTANCE:
  ('Spot is not supported, as Nebius API does not implement spot.'),
  clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
sky/clouds/oci.py CHANGED
@@ -9,8 +9,8 @@ History:
  file path resolution (by os.path.expanduser) when construct the file
  mounts. This bug will cause the created workder nodes located in different
  compartment and VCN than the header node if user specifies compartment_id
- in the sky config file, because the ~/.sky/config.yaml is not sync-ed to the
- remote machine.
+ in the sky config file, because the ~/.sky/skyconfig.yaml is not
+ sync-ed to the remote machine.
  The workaround is set the sky config file path using ENV before running
  the sky launch: export SKYPILOT_CONFIG=/home/ubuntu/.sky/config.yaml
  - Hysun He (hysun.he@oracle.com) @ Oct 12, 2024:
@@ -147,7 +147,7 @@ class OCIConfig:
  if config_path_via_env_var is not None:
  config_path = config_path_via_env_var
  else:
- config_path = skypilot_config.CONFIG_PATH
+ config_path = skypilot_config.get_user_config_path()
  return config_path

  @classmethod
sky/core.py CHANGED
@@ -629,26 +629,21 @@ def autostop(
  raise exceptions.NotSupportedError(
  f'{operation} cluster {cluster_name!r} with backend '
  f'{backend.__class__.__name__!r} is not supported.')
- # Check autostop is implemented for cloud
  cloud = handle.launched_resources.cloud
- if not down and not is_cancel:
- try:
- cloud.check_features_are_supported(
- handle.launched_resources,
- {clouds.CloudImplementationFeatures.STOP})
- except exceptions.NotSupportedError as e:
- raise exceptions.NotSupportedError(
- f'{colorama.Fore.YELLOW}Scheduling autostop on cluster '
- f'{cluster_name!r}...skipped.{colorama.Style.RESET_ALL}\n'
- f' {_stop_not_supported_message(handle.launched_resources)}.'
- ) from e
-
- # Check if autodown is required and supported
+ # Check if autostop/autodown is required and supported
  if not is_cancel:
  try:
- cloud.check_features_are_supported(
- handle.launched_resources,
- {clouds.CloudImplementationFeatures.AUTO_TERMINATE})
+ if down:
+ cloud.check_features_are_supported(
+ handle.launched_resources,
+ {clouds.CloudImplementationFeatures.AUTODOWN})
+ else:
+ cloud.check_features_are_supported(
+ handle.launched_resources,
+ {clouds.CloudImplementationFeatures.STOP})
+ cloud.check_features_are_supported(
+ handle.launched_resources,
+ {clouds.CloudImplementationFeatures.AUTOSTOP})
  except exceptions.NotSupportedError as e:
  raise exceptions.NotSupportedError(
  f'{colorama.Fore.YELLOW}{operation} on cluster '
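With the split flags, the autostop path above now probes different cloud features depending on whether the cluster should be downed or stopped. A condensed sketch of the resulting decision, not verbatim SkyPilot code:

    from sky import clouds

    def required_features(down: bool):
        f = clouds.CloudImplementationFeatures
        # Autodown needs AUTODOWN; autostop needs both STOP and AUTOSTOP,
        # which core.py checks with two separate calls in the hunk above.
        return {f.AUTODOWN} if down else {f.STOP, f.AUTOSTOP}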
sky/data/mounting_utils.py CHANGED
@@ -30,9 +30,17 @@ _BLOBFUSE_CACHE_DIR = ('~/.sky/blobfuse2_cache/'
  # https://github.com/rclone/rclone/releases
  RCLONE_VERSION = 'v1.68.2'

+ # A wrapper for goofys to choose the logging mechanism based on environment.
+ _GOOFYS_WRAPPER = ('$(if [ -S /dev/log ] ; then '
+ 'echo "goofys"; '
+ 'else '
+ 'echo "goofys --log-file $(mktemp -t goofys.XXXX.log)"; '
+ 'fi)')
+

  def get_s3_mount_install_cmd() -> str:
  """Returns a command to install S3 mount utility goofys."""
+ # TODO(aylei): maintain our goofys fork under skypilot-org
  install_cmd = ('ARCH=$(uname -m) && '
  'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
  ' echo "goofys is not supported on $ARCH" && '
@@ -40,8 +48,8 @@ def get_s3_mount_install_cmd() -> str:
  'else '
  ' ARCH_SUFFIX="amd64"; '
  'fi && '
- 'sudo wget -nc https://github.com/romilbhardwaj/goofys/'
- 'releases/download/0.24.0-romilb-upstream/goofys '
+ 'sudo wget -nc https://github.com/aylei/goofys/'
+ 'releases/download/0.24.0-aylei-upstream/goofys '
  '-O /usr/local/bin/goofys && '
  'sudo chmod 755 /usr/local/bin/goofys')
  return install_cmd
@@ -56,7 +64,7 @@ def get_s3_mount_cmd(bucket_name: str,
  _bucket_sub_path = ''
  else:
  _bucket_sub_path = f':{_bucket_sub_path}'
- mount_cmd = ('goofys -o allow_other '
+ mount_cmd = (f'{_GOOFYS_WRAPPER} -o allow_other '
  f'--stat-cache-ttl {_STAT_CACHE_TTL} '
  f'--type-cache-ttl {_TYPE_CACHE_TTL} '
  f'{bucket_name}{_bucket_sub_path} {mount_path}')
@@ -73,7 +81,8 @@ def get_nebius_mount_cmd(nebius_profile_name: str,
  _bucket_sub_path = ''
  else:
  _bucket_sub_path = f':{_bucket_sub_path}'
- mount_cmd = (f'AWS_PROFILE={nebius_profile_name} goofys -o allow_other '
+ mount_cmd = (f'AWS_PROFILE={nebius_profile_name} {_GOOFYS_WRAPPER} '
+ '-o allow_other '
  f'--stat-cache-ttl {_STAT_CACHE_TTL} '
  f'--type-cache-ttl {_TYPE_CACHE_TTL} '
  f'--endpoint {endpoint_url} '
@@ -185,14 +194,28 @@ def get_az_mount_cmd(container_name: str,
  bucket_sub_path_arg = ''
  else:
  bucket_sub_path_arg = f'--subdirectory={_bucket_sub_path}/ '
+ mount_options = '-o allow_other -o default_permissions'
  # TODO(zpoint): clear old cache that has been created in the previous boot.
+ blobfuse2_cmd = ('blobfuse2 --no-symlinks -o umask=022 '
+ f'--tmp-path {cache_path}_$({remote_boot_time_cmd}) '
+ f'{bucket_sub_path_arg}'
+ f'--container-name {container_name}')
+ # 1. Set -o nonempty to bypass empty directory check of blobfuse2 when using
+ # fusermount-wrapper, since the mount is delegated to fusermount and
+ # blobfuse2 only get the mounted fd.
+ # 2. {} is the mount point placeholder that will be replaced with the
+ # mounted fd by fusermount-wrapper.
+ wrapped = (f'fusermount-wrapper -m {mount_path} {mount_options} '
+ f'-- {blobfuse2_cmd} -o nonempty {{}}')
+ original = f'{blobfuse2_cmd} {mount_options} {mount_path}'
+ # If fusermount-wrapper is available, use it to wrap the blobfuse2 command
+ # to avoid requiring root privilege.
+ # TODO(aylei): feeling hacky, refactor this.
+ get_mount_cmd = ('command -v fusermount-wrapper >/dev/null 2>&1 && '
+ f'echo "{wrapped}" || echo "{original}"')
  mount_cmd = (f'AZURE_STORAGE_ACCOUNT={storage_account_name} '
  f'{key_env_var} '
- f'blobfuse2 {mount_path} --allow-other --no-symlinks '
- '-o umask=022 -o default_permissions '
- f'--tmp-path {cache_path}_$({remote_boot_time_cmd}) '
- f'{bucket_sub_path_arg}'
- f'--container-name {container_name}')
+ f'$({get_mount_cmd})')
  return mount_cmd


@@ -209,7 +232,8 @@ def get_r2_mount_cmd(r2_credentials_path: str,
  else:
  _bucket_sub_path = f':{_bucket_sub_path}'
  mount_cmd = (f'AWS_SHARED_CREDENTIALS_FILE={r2_credentials_path} '
- f'AWS_PROFILE={r2_profile_name} {_GOOFYS_WRAPPER} '
+ '-o allow_other '
  f'--stat-cache-ttl {_STAT_CACHE_TTL} '
  f'--type-cache-ttl {_TYPE_CACHE_TTL} '
  f'--endpoint {endpoint_url} '
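The `$({get_mount_cmd})` substitution above means the actual mount command is chosen on the remote host at run time: if fusermount-wrapper is on PATH, blobfuse2 is driven through it (with `-o nonempty` and the `{}` fd placeholder), otherwise the classic direct blobfuse2 invocation is used. A rough Python sketch of the same selection; the bucket name and paths are illustrative placeholders:

    import shutil

    blobfuse2_cmd = ('blobfuse2 --no-symlinks -o umask=022 '
                     '--tmp-path /tmp/cache --container-name my-container')
    mount_options = '-o allow_other -o default_permissions'
    mount_path = '/mnt/data'
    if shutil.which('fusermount-wrapper'):  # like `command -v` in the diff
        cmd = (f'fusermount-wrapper -m {mount_path} {mount_options} '
               f'-- {blobfuse2_cmd} -o nonempty {{}}')
    else:
        cmd = f'{blobfuse2_cmd} {mount_options} {mount_path}'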
sky/exceptions.py CHANGED
@@ -265,7 +265,7 @@ class CommandError(SkyPilotExcludeArgsBaseException):
  # Chunk the command to avoid overflow.
  command = command[:100] + '...'
  message = (f'Command {command} failed with return code '
- f'{returncode}.\n{error_msg}')
+ f'{returncode}.\n{error_msg}\n{detailed_reason}')
  super().__init__(message)

sky/execution.py CHANGED
@@ -237,11 +237,12 @@ def _execute(
  if Stage.DOWN in stages:
  stages.remove(Stage.DOWN)
  if idle_minutes_to_autostop >= 0:
- requested_features.add(
- clouds.CloudImplementationFeatures.AUTO_TERMINATE)
- if not down:
+ if down:
  requested_features.add(
- clouds.CloudImplementationFeatures.STOP)
+ clouds.CloudImplementationFeatures.AUTODOWN)
+ else:
+ requested_features.add(
+ clouds.CloudImplementationFeatures.AUTOSTOP)
  # NOTE: in general we may not have sufficiently specified info
  # (cloud/resource) to check STOP_SPOT_INSTANCE here. This is checked in
  # the backend.
sky/jobs/client/sdk.py CHANGED
@@ -82,6 +82,7 @@ def launch(
  f'{server_common.get_server_url()}/jobs/launch',
  json=json.loads(body.model_dump_json()),
  timeout=(5, None),
+ cookies=server_common.get_api_cookie_jar(),
  )
  return server_common.get_request_id(response)

@@ -138,6 +139,7 @@ def queue(refresh: bool,
  f'{server_common.get_server_url()}/jobs/queue',
  json=json.loads(body.model_dump_json()),
  timeout=(5, None),
+ cookies=server_common.get_api_cookie_jar(),
  )
  return server_common.get_request_id(response=response)

@@ -177,6 +179,7 @@ def cancel(
  f'{server_common.get_server_url()}/jobs/cancel',
  json=json.loads(body.model_dump_json()),
  timeout=(5, None),
+ cookies=server_common.get_api_cookie_jar(),
  )
  return server_common.get_request_id(response=response)

@@ -224,6 +227,7 @@ def tail_logs(name: Optional[str] = None,
  json=json.loads(body.model_dump_json()),
  stream=True,
  timeout=(5, None),
+ cookies=server_common.get_api_cookie_jar(),
  )
  request_id = server_common.get_request_id(response)
  return sdk.stream_response(request_id, response, output_stream)
@@ -267,6 +271,7 @@ def download_logs(
  f'{server_common.get_server_url()}/jobs/download_logs',
  json=json.loads(body.model_dump_json()),
  timeout=(5, None),
+ cookies=server_common.get_api_cookie_jar(),
  )
  job_id_remote_path_dict = sdk.stream_and_get(
  server_common.get_request_id(response))
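Every jobs SDK call now forwards the client's cookie jar, so requests keep working behind authenticating reverse proxies in front of the API server. The pattern, reduced to a standalone sketch (the URL is illustrative; in the real code the jar comes from server_common.get_api_cookie_jar()):

    import requests

    def post_with_cookies(url: str, payload: dict):
        # An empty jar stands in for the persisted API cookie jar here.
        cookies = requests.cookies.RequestsCookieJar()
        return requests.post(url, json=payload, timeout=(5, None),
                             cookies=cookies)

    # post_with_cookies('https://api.example.com/jobs/queue', {'refresh': False})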
sky/optimizer.py CHANGED
@@ -6,6 +6,7 @@ import typing
  from typing import Any, Dict, Iterable, List, Optional, Set, Tuple

  import colorama
+ import numpy as np
  import prettytable

  from sky import check as sky_check
@@ -28,12 +29,10 @@ from sky.utils import ux_utils

  if typing.TYPE_CHECKING:
  import networkx as nx
- import numpy as np

  from sky import dag as dag_lib
  else:
  nx = adaptors_common.LazyImport('networkx')
- np = adaptors_common.LazyImport('numpy')

  logger = sky_logging.init_logger(__name__)

sky/provision/instance_setup.py CHANGED
@@ -44,7 +44,9 @@ _DUMP_RAY_PORTS = (

  _RAY_PORT_COMMAND = (
  f'RAY_PORT=$({constants.SKY_PYTHON_CMD} -c '
- '"from sky.skylet import job_lib; print(job_lib.get_ray_port())" '
+ '"from sky import sky_logging\n'
+ 'with sky_logging.silent(): '
+ 'from sky.skylet import job_lib; print(job_lib.get_ray_port())" '
  '2> /dev/null || echo 6379);'
  f'{constants.SKY_PYTHON_CMD} -c "from sky.utils import message_utils; '
  'print(message_utils.encode_payload({\'ray_port\': $RAY_PORT}))"')
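The hunk above wraps the inline Python that probes the Ray port in `sky_logging.silent()`, so logger output cannot leak into the shell command substitution and corrupt `RAY_PORT`. A sketch of plausible `silent()` semantics, assumed from its usage here rather than copied from sky_logging:

    import contextlib
    import logging

    @contextlib.contextmanager
    def silent():
        # Temporarily raise the global logging threshold so import-time log
        # lines are suppressed; only the final print() reaches stdout.
        prev = logging.root.manager.disable
        logging.disable(logging.CRITICAL)
        try:
            yield
        finally:
            logging.disable(prev)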
sky/provision/kubernetes/config.py CHANGED
@@ -43,7 +43,7 @@ def bootstrap_instances(
  if (requested_service_account ==
  kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME):
  # If the user has requested a different service account (via pod_config
- # in ~/.sky/config.yaml), we assume they have already set up the
+ # in ~/.sky/skyconfig.yaml), we assume they have already set up the
  # necessary roles and role bindings.
  # If not, set up the roles and bindings for skypilot-service-account
  # here.
@@ -561,69 +561,74 @@ def _configure_skypilot_system_namespace(


  def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None:
- """Creates sidecars required for FUSE mounting.
+ """Creates the privileged daemonset required for FUSE mounting.

  FUSE mounting in Kubernetes without privileged containers requires us to
- run a sidecar container with the necessary capabilities. We run a daemonset
- which exposes the host /dev/fuse device as a Kubernetes resource. The
- SkyPilot pod requests this resource to mount the FUSE filesystem.
+ run a privileged daemonset which accepts fusermount requests via unix
+ domain socket and perform the mount/unmount operations on the host /dev/fuse
+ device.

  We create this daemonset in the skypilot_system_namespace, which is
- configurable in the provider config. This allows the FUSE mounting sidecar
- to be shared across multiple tenants. The default namespace is
+ configurable in the provider config. This allows the daemonset to be
+ shared across multiple tenants. The default namespace is
  'skypilot-system' (populated in clouds.Kubernetes).
+
+ For legacy smarter-device-manager daemonset, we keep it as is since it may
+ still be used by other tenants.
  """

- logger.info('_configure_fuse_mounting: Setting up FUSE device manager.')
+ logger.info(
+ '_configure_fuse_mounting: Setting up fusermount-server daemonset.')

- fuse_device_manager_namespace = provider_config['skypilot_system_namespace']
+ fuse_proxy_namespace = provider_config['skypilot_system_namespace']
  context = kubernetes_utils.get_context_from_config(provider_config)

- # Read the device manager YAMLs from the manifests directory
+ # Read the YAMLs from the manifests directory
  root_dir = os.path.dirname(os.path.dirname(__file__))

- # Load and create the ConfigMap
- logger.info('_configure_fuse_mounting: Creating configmap.')
- config_map_path = os.path.join(
- root_dir, 'kubernetes/manifests/smarter-device-manager-configmap.yaml')
- with open(config_map_path, 'r', encoding='utf-8') as file:
- config_map = yaml.safe_load(file)
- kubernetes_utils.merge_custom_metadata(config_map['metadata'])
- try:
- kubernetes.core_api(context).create_namespaced_config_map(
- fuse_device_manager_namespace, config_map)
- except kubernetes.api_exception() as e:
- if e.status == 409:
- logger.info('_configure_fuse_mounting: ConfigMap already exists '
- f'in namespace {fuse_device_manager_namespace!r}')
- else:
- raise
- else:
- logger.info('_configure_fuse_mounting: ConfigMap created '
- f'in namespace {fuse_device_manager_namespace!r}')
-
  # Load and create the DaemonSet
+ # TODO(aylei): support customize and upgrade the fusermount-server image
  logger.info('_configure_fuse_mounting: Creating daemonset.')
  daemonset_path = os.path.join(
- root_dir, 'kubernetes/manifests/smarter-device-manager-daemonset.yaml')
+ root_dir, 'kubernetes/manifests/fusermount-server-daemonset.yaml')
  with open(daemonset_path, 'r', encoding='utf-8') as file:
  daemonset = yaml.safe_load(file)
  kubernetes_utils.merge_custom_metadata(daemonset['metadata'])
  try:
  kubernetes.apps_api(context).create_namespaced_daemon_set(
- fuse_device_manager_namespace, daemonset)
+ fuse_proxy_namespace, daemonset)
  except kubernetes.api_exception() as e:
  if e.status == 409:
  logger.info('_configure_fuse_mounting: DaemonSet already exists '
- f'in namespace {fuse_device_manager_namespace!r}')
+ f'in namespace {fuse_proxy_namespace!r}')
+ existing_ds = kubernetes.apps_api(
+ context).read_namespaced_daemon_set(
+ daemonset['metadata']['name'], fuse_proxy_namespace)
+ ds_image = daemonset['spec']['template']['spec']['containers'][0][
+ 'image']
+ if existing_ds.spec.template.spec.containers[0].image != ds_image:
+ logger.info(
+ '_configure_fuse_mounting: Updating DaemonSet image.')
+ kubernetes.apps_api(context).patch_namespaced_daemon_set(
+ daemonset['metadata']['name'], fuse_proxy_namespace,
+ daemonset)
+ elif e.status == 403 or e.status == 401:
+ logger.error('SkyPilot does not have permission to create '
+ 'fusermount-server DaemonSet in namespace '
+ f'{fuse_proxy_namespace!r}, Error: {e.reason}. '
+ 'Please check the permissions of the SkyPilot service '
+ 'account or contact your cluster admin to create the '
+ 'DaemonSet manually. '
+ 'Reference: https://docs.skypilot.co/reference/kubernetes/kubernetes-setup.html#kubernetes-setup-fuse')  # pylint: disable=line-too-long
+ raise
  else:
  raise
  else:
  logger.info('_configure_fuse_mounting: DaemonSet created '
- f'in namespace {fuse_device_manager_namespace!r}')
+ f'in namespace {fuse_proxy_namespace!r}')

- logger.info('FUSE device manager setup complete '
- f'in namespace {fuse_device_manager_namespace!r}')
+ logger.info('fusermount-server daemonset setup complete '
+ f'in namespace {fuse_proxy_namespace!r}')


  def _configure_services(namespace: str, context: Optional[str],
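The DaemonSet setup above is effectively an upsert: create, and on a 409 conflict patch the existing object if its image differs. The same pattern with the vanilla kubernetes Python client, as a hedged standalone sketch (namespace and manifest are placeholders, not SkyPilot's wrappers):

    from kubernetes import client
    from kubernetes.client.rest import ApiException

    def upsert_daemonset(apps: client.AppsV1Api, namespace: str, body: dict):
        try:
            apps.create_namespaced_daemon_set(namespace, body)
        except ApiException as e:
            if e.status != 409:  # anything but "already exists" is fatal
                raise
            name = body['metadata']['name']
            existing = apps.read_namespaced_daemon_set(name, namespace)
            new_image = body['spec']['template']['spec']['containers'][0]['image']
            if existing.spec.template.spec.containers[0].image != new_image:
                apps.patch_namespaced_daemon_set(name, namespace, body)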
sky/provision/kubernetes/instance.py CHANGED
@@ -162,12 +162,9 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
  raise config_lib.KubernetesError(
  _lack_resource_msg('memory', pod,
  details=event_message))
- if 'Insufficient smarter-devices/fuse' in event_message:
- raise config_lib.KubernetesError(
- 'Something went wrong with FUSE device daemonset.'
- ' Try restarting your FUSE pods by running '
- '`kubectl delete pods -n skypilot-system -l name=smarter-device-manager`.'  # pylint: disable=line-too-long
- f' Full error: {event_message}')
+ # TODO(aylei): after switching from smarter-device-manager to
+ # fusermount-server, we need a new way to check whether the
+ # fusermount-server daemonset is ready.
  gpu_lf_keys = [
  key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
  for key in lf.get_label_keys()
@@ -723,7 +720,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  f'{common_utils.format_exception(e)}'
  'Continuing without using nvidia RuntimeClass.\n'
  'If you are on a K3s cluster, manually '
- 'override runtimeClassName in ~/.sky/config.yaml. '
+ 'override runtimeClassName in ~/.sky/skyconfig.yaml. '
  'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html')  # pylint: disable=line-too-long

  needs_gpus = False
sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml ADDED
@@ -0,0 +1,54 @@
+ apiVersion: apps/v1
+ kind: DaemonSet
+ metadata:
+   name: fusermount-server
+   labels:
+     app: fusermount-server
+     role: agent
+     parent: skypilot
+ spec:
+   selector:
+     matchLabels:
+       app: fusermount-server
+   template:
+     metadata:
+       labels:
+         app: fusermount-server
+     spec:
+       # Add tolerations to run on all nodes
+       tolerations:
+         - operator: Exists
+           effect: NoSchedule
+         - operator: Exists
+           effect: NoExecute
+       containers:
+         - name: server
+           # TODO(aylei): version strategy of our addon images
+           image: berkeleyskypilot/fusermount-server:latest
+           securityContext:
+             privileged: true
+           volumeMounts:
+             - name: shared-dir
+               mountPath: /var/run/fusermount
+           env:
+             - name: FUSERMOUNT_SHARED_DIR
+               value: /var/run/fusermount
+           resources:
+             requests:
+               cpu: 50m
+               memory: 50Mi
+           livenessProbe:
+             exec:
+               command:
+                 - /bin/sh
+                 - -c
+                 - "test -S /var/run/fusermount/server.sock"
+             initialDelaySeconds: 10
+             periodSeconds: 5
+             timeoutSeconds: 2
+             failureThreshold: 10
+       volumes:
+         - name: shared-dir
+           hostPath:
+             path: /var/run/fusermount
+             type: DirectoryOrCreate
sky/provision/kubernetes/network_utils.py CHANGED
@@ -66,7 +66,7 @@ def get_networking_mode(
  except ValueError as e:
  with ux_utils.print_exception_no_traceback():
  raise ValueError(str(e) +
- ' Please check: ~/.sky/config.yaml.') from None
+ ' Please check: ~/.sky/skyconfig.yaml.') from None
  return networking_mode
