skypilot-nightly 1.0.0.dev20250411__py3-none-any.whl → 1.0.0.dev20250413__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. sky/__init__.py +2 -2
  2. sky/adaptors/oci.py +2 -2
  3. sky/authentication.py +2 -2
  4. sky/backends/backend_utils.py +1 -1
  5. sky/backends/cloud_vm_ray_backend.py +3 -3
  6. sky/check.py +1 -1
  7. sky/cli.py +51 -47
  8. sky/client/cli.py +51 -47
  9. sky/client/sdk.py +2 -1
  10. sky/clouds/aws.py +2 -2
  11. sky/clouds/cloud.py +3 -2
  12. sky/clouds/kubernetes.py +20 -3
  13. sky/clouds/nebius.py +2 -4
  14. sky/clouds/oci.py +2 -2
  15. sky/clouds/utils/oci_utils.py +1 -1
  16. sky/core.py +12 -17
  17. sky/data/mounting_utils.py +34 -10
  18. sky/exceptions.py +1 -1
  19. sky/execution.py +5 -4
  20. sky/provision/instance_setup.py +3 -1
  21. sky/provision/kubernetes/config.py +41 -36
  22. sky/provision/kubernetes/instance.py +4 -7
  23. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +54 -0
  24. sky/provision/kubernetes/network_utils.py +1 -1
  25. sky/provision/kubernetes/utils.py +51 -35
  26. sky/server/requests/payloads.py +2 -0
  27. sky/setup_files/dependencies.py +1 -1
  28. sky/skylet/constants.py +2 -2
  29. sky/skypilot_config.py +179 -41
  30. sky/templates/kubernetes-ray.yml.j2 +66 -25
  31. sky/templates/websocket_proxy.py +41 -2
  32. sky/utils/config_utils.py +1 -1
  33. sky/utils/controller_utils.py +1 -1
  34. sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
  35. sky/utils/kubernetes/rsync_helper.sh +26 -11
  36. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/METADATA +3 -1
  37. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/RECORD +41 -42
  38. sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml +0 -10
  39. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +0 -68
  40. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/WHEEL +0 -0
  41. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/entry_points.txt +0 -0
  42. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/licenses/LICENSE +0 -0
  43. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/top_level.txt +0 -0
sky/__init__.py CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
5
5
  import urllib.request
6
6
 
7
7
  # Replaced with the current commit when building the wheels.
8
- _SKYPILOT_COMMIT_SHA = 'b4202948016cdf48a5939ea9bf1769a2d31f73bf'
8
+ _SKYPILOT_COMMIT_SHA = '197c8dd3ea85d23323477e7d7cf69e8dc1b693c6'
9
9
 
10
10
 
11
11
  def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
35
35
 
36
36
 
37
37
  __commit__ = _get_git_commit()
38
- __version__ = '1.0.0.dev20250411'
38
+ __version__ = '1.0.0.dev20250413'
39
39
  __root_dir__ = os.path.dirname(os.path.abspath(__file__))
40
40
 
41
41
 
sky/adaptors/oci.py CHANGED
@@ -13,7 +13,7 @@ from sky.clouds.utils import oci_utils
13
13
  # effect.
14
14
  logging.getLogger('oci.circuit_breaker').setLevel(logging.WARNING)
15
15
 
16
- CONFIG_PATH = '~/.oci/config'
16
+ OCI_CONFIG_PATH = '~/.oci/config'
17
17
  ENV_VAR_OCI_CONFIG = 'OCI_CONFIG'
18
18
 
19
19
  oci = common.LazyImport(
@@ -23,7 +23,7 @@ oci = common.LazyImport(
23
23
 
24
24
 
25
25
  def get_config_file() -> str:
26
- conf_file_path = CONFIG_PATH
26
+ conf_file_path = OCI_CONFIG_PATH
27
27
  config_path_via_env_var = os.environ.get(ENV_VAR_OCI_CONFIG)
28
28
  if config_path_via_env_var is not None:
29
29
  conf_file_path = config_path_via_env_var
sky/authentication.py CHANGED
@@ -382,10 +382,10 @@ def setup_kubernetes_authentication(config: Dict[str, Any]) -> Dict[str, Any]:
382
382
  network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
383
383
  network_mode_str)
384
384
  except ValueError as e:
385
- # Add message saying "Please check: ~/.sky/config.yaml" to the error
385
+ # Add message saying "Please check: ~/.sky/skyconfig.yaml" to the error
386
386
  # message.
387
387
  with ux_utils.print_exception_no_traceback():
388
- raise ValueError(str(e) + ' Please check: ~/.sky/config.yaml.') \
388
+ raise ValueError(str(e) + ' Please check: ~/.sky/skyconfig.yaml.') \
389
389
  from None
390
390
  _, public_key_path = get_or_generate_keys()
391
391
 
@@ -682,7 +682,7 @@ def write_cluster_config(
682
682
  ssh_proxy_command = ssh_proxy_command_config[region_name]
683
683
  logger.debug(f'Using ssh_proxy_command: {ssh_proxy_command!r}')
684
684
 
685
- # User-supplied global instance tags from ~/.sky/config.yaml.
685
+ # User-supplied global instance tags from ~/.sky/skyconfig.yaml.
686
686
  labels = skypilot_config.get_nested((str(cloud).lower(), 'labels'), {})
687
687
  # labels is a dict, which is guaranteed by the type check in
688
688
  # schemas.py
@@ -1473,7 +1473,7 @@ class RetryingVmProvisioner(object):
1473
1473
  f'invalid cloud credentials: '
1474
1474
  f'{common_utils.format_exception(e)}')
1475
1475
  except exceptions.InvalidCloudConfigs as e:
1476
- # Failed due to invalid user configs in ~/.sky/config.yaml.
1476
+ # Failed due to invalid user configs in ~/.sky/skyconfig.yaml.
1477
1477
  logger.warning(f'{common_utils.format_exception(e)}')
1478
1478
  # We should block the entire cloud if the user config is
1479
1479
  # invalid.
@@ -2065,10 +2065,10 @@ class RetryingVmProvisioner(object):
2065
2065
  (clouds.Kubernetes, clouds.RunPod)) and
2066
2066
  controller_utils.Controllers.from_name(cluster_name)
2067
2067
  is not None):
2068
- assert (clouds.CloudImplementationFeatures.STOP
2068
+ assert (clouds.CloudImplementationFeatures.AUTOSTOP
2069
2069
  in requested_features), requested_features
2070
2070
  requested_features.remove(
2071
- clouds.CloudImplementationFeatures.STOP)
2071
+ clouds.CloudImplementationFeatures.AUTOSTOP)
2072
2072
 
2073
2073
  # Skip if to_provision.cloud does not support requested features
2074
2074
  to_provision.cloud.check_features_are_supported(
sky/check.py CHANGED
@@ -142,7 +142,7 @@ def check_capabilities(
142
142
  if disallowed_cloud_names:
143
143
  disallowed_clouds_hint = (
144
144
  '\nNote: The following clouds were disabled because they were not '
145
- 'included in allowed_clouds in ~/.sky/config.yaml: '
145
+ 'included in allowed_clouds in ~/.sky/skyconfig.yaml: '
146
146
  f'{", ".join([c for c in disallowed_cloud_names])}')
147
147
  if not all_enabled_clouds:
148
148
  echo(
sky/cli.py CHANGED
@@ -35,7 +35,8 @@ import sys
35
35
  import textwrap
36
36
  import traceback
37
37
  import typing
38
- from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
38
+ from typing import (Any, Callable, Dict, Generator, List, Optional, Set, Tuple,
39
+ Union)
39
40
 
40
41
  import click
41
42
  import colorama
@@ -134,49 +135,51 @@ def _get_cluster_records_and_set_ssh_config(
134
135
  # Update the SSH config for all clusters
135
136
  for record in cluster_records:
136
137
  handle = record['handle']
137
- # During the failover, even though a cluster does not exist, the handle
138
- # can still exist in the record, and we check for credentials to avoid
139
- # updating the SSH config for non-existent clusters.
140
- if (handle is not None and handle.cached_external_ips is not None and
141
- 'credentials' in record):
142
- credentials = record['credentials']
143
- if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
144
- # Replace the proxy command to proxy through the SkyPilot API
145
- # server with websocket.
146
- key_path = (
147
- cluster_utils.SSHConfigHelper.generate_local_key_file(
148
- handle.cluster_name, credentials))
149
- # Instead of directly use websocket_proxy.py, we add an
150
- # additional proxy, so that ssh can use the head pod in the
151
- # cluster to jump to worker pods.
152
- proxy_command = (
153
- f'ssh -tt -i {key_path} '
154
- '-o StrictHostKeyChecking=no '
155
- '-o UserKnownHostsFile=/dev/null '
156
- '-o IdentitiesOnly=yes '
157
- '-W %h:%p '
158
- f'{handle.ssh_user}@127.0.0.1 '
159
- '-o ProxyCommand='
160
- # TODO(zhwu): write the template to a temp file, don't use
161
- # the one in skypilot repo, to avoid changing the file when
162
- # updating skypilot.
163
- f'\'{sys.executable} {sky.__root_dir__}/templates/'
164
- f'websocket_proxy.py '
165
- f'{server_common.get_server_url().split("://")[1]} '
166
- f'{handle.cluster_name}\'')
167
- credentials['ssh_proxy_command'] = proxy_command
168
- cluster_utils.SSHConfigHelper.add_cluster(
169
- handle.cluster_name,
170
- handle.cached_external_ips,
171
- credentials,
172
- handle.cached_external_ssh_ports,
173
- handle.docker_user,
174
- handle.ssh_user,
175
- )
176
- else:
138
+
139
+ if not (handle is not None and handle.cached_external_ips is not None
140
+ and 'credentials' in record):
177
141
  # If the cluster is not UP or does not have credentials available,
178
142
  # we need to remove the cluster from the SSH config.
179
143
  cluster_utils.SSHConfigHelper.remove_cluster(record['name'])
144
+ continue
145
+
146
+ # During the failover, even though a cluster does not exist, the handle
147
+ # can still exist in the record, and we check for credentials to avoid
148
+ # updating the SSH config for non-existent clusters.
149
+ credentials = record['credentials']
150
+ if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
151
+ # Replace the proxy command to proxy through the SkyPilot API
152
+ # server with websocket.
153
+ key_path = (cluster_utils.SSHConfigHelper.generate_local_key_file(
154
+ handle.cluster_name, credentials))
155
+ # Instead of directly use websocket_proxy.py, we add an
156
+ # additional proxy, so that ssh can use the head pod in the
157
+ # cluster to jump to worker pods.
158
+ proxy_command = (
159
+ f'ssh -tt -i {key_path} '
160
+ '-o StrictHostKeyChecking=no '
161
+ '-o UserKnownHostsFile=/dev/null '
162
+ '-o IdentitiesOnly=yes '
163
+ '-W %h:%p '
164
+ f'{handle.ssh_user}@127.0.0.1 '
165
+ '-o ProxyCommand='
166
+ # TODO(zhwu): write the template to a temp file, don't use
167
+ # the one in skypilot repo, to avoid changing the file when
168
+ # updating skypilot.
169
+ f'\'{sys.executable} {sky.__root_dir__}/templates/'
170
+ f'websocket_proxy.py '
171
+ f'{server_common.get_server_url()} '
172
+ f'{handle.cluster_name}\'')
173
+ credentials['ssh_proxy_command'] = proxy_command
174
+
175
+ cluster_utils.SSHConfigHelper.add_cluster(
176
+ handle.cluster_name,
177
+ handle.cached_external_ips,
178
+ credentials,
179
+ handle.cached_external_ssh_ports,
180
+ handle.docker_user,
181
+ handle.ssh_user,
182
+ )
180
183
 
181
184
  # Clean up SSH configs for clusters that do not exist.
182
185
  #
@@ -186,14 +189,15 @@ def _get_cluster_records_and_set_ssh_config(
186
189
  # removing clusters, because SkyPilot has no idea whether to remove
187
190
  # ssh config of a cluster from another user.
188
191
  clusters_exists = set(record['name'] for record in cluster_records)
192
+ clusters_to_remove: Set[str] = set()
189
193
  if clusters is not None:
190
- for cluster in clusters:
191
- if cluster not in clusters_exists:
192
- cluster_utils.SSHConfigHelper.remove_cluster(cluster)
194
+ clusters_to_remove = set(clusters) - clusters_exists
193
195
  elif all_users:
194
- for cluster_name in cluster_utils.SSHConfigHelper.list_cluster_names():
195
- if cluster_name not in clusters_exists:
196
- cluster_utils.SSHConfigHelper.remove_cluster(cluster_name)
196
+ clusters_to_remove = set(cluster_utils.SSHConfigHelper.
197
+ list_cluster_names()) - clusters_exists
198
+
199
+ for cluster_name in clusters_to_remove:
200
+ cluster_utils.SSHConfigHelper.remove_cluster(cluster_name)
197
201
 
198
202
  return cluster_records
199
203
 
sky/client/cli.py CHANGED
@@ -35,7 +35,8 @@ import sys
35
35
  import textwrap
36
36
  import traceback
37
37
  import typing
38
- from typing import Any, Callable, Dict, Generator, List, Optional, Tuple, Union
38
+ from typing import (Any, Callable, Dict, Generator, List, Optional, Set, Tuple,
39
+ Union)
39
40
 
40
41
  import click
41
42
  import colorama
@@ -134,49 +135,51 @@ def _get_cluster_records_and_set_ssh_config(
134
135
  # Update the SSH config for all clusters
135
136
  for record in cluster_records:
136
137
  handle = record['handle']
137
- # During the failover, even though a cluster does not exist, the handle
138
- # can still exist in the record, and we check for credentials to avoid
139
- # updating the SSH config for non-existent clusters.
140
- if (handle is not None and handle.cached_external_ips is not None and
141
- 'credentials' in record):
142
- credentials = record['credentials']
143
- if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
144
- # Replace the proxy command to proxy through the SkyPilot API
145
- # server with websocket.
146
- key_path = (
147
- cluster_utils.SSHConfigHelper.generate_local_key_file(
148
- handle.cluster_name, credentials))
149
- # Instead of directly use websocket_proxy.py, we add an
150
- # additional proxy, so that ssh can use the head pod in the
151
- # cluster to jump to worker pods.
152
- proxy_command = (
153
- f'ssh -tt -i {key_path} '
154
- '-o StrictHostKeyChecking=no '
155
- '-o UserKnownHostsFile=/dev/null '
156
- '-o IdentitiesOnly=yes '
157
- '-W %h:%p '
158
- f'{handle.ssh_user}@127.0.0.1 '
159
- '-o ProxyCommand='
160
- # TODO(zhwu): write the template to a temp file, don't use
161
- # the one in skypilot repo, to avoid changing the file when
162
- # updating skypilot.
163
- f'\'{sys.executable} {sky.__root_dir__}/templates/'
164
- f'websocket_proxy.py '
165
- f'{server_common.get_server_url().split("://")[1]} '
166
- f'{handle.cluster_name}\'')
167
- credentials['ssh_proxy_command'] = proxy_command
168
- cluster_utils.SSHConfigHelper.add_cluster(
169
- handle.cluster_name,
170
- handle.cached_external_ips,
171
- credentials,
172
- handle.cached_external_ssh_ports,
173
- handle.docker_user,
174
- handle.ssh_user,
175
- )
176
- else:
138
+
139
+ if not (handle is not None and handle.cached_external_ips is not None
140
+ and 'credentials' in record):
177
141
  # If the cluster is not UP or does not have credentials available,
178
142
  # we need to remove the cluster from the SSH config.
179
143
  cluster_utils.SSHConfigHelper.remove_cluster(record['name'])
144
+ continue
145
+
146
+ # During the failover, even though a cluster does not exist, the handle
147
+ # can still exist in the record, and we check for credentials to avoid
148
+ # updating the SSH config for non-existent clusters.
149
+ credentials = record['credentials']
150
+ if isinstance(handle.launched_resources.cloud, clouds.Kubernetes):
151
+ # Replace the proxy command to proxy through the SkyPilot API
152
+ # server with websocket.
153
+ key_path = (cluster_utils.SSHConfigHelper.generate_local_key_file(
154
+ handle.cluster_name, credentials))
155
+ # Instead of directly use websocket_proxy.py, we add an
156
+ # additional proxy, so that ssh can use the head pod in the
157
+ # cluster to jump to worker pods.
158
+ proxy_command = (
159
+ f'ssh -tt -i {key_path} '
160
+ '-o StrictHostKeyChecking=no '
161
+ '-o UserKnownHostsFile=/dev/null '
162
+ '-o IdentitiesOnly=yes '
163
+ '-W %h:%p '
164
+ f'{handle.ssh_user}@127.0.0.1 '
165
+ '-o ProxyCommand='
166
+ # TODO(zhwu): write the template to a temp file, don't use
167
+ # the one in skypilot repo, to avoid changing the file when
168
+ # updating skypilot.
169
+ f'\'{sys.executable} {sky.__root_dir__}/templates/'
170
+ f'websocket_proxy.py '
171
+ f'{server_common.get_server_url()} '
172
+ f'{handle.cluster_name}\'')
173
+ credentials['ssh_proxy_command'] = proxy_command
174
+
175
+ cluster_utils.SSHConfigHelper.add_cluster(
176
+ handle.cluster_name,
177
+ handle.cached_external_ips,
178
+ credentials,
179
+ handle.cached_external_ssh_ports,
180
+ handle.docker_user,
181
+ handle.ssh_user,
182
+ )
180
183
 
181
184
  # Clean up SSH configs for clusters that do not exist.
182
185
  #
@@ -186,14 +189,15 @@ def _get_cluster_records_and_set_ssh_config(
186
189
  # removing clusters, because SkyPilot has no idea whether to remove
187
190
  # ssh config of a cluster from another user.
188
191
  clusters_exists = set(record['name'] for record in cluster_records)
192
+ clusters_to_remove: Set[str] = set()
189
193
  if clusters is not None:
190
- for cluster in clusters:
191
- if cluster not in clusters_exists:
192
- cluster_utils.SSHConfigHelper.remove_cluster(cluster)
194
+ clusters_to_remove = set(clusters) - clusters_exists
193
195
  elif all_users:
194
- for cluster_name in cluster_utils.SSHConfigHelper.list_cluster_names():
195
- if cluster_name not in clusters_exists:
196
- cluster_utils.SSHConfigHelper.remove_cluster(cluster_name)
196
+ clusters_to_remove = set(cluster_utils.SSHConfigHelper.
197
+ list_cluster_names()) - clusters_exists
198
+
199
+ for cluster_name in clusters_to_remove:
200
+ cluster_utils.SSHConfigHelper.remove_cluster(cluster_name)
197
201
 
198
202
  return cluster_records
199
203
 
sky/client/sdk.py CHANGED
@@ -1812,7 +1812,8 @@ def api_login(endpoint: Optional[str] = None) -> None:
1812
1812
  server_common.check_server_healthy(endpoint)
1813
1813
 
1814
1814
  # Set the endpoint in the config file
1815
- config_path = pathlib.Path(skypilot_config.CONFIG_PATH).expanduser()
1815
+ config_path = pathlib.Path(
1816
+ skypilot_config.get_user_config_path()).expanduser()
1816
1817
  with filelock.FileLock(config_path.with_suffix('.lock')):
1817
1818
  if not skypilot_config.loaded():
1818
1819
  config_path.touch()
sky/clouds/aws.py CHANGED
@@ -472,10 +472,10 @@ class AWS(clouds.Cloud):
472
472
  with ux_utils.print_exception_no_traceback():
473
473
  logger.warning(
474
474
  f'Skip opening ports {resources.ports} for cluster {cluster_name!r}, '
475
- 'as `aws.security_group_name` in `~/.sky/config.yaml` is specified as '
475
+ 'as `aws.security_group_name` in `~/.sky/skyconfig.yaml` is specified as '
476
476
  f' {security_group!r}. Please make sure the specified security group '
477
477
  'has requested ports setup; or, leave out `aws.security_group_name` '
478
- 'in `~/.sky/config.yaml`.')
478
+ 'in `~/.sky/skyconfig.yaml`.')
479
479
 
480
480
  return {
481
481
  'instance_type': r.instance_type,
sky/clouds/cloud.py CHANGED
@@ -37,7 +37,7 @@ class CloudImplementationFeatures(enum.Enum):
37
37
  _cloud_unsupported_features in all clouds to make sure the
38
38
  check_features_are_supported() works as expected.
39
39
  """
40
- STOP = 'stop' # Includes both stop and autostop.
40
+ STOP = 'stop'
41
41
  MULTI_NODE = 'multi-node'
42
42
  CLONE_DISK_FROM_CLUSTER = 'clone_disk_from_cluster'
43
43
  IMAGE_ID = 'image_id'
@@ -47,7 +47,8 @@ class CloudImplementationFeatures(enum.Enum):
47
47
  OPEN_PORTS = 'open_ports'
48
48
  STORAGE_MOUNTING = 'storage_mounting'
49
49
  HOST_CONTROLLERS = 'host_controllers' # Can run jobs/serve controllers
50
- AUTO_TERMINATE = 'auto_terminate' # Pod/VM can stop or down itself
50
+ AUTOSTOP = 'autostop' # Pod/VM can stop itself
51
+ AUTODOWN = 'autodown' # Pod/VM can down itself
51
52
 
52
53
 
53
54
  # Use str, enum.Enum to allow CloudCapability to be used as a string.
sky/clouds/kubernetes.py CHANGED
@@ -35,6 +35,10 @@ CREDENTIAL_PATH = os.environ.get('KUBECONFIG', DEFAULT_KUBECONFIG_PATH)
35
35
  # E.g., FUSE device manager daemonset is run in this namespace.
36
36
  _SKYPILOT_SYSTEM_NAMESPACE = 'skypilot-system'
37
37
 
38
+ # Shared directory to communicate with fusermount-server, refer to
39
+ # addons/fuse-proxy/README.md for more details.
40
+ _FUSERMOUNT_SHARED_DIR = '/var/run/fusermount'
41
+
38
42
 
39
43
  @registry.CLOUD_REGISTRY.register(aliases=['k8s'])
40
44
  class Kubernetes(clouds.Cloud):
@@ -110,9 +114,13 @@ class Kubernetes(clouds.Cloud):
110
114
  # Controllers cannot spin up new pods with exec auth.
111
115
  unsupported_features[
112
116
  clouds.CloudImplementationFeatures.HOST_CONTROLLERS] = message
113
- # Pod does not have permissions to terminate itself with exec auth.
117
+ # Pod does not have permissions to down itself with exec auth.
114
118
  unsupported_features[
115
- clouds.CloudImplementationFeatures.AUTO_TERMINATE] = message
119
+ clouds.CloudImplementationFeatures.AUTODOWN] = message
120
+ unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
121
+ 'Stopping clusters is not supported on Kubernetes.')
122
+ unsupported_features[clouds.CloudImplementationFeatures.AUTOSTOP] = (
123
+ 'Auto-stop is not supported on Kubernetes.')
116
124
  # Allow spot instances if supported by the cluster
117
125
  try:
118
126
  spot_label_key, _ = kubernetes_utils.get_spot_label(context)
@@ -551,8 +559,9 @@ class Kubernetes(clouds.Cloud):
551
559
  'k8s_service_account_name': k8s_service_account_name,
552
560
  'k8s_automount_sa_token': k8s_automount_sa_token,
553
561
  'k8s_fuse_device_required': fuse_device_required,
554
- # Namespace to run the FUSE device manager in
562
+ # Namespace to run the fusermount-server daemonset in
555
563
  'k8s_skypilot_system_namespace': _SKYPILOT_SYSTEM_NAMESPACE,
564
+ 'k8s_fusermount_shared_dir': _FUSERMOUNT_SHARED_DIR,
556
565
  'k8s_spot_label_key': spot_label_key,
557
566
  'k8s_spot_label_value': spot_label_value,
558
567
  'tpu_requested': tpu_requested,
@@ -658,6 +667,14 @@ class Kubernetes(clouds.Cloud):
658
667
  def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
659
668
  """Checks if the user has access credentials to
660
669
  Kubernetes."""
670
+ # Check for port forward dependencies
671
+ reasons = kubernetes_utils.check_port_forward_mode_dependencies(False)
672
+ if reasons is not None:
673
+ formatted = '\n'.join(
674
+ [reasons[0]] +
675
+ [f'{cls._INDENT_PREFIX}' + r for r in reasons[1:]])
676
+ return (False, formatted)
677
+
661
678
  # Test using python API
662
679
  try:
663
680
  existing_allowed_contexts = cls.existing_allowed_contexts()
sky/clouds/nebius.py CHANGED
@@ -53,10 +53,8 @@ class Nebius(clouds.Cloud):
53
53
  """Nebius GPU Cloud"""
54
54
  _REPR = 'Nebius'
55
55
  _CLOUD_UNSUPPORTED_FEATURES = {
56
- clouds.CloudImplementationFeatures.AUTO_TERMINATE:
57
- ('Autodown and Autostop not supported. Can\'t delete disk.'),
58
- # Autostop functionality can be implemented, but currently,
59
- # there is only a single flag for both autostop and autodown.
56
+ clouds.CloudImplementationFeatures.AUTODOWN:
57
+ ('Autodown not supported. Can\'t delete OS disk.'),
60
58
  clouds.CloudImplementationFeatures.SPOT_INSTANCE:
61
59
  ('Spot is not supported, as Nebius API does not implement spot.'),
62
60
  clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
sky/clouds/oci.py CHANGED
@@ -9,8 +9,8 @@ History:
9
9
  file path resolution (by os.path.expanduser) when constructing the file
10
10
  mounts. This bug will cause the created worker nodes to be located in a different
11
11
  compartment and VCN than the head node if the user specifies compartment_id
12
- in the sky config file, because the ~/.sky/config.yaml is not sync-ed to the
13
- remote machine.
12
+ in the sky config file, because the ~/.sky/skyconfig.yaml is not
13
+ sync-ed to the remote machine.
14
14
  The workaround is set the sky config file path using ENV before running
15
15
  the sky launch: export SKYPILOT_CONFIG=/home/ubuntu/.sky/config.yaml
16
16
  - Hysun He (hysun.he@oracle.com) @ Oct 12, 2024:
@@ -147,7 +147,7 @@ class OCIConfig:
147
147
  if config_path_via_env_var is not None:
148
148
  config_path = config_path_via_env_var
149
149
  else:
150
- config_path = skypilot_config.CONFIG_PATH
150
+ config_path = skypilot_config.get_user_config_path()
151
151
  return config_path
152
152
 
153
153
  @classmethod
sky/core.py CHANGED
@@ -629,26 +629,21 @@ def autostop(
629
629
  raise exceptions.NotSupportedError(
630
630
  f'{operation} cluster {cluster_name!r} with backend '
631
631
  f'{backend.__class__.__name__!r} is not supported.')
632
- # Check autostop is implemented for cloud
633
632
  cloud = handle.launched_resources.cloud
634
- if not down and not is_cancel:
635
- try:
636
- cloud.check_features_are_supported(
637
- handle.launched_resources,
638
- {clouds.CloudImplementationFeatures.STOP})
639
- except exceptions.NotSupportedError as e:
640
- raise exceptions.NotSupportedError(
641
- f'{colorama.Fore.YELLOW}Scheduling autostop on cluster '
642
- f'{cluster_name!r}...skipped.{colorama.Style.RESET_ALL}\n'
643
- f' {_stop_not_supported_message(handle.launched_resources)}.'
644
- ) from e
645
-
646
- # Check if autodown is required and supported
633
+ # Check if autostop/autodown is required and supported
647
634
  if not is_cancel:
648
635
  try:
649
- cloud.check_features_are_supported(
650
- handle.launched_resources,
651
- {clouds.CloudImplementationFeatures.AUTO_TERMINATE})
636
+ if down:
637
+ cloud.check_features_are_supported(
638
+ handle.launched_resources,
639
+ {clouds.CloudImplementationFeatures.AUTODOWN})
640
+ else:
641
+ cloud.check_features_are_supported(
642
+ handle.launched_resources,
643
+ {clouds.CloudImplementationFeatures.STOP})
644
+ cloud.check_features_are_supported(
645
+ handle.launched_resources,
646
+ {clouds.CloudImplementationFeatures.AUTOSTOP})
652
647
  except exceptions.NotSupportedError as e:
653
648
  raise exceptions.NotSupportedError(
654
649
  f'{colorama.Fore.YELLOW}{operation} on cluster '
@@ -30,9 +30,17 @@ _BLOBFUSE_CACHE_DIR = ('~/.sky/blobfuse2_cache/'
30
30
  # https://github.com/rclone/rclone/releases
31
31
  RCLONE_VERSION = 'v1.68.2'
32
32
 
33
+ # A wrapper for goofys to choose the logging mechanism based on environment.
34
+ _GOOFYS_WRAPPER = ('$(if [ -S /dev/log ] ; then '
35
+ 'echo "goofys"; '
36
+ 'else '
37
+ 'echo "goofys --log-file $(mktemp -t goofys.XXXX.log)"; '
38
+ 'fi)')
39
+
33
40
 
34
41
  def get_s3_mount_install_cmd() -> str:
35
42
  """Returns a command to install S3 mount utility goofys."""
43
+ # TODO(aylei): maintain our goofys fork under skypilot-org
36
44
  install_cmd = ('ARCH=$(uname -m) && '
37
45
  'if [ "$ARCH" = "aarch64" ] || [ "$ARCH" = "arm64" ]; then '
38
46
  ' echo "goofys is not supported on $ARCH" && '
@@ -40,8 +48,8 @@ def get_s3_mount_install_cmd() -> str:
40
48
  'else '
41
49
  ' ARCH_SUFFIX="amd64"; '
42
50
  'fi && '
43
- 'sudo wget -nc https://github.com/romilbhardwaj/goofys/'
44
- 'releases/download/0.24.0-romilb-upstream/goofys '
51
+ 'sudo wget -nc https://github.com/aylei/goofys/'
52
+ 'releases/download/0.24.0-aylei-upstream/goofys '
45
53
  '-O /usr/local/bin/goofys && '
46
54
  'sudo chmod 755 /usr/local/bin/goofys')
47
55
  return install_cmd
@@ -56,7 +64,7 @@ def get_s3_mount_cmd(bucket_name: str,
56
64
  _bucket_sub_path = ''
57
65
  else:
58
66
  _bucket_sub_path = f':{_bucket_sub_path}'
59
- mount_cmd = ('goofys -o allow_other '
67
+ mount_cmd = (f'{_GOOFYS_WRAPPER} -o allow_other '
60
68
  f'--stat-cache-ttl {_STAT_CACHE_TTL} '
61
69
  f'--type-cache-ttl {_TYPE_CACHE_TTL} '
62
70
  f'{bucket_name}{_bucket_sub_path} {mount_path}')
@@ -73,7 +81,8 @@ def get_nebius_mount_cmd(nebius_profile_name: str,
73
81
  _bucket_sub_path = ''
74
82
  else:
75
83
  _bucket_sub_path = f':{_bucket_sub_path}'
76
- mount_cmd = (f'AWS_PROFILE={nebius_profile_name} goofys -o allow_other '
84
+ mount_cmd = (f'AWS_PROFILE={nebius_profile_name} {_GOOFYS_WRAPPER} '
85
+ '-o allow_other '
77
86
  f'--stat-cache-ttl {_STAT_CACHE_TTL} '
78
87
  f'--type-cache-ttl {_TYPE_CACHE_TTL} '
79
88
  f'--endpoint {endpoint_url} '
@@ -185,14 +194,28 @@ def get_az_mount_cmd(container_name: str,
185
194
  bucket_sub_path_arg = ''
186
195
  else:
187
196
  bucket_sub_path_arg = f'--subdirectory={_bucket_sub_path}/ '
197
+ mount_options = '-o allow_other -o default_permissions'
188
198
  # TODO(zpoint): clear old cache that has been created in the previous boot.
199
+ blobfuse2_cmd = ('blobfuse2 --no-symlinks -o umask=022 '
200
+ f'--tmp-path {cache_path}_$({remote_boot_time_cmd}) '
201
+ f'{bucket_sub_path_arg}'
202
+ f'--container-name {container_name}')
203
+ # 1. Set -o nonempty to bypass empty directory check of blobfuse2 when using
204
+ # fusermount-wrapper, since the mount is delegated to fusermount and
205
+ # blobfuse2 only get the mounted fd.
206
+ # 2. {} is the mount point placeholder that will be replaced with the
207
+ # mounted fd by fusermount-wrapper.
208
+ wrapped = (f'fusermount-wrapper -m {mount_path} {mount_options} '
209
+ f'-- {blobfuse2_cmd} -o nonempty {{}}')
210
+ original = f'{blobfuse2_cmd} {mount_options} {mount_path}'
211
+ # If fusermount-wrapper is available, use it to wrap the blobfuse2 command
212
+ # to avoid requiring root privilege.
213
+ # TODO(aylei): feeling hacky, refactor this.
214
+ get_mount_cmd = ('command -v fusermount-wrapper >/dev/null 2>&1 && '
215
+ f'echo "{wrapped}" || echo "{original}"')
189
216
  mount_cmd = (f'AZURE_STORAGE_ACCOUNT={storage_account_name} '
190
217
  f'{key_env_var} '
191
- f'blobfuse2 {mount_path} --allow-other --no-symlinks '
192
- '-o umask=022 -o default_permissions '
193
- f'--tmp-path {cache_path}_$({remote_boot_time_cmd}) '
194
- f'{bucket_sub_path_arg}'
195
- f'--container-name {container_name}')
218
+ f'$({get_mount_cmd})')
196
219
  return mount_cmd
197
220
 
198
221
 
@@ -209,7 +232,8 @@ def get_r2_mount_cmd(r2_credentials_path: str,
209
232
  else:
210
233
  _bucket_sub_path = f':{_bucket_sub_path}'
211
234
  mount_cmd = (f'AWS_SHARED_CREDENTIALS_FILE={r2_credentials_path} '
212
- f'AWS_PROFILE={r2_profile_name} goofys -o allow_other '
235
+ f'AWS_PROFILE={r2_profile_name} {_GOOFYS_WRAPPER} '
236
+ '-o allow_other '
213
237
  f'--stat-cache-ttl {_STAT_CACHE_TTL} '
214
238
  f'--type-cache-ttl {_TYPE_CACHE_TTL} '
215
239
  f'--endpoint {endpoint_url} '
sky/exceptions.py CHANGED
@@ -265,7 +265,7 @@ class CommandError(SkyPilotExcludeArgsBaseException):
265
265
  # Chunk the command to avoid overflow.
266
266
  command = command[:100] + '...'
267
267
  message = (f'Command {command} failed with return code '
268
- f'{returncode}.\n{error_msg}')
268
+ f'{returncode}.\n{error_msg}\n{detailed_reason}')
269
269
  super().__init__(message)
270
270
 
271
271
 
sky/execution.py CHANGED
@@ -237,11 +237,12 @@ def _execute(
237
237
  if Stage.DOWN in stages:
238
238
  stages.remove(Stage.DOWN)
239
239
  if idle_minutes_to_autostop >= 0:
240
- requested_features.add(
241
- clouds.CloudImplementationFeatures.AUTO_TERMINATE)
242
- if not down:
240
+ if down:
243
241
  requested_features.add(
244
- clouds.CloudImplementationFeatures.STOP)
242
+ clouds.CloudImplementationFeatures.AUTODOWN)
243
+ else:
244
+ requested_features.add(
245
+ clouds.CloudImplementationFeatures.AUTOSTOP)
245
246
  # NOTE: in general we may not have sufficiently specified info
246
247
  # (cloud/resource) to check STOP_SPOT_INSTANCE here. This is checked in
247
248
  # the backend.
@@ -44,7 +44,9 @@ _DUMP_RAY_PORTS = (
44
44
 
45
45
  _RAY_PORT_COMMAND = (
46
46
  f'RAY_PORT=$({constants.SKY_PYTHON_CMD} -c '
47
- '"from sky.skylet import job_lib; print(job_lib.get_ray_port())" '
47
+ '"from sky import sky_logging\n'
48
+ 'with sky_logging.silent(): '
49
+ 'from sky.skylet import job_lib; print(job_lib.get_ray_port())" '
48
50
  '2> /dev/null || echo 6379);'
49
51
  f'{constants.SKY_PYTHON_CMD} -c "from sky.utils import message_utils; '
50
52
  'print(message_utils.encode_payload({\'ray_port\': $RAY_PORT}))"')