skypilot-nightly 1.0.0.dev20250411__py3-none-any.whl → 1.0.0.dev20250413__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/oci.py +2 -2
  3. sky/authentication.py +2 -2
  4. sky/backends/backend_utils.py +1 -1
  5. sky/backends/cloud_vm_ray_backend.py +3 -3
  6. sky/check.py +1 -1
  7. sky/cli.py +51 -47
  8. sky/client/cli.py +51 -47
  9. sky/client/sdk.py +2 -1
  10. sky/clouds/aws.py +2 -2
  11. sky/clouds/cloud.py +3 -2
  12. sky/clouds/kubernetes.py +20 -3
  13. sky/clouds/nebius.py +2 -4
  14. sky/clouds/oci.py +2 -2
  15. sky/clouds/utils/oci_utils.py +1 -1
  16. sky/core.py +12 -17
  17. sky/data/mounting_utils.py +34 -10
  18. sky/exceptions.py +1 -1
  19. sky/execution.py +5 -4
  20. sky/provision/instance_setup.py +3 -1
  21. sky/provision/kubernetes/config.py +41 -36
  22. sky/provision/kubernetes/instance.py +4 -7
  23. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +54 -0
  24. sky/provision/kubernetes/network_utils.py +1 -1
  25. sky/provision/kubernetes/utils.py +51 -35
  26. sky/server/requests/payloads.py +2 -0
  27. sky/setup_files/dependencies.py +1 -1
  28. sky/skylet/constants.py +2 -2
  29. sky/skypilot_config.py +179 -41
  30. sky/templates/kubernetes-ray.yml.j2 +66 -25
  31. sky/templates/websocket_proxy.py +41 -2
  32. sky/utils/config_utils.py +1 -1
  33. sky/utils/controller_utils.py +1 -1
  34. sky/utils/kubernetes/generate_kubeconfig.sh +2 -2
  35. sky/utils/kubernetes/rsync_helper.sh +26 -11
  36. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/METADATA +3 -1
  37. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/RECORD +41 -42
  38. sky/provision/kubernetes/manifests/smarter-device-manager-configmap.yaml +0 -10
  39. sky/provision/kubernetes/manifests/smarter-device-manager-daemonset.yaml +0 -68
  40. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/WHEEL +0 -0
  41. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/entry_points.txt +0 -0
  42. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/licenses/LICENSE +0 -0
  43. {skypilot_nightly-1.0.0.dev20250411.dist-info → skypilot_nightly-1.0.0.dev20250413.dist-info}/top_level.txt +0 -0
@@ -43,7 +43,7 @@ def bootstrap_instances(
43
43
  if (requested_service_account ==
44
44
  kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME):
45
45
  # If the user has requested a different service account (via pod_config
46
- # in ~/.sky/config.yaml), we assume they have already set up the
46
+ # in ~/.sky/skyconfig.yaml), we assume they have already set up the
47
47
  # necessary roles and role bindings.
48
48
  # If not, set up the roles and bindings for skypilot-service-account
49
49
  # here.
@@ -561,69 +561,74 @@ def _configure_skypilot_system_namespace(
561
561
 
562
562
 
563
563
  def _configure_fuse_mounting(provider_config: Dict[str, Any]) -> None:
564
- """Creates sidecars required for FUSE mounting.
564
+ """Creates the privileged daemonset required for FUSE mounting.
565
565
 
566
566
  FUSE mounting in Kubernetes without privileged containers requires us to
567
- run a sidecar container with the necessary capabilities. We run a daemonset
568
- which exposes the host /dev/fuse device as a Kubernetes resource. The
569
- SkyPilot pod requests this resource to mount the FUSE filesystem.
567
+ run a privileged daemonset which accepts fusermount requests via unix
568
+ domain socket and performs the mount/unmount operations on the host /dev/fuse
569
+ device.
570
570
 
571
571
  We create this daemonset in the skypilot_system_namespace, which is
572
- configurable in the provider config. This allows the FUSE mounting sidecar
573
- to be shared across multiple tenants. The default namespace is
572
+ configurable in the provider config. This allows the daemonset to be
573
+ shared across multiple tenants. The default namespace is
574
574
  'skypilot-system' (populated in clouds.Kubernetes).
575
+
576
+ For legacy smarter-device-manager daemonset, we keep it as is since it may
577
+ still be used by other tenants.
575
578
  """
576
579
 
577
- logger.info('_configure_fuse_mounting: Setting up FUSE device manager.')
580
+ logger.info(
581
+ '_configure_fuse_mounting: Setting up fusermount-server daemonset.')
578
582
 
579
- fuse_device_manager_namespace = provider_config['skypilot_system_namespace']
583
+ fuse_proxy_namespace = provider_config['skypilot_system_namespace']
580
584
  context = kubernetes_utils.get_context_from_config(provider_config)
581
585
 
582
- # Read the device manager YAMLs from the manifests directory
586
+ # Read the YAMLs from the manifests directory
583
587
  root_dir = os.path.dirname(os.path.dirname(__file__))
584
588
 
585
- # Load and create the ConfigMap
586
- logger.info('_configure_fuse_mounting: Creating configmap.')
587
- config_map_path = os.path.join(
588
- root_dir, 'kubernetes/manifests/smarter-device-manager-configmap.yaml')
589
- with open(config_map_path, 'r', encoding='utf-8') as file:
590
- config_map = yaml.safe_load(file)
591
- kubernetes_utils.merge_custom_metadata(config_map['metadata'])
592
- try:
593
- kubernetes.core_api(context).create_namespaced_config_map(
594
- fuse_device_manager_namespace, config_map)
595
- except kubernetes.api_exception() as e:
596
- if e.status == 409:
597
- logger.info('_configure_fuse_mounting: ConfigMap already exists '
598
- f'in namespace {fuse_device_manager_namespace!r}')
599
- else:
600
- raise
601
- else:
602
- logger.info('_configure_fuse_mounting: ConfigMap created '
603
- f'in namespace {fuse_device_manager_namespace!r}')
604
-
605
589
  # Load and create the DaemonSet
590
+ # TODO(aylei): support customizing and upgrading the fusermount-server image
606
591
  logger.info('_configure_fuse_mounting: Creating daemonset.')
607
592
  daemonset_path = os.path.join(
608
- root_dir, 'kubernetes/manifests/smarter-device-manager-daemonset.yaml')
593
+ root_dir, 'kubernetes/manifests/fusermount-server-daemonset.yaml')
609
594
  with open(daemonset_path, 'r', encoding='utf-8') as file:
610
595
  daemonset = yaml.safe_load(file)
611
596
  kubernetes_utils.merge_custom_metadata(daemonset['metadata'])
612
597
  try:
613
598
  kubernetes.apps_api(context).create_namespaced_daemon_set(
614
- fuse_device_manager_namespace, daemonset)
599
+ fuse_proxy_namespace, daemonset)
615
600
  except kubernetes.api_exception() as e:
616
601
  if e.status == 409:
617
602
  logger.info('_configure_fuse_mounting: DaemonSet already exists '
618
- f'in namespace {fuse_device_manager_namespace!r}')
603
+ f'in namespace {fuse_proxy_namespace!r}')
604
+ existing_ds = kubernetes.apps_api(
605
+ context).read_namespaced_daemon_set(
606
+ daemonset['metadata']['name'], fuse_proxy_namespace)
607
+ ds_image = daemonset['spec']['template']['spec']['containers'][0][
608
+ 'image']
609
+ if existing_ds.spec.template.spec.containers[0].image != ds_image:
610
+ logger.info(
611
+ '_configure_fuse_mounting: Updating DaemonSet image.')
612
+ kubernetes.apps_api(context).patch_namespaced_daemon_set(
613
+ daemonset['metadata']['name'], fuse_proxy_namespace,
614
+ daemonset)
615
+ elif e.status == 403 or e.status == 401:
616
+ logger.error('SkyPilot does not have permission to create '
617
+ 'fusermount-server DaemonSet in namespace '
618
+ f'{fuse_proxy_namespace!r}, Error: {e.reason}. '
619
+ 'Please check the permissions of the SkyPilot service '
620
+ 'account or contact your cluster admin to create the '
621
+ 'DaemonSet manually. '
622
+ 'Reference: https://docs.skypilot.co/reference/kubernetes/kubernetes-setup.html#kubernetes-setup-fuse') # pylint: disable=line-too-long
623
+ raise
619
624
  else:
620
625
  raise
621
626
  else:
622
627
  logger.info('_configure_fuse_mounting: DaemonSet created '
623
- f'in namespace {fuse_device_manager_namespace!r}')
628
+ f'in namespace {fuse_proxy_namespace!r}')
624
629
 
625
- logger.info('FUSE device manager setup complete '
626
- f'in namespace {fuse_device_manager_namespace!r}')
630
+ logger.info('fusermount-server daemonset setup complete '
631
+ f'in namespace {fuse_proxy_namespace!r}')
627
632
 
628
633
 
629
634
  def _configure_services(namespace: str, context: Optional[str],
@@ -162,12 +162,9 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
162
162
  raise config_lib.KubernetesError(
163
163
  _lack_resource_msg('memory', pod,
164
164
  details=event_message))
165
- if 'Insufficient smarter-devices/fuse' in event_message:
166
- raise config_lib.KubernetesError(
167
- 'Something went wrong with FUSE device daemonset.'
168
- ' Try restarting your FUSE pods by running '
169
- '`kubectl delete pods -n skypilot-system -l name=smarter-device-manager`.' # pylint: disable=line-too-long
170
- f' Full error: {event_message}')
165
+ # TODO(aylei): after switching from smarter-device-manager to
166
+ # fusermount-server, we need a new way to check whether the
167
+ # fusermount-server daemonset is ready.
171
168
  gpu_lf_keys = [
172
169
  key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
173
170
  for key in lf.get_label_keys()
@@ -723,7 +720,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
723
720
  f'{common_utils.format_exception(e)}'
724
721
  'Continuing without using nvidia RuntimeClass.\n'
725
722
  'If you are on a K3s cluster, manually '
726
- 'override runtimeClassName in ~/.sky/config.yaml. '
723
+ 'override runtimeClassName in ~/.sky/skyconfig.yaml. '
727
724
  'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html') # pylint: disable=line-too-long
728
725
 
729
726
  needs_gpus = False
@@ -0,0 +1,54 @@
1
+ apiVersion: apps/v1
2
+ kind: DaemonSet
3
+ metadata:
4
+ name: fusermount-server
5
+ labels:
6
+ app: fusermount-server
7
+ role: agent
8
+ parent: skypilot
9
+ spec:
10
+ selector:
11
+ matchLabels:
12
+ app: fusermount-server
13
+ template:
14
+ metadata:
15
+ labels:
16
+ app: fusermount-server
17
+ spec:
18
+ # Add tolerations to run on all nodes
19
+ tolerations:
20
+ - operator: Exists
21
+ effect: NoSchedule
22
+ - operator: Exists
23
+ effect: NoExecute
24
+ containers:
25
+ - name: server
26
+ # TODO(aylei): version strategy of our addon images
27
+ image: berkeleyskypilot/fusermount-server:latest
28
+ securityContext:
29
+ privileged: true
30
+ volumeMounts:
31
+ - name: shared-dir
32
+ mountPath: /var/run/fusermount
33
+ env:
34
+ - name: FUSERMOUNT_SHARED_DIR
35
+ value: /var/run/fusermount
36
+ resources:
37
+ requests:
38
+ cpu: 50m
39
+ memory: 50Mi
40
+ livenessProbe:
41
+ exec:
42
+ command:
43
+ - /bin/sh
44
+ - -c
45
+ - "test -S /var/run/fusermount/server.sock"
46
+ initialDelaySeconds: 10
47
+ periodSeconds: 5
48
+ timeoutSeconds: 2
49
+ failureThreshold: 10
50
+ volumes:
51
+ - name: shared-dir
52
+ hostPath:
53
+ path: /var/run/fusermount
54
+ type: DirectoryOrCreate
@@ -66,7 +66,7 @@ def get_networking_mode(
66
66
  except ValueError as e:
67
67
  with ux_utils.print_exception_no_traceback():
68
68
  raise ValueError(str(e) +
69
- ' Please check: ~/.sky/config.yaml.') from None
69
+ ' Please check: ~/.sky/skyconfig.yaml.') from None
70
70
  return networking_mode
71
71
 
72
72
 
@@ -1454,14 +1454,14 @@ def is_kubeconfig_exec_auth(
1454
1454
 
1455
1455
 
1456
1456
  Using exec-based authentication is problematic when used in conjunction
1457
- with kubernetes.remote_identity = LOCAL_CREDENTIAL in ~/.sky/config.yaml.
1457
+ with kubernetes.remote_identity = LOCAL_CREDENTIAL in ~/.sky/skyconfig.yaml.
1458
1458
  This is because the exec-based authentication may not have the relevant
1459
1459
  dependencies installed on the remote cluster or may have hardcoded paths
1460
1460
  that are not available on the remote cluster.
1461
1461
 
1462
1462
  Returns:
1463
1463
  bool: True if exec-based authentication is used and LOCAL_CREDENTIAL
1464
- mode is used for remote_identity in ~/.sky/config.yaml.
1464
+ mode is used for remote_identity in ~/.sky/skyconfig.yaml.
1465
1465
  str: Error message if exec-based authentication is used, None otherwise
1466
1466
  """
1467
1467
  k8s = kubernetes.kubernetes
@@ -1514,7 +1514,7 @@ def is_kubeconfig_exec_auth(
1514
1514
  'Managed Jobs or SkyServe controller on Kubernetes. '
1515
1515
  'To fix, configure SkyPilot to create a service account '
1516
1516
  'for running pods by setting the following in '
1517
- '~/.sky/config.yaml:\n'
1517
+ '~/.sky/skyconfig.yaml:\n'
1518
1518
  ' kubernetes:\n'
1519
1519
  ' remote_identity: SERVICE_ACCOUNT\n'
1520
1520
  ' More: https://docs.skypilot.co/en/latest/'
@@ -2148,32 +2148,35 @@ def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
2148
2148
  return content
2149
2149
 
2150
2150
 
2151
- def check_port_forward_mode_dependencies() -> None:
2152
- """Checks if 'socat' and 'nc' are installed"""
2151
+ def check_port_forward_mode_dependencies(
2152
+ raise_error: bool = True) -> Optional[List[str]]:
2153
+ """Checks if 'socat' and 'nc' are installed.
2153
2154
 
2154
- # Construct runtime errors
2155
- socat_default_error = RuntimeError(
2156
- f'`socat` is required to setup Kubernetes cloud with '
2155
+ Args:
2156
+ raise_error: set to true when the dependencies need to be present.
2157
+ set to false for `sky check`, where reason strings are compiled
2158
+ at the end.
2159
+
2160
+ Returns: the reasons list if there are missing dependencies.
2161
+ """
2162
+
2163
+ # errors
2164
+ socat_message = (
2165
+ '`socat` is required to setup Kubernetes cloud with '
2157
2166
  f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` ' # pylint: disable=line-too-long
2158
- 'default networking mode and it is not installed. '
2159
- 'On Debian/Ubuntu, install it with:\n'
2160
- f' $ sudo apt install socat\n'
2161
- f'On MacOS, install it with: \n'
2162
- f' $ brew install socat')
2163
- netcat_default_error = RuntimeError(
2164
- f'`nc` is required to setup Kubernetes cloud with '
2167
+ 'default networking mode and it is not installed. ')
2168
+ netcat_default_message = (
2169
+ '`nc` is required to setup Kubernetes cloud with '
2165
2170
  f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` ' # pylint: disable=line-too-long
2166
- 'default networking mode and it is not installed. '
2167
- 'On Debian/Ubuntu, install it with:\n'
2168
- f' $ sudo apt install netcat\n'
2169
- f'On MacOS, install it with: \n'
2170
- f' $ brew install netcat')
2171
- mac_installed_error = RuntimeError(
2172
- f'The default MacOS `nc` is installed. However, for '
2171
+ 'default networking mode and it is not installed. ')
2172
+ netcat_macos_message = (
2173
+ 'The default MacOS `nc` is installed. However, for '
2173
2174
  f'`{kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value}` ' # pylint: disable=line-too-long
2174
- 'default networking mode, GNU netcat is required. '
2175
- f'On MacOS, install it with: \n'
2176
- f' $ brew install netcat')
2175
+ 'default networking mode, GNU netcat is required. ')
2176
+
2177
+ # save
2178
+ reasons = []
2179
+ required_binaries = []
2177
2180
 
2178
2181
  # Ensure socat is installed
2179
2182
  try:
@@ -2182,8 +2185,8 @@ def check_port_forward_mode_dependencies() -> None:
2182
2185
  stderr=subprocess.DEVNULL,
2183
2186
  check=True)
2184
2187
  except (FileNotFoundError, subprocess.CalledProcessError):
2185
- with ux_utils.print_exception_no_traceback():
2186
- raise socat_default_error from None
2188
+ required_binaries.append('socat')
2189
+ reasons.append(socat_message)
2187
2190
 
2188
2191
  # Ensure netcat is installed
2189
2192
  #
@@ -2198,15 +2201,28 @@ def check_port_forward_mode_dependencies() -> None:
2198
2201
  netcat_output.stderr)
2199
2202
 
2200
2203
  if nc_mac_installed:
2201
- with ux_utils.print_exception_no_traceback():
2202
- raise mac_installed_error from None
2204
+ required_binaries.append('netcat')
2205
+ reasons.append(netcat_macos_message)
2203
2206
  elif netcat_output.returncode != 0:
2204
- with ux_utils.print_exception_no_traceback():
2205
- raise netcat_default_error from None
2207
+ required_binaries.append('netcat')
2208
+ reasons.append(netcat_default_message)
2206
2209
 
2207
2210
  except FileNotFoundError:
2208
- with ux_utils.print_exception_no_traceback():
2209
- raise netcat_default_error from None
2211
+ required_binaries.append('netcat')
2212
+ reasons.append(netcat_default_message)
2213
+
2214
+ if required_binaries:
2215
+ reasons.extend([
2216
+ 'On Debian/Ubuntu, install the missing dependenc(ies) with:',
2217
+ f' $ sudo apt install {" ".join(required_binaries)}',
2218
+ 'On MacOS, install with: ',
2219
+ f' $ brew install {" ".join(required_binaries)}',
2220
+ ])
2221
+ if raise_error:
2222
+ with ux_utils.print_exception_no_traceback():
2223
+ raise RuntimeError('\n'.join(reasons))
2224
+ return reasons
2225
+ return None
2210
2226
 
2211
2227
 
2212
2228
  def get_endpoint_debug_message() -> str:
@@ -2236,7 +2252,7 @@ def combine_pod_config_fields(
2236
2252
  cluster_config_overrides: Dict[str, Any],
2237
2253
  ) -> None:
2238
2254
  """Adds or updates fields in the YAML with fields from the
2239
- ~/.sky/config.yaml's kubernetes.pod_spec dict.
2255
+ ~/.sky/skyconfig.yaml's kubernetes.pod_spec dict.
2240
2256
  This can be used to add fields to the YAML that are not supported by
2241
2257
  SkyPilot yet, or require simple configuration (e.g., adding an
2242
2258
  imagePullSecrets field).
@@ -2296,7 +2312,7 @@ def combine_pod_config_fields(
2296
2312
 
2297
2313
  def combine_metadata_fields(cluster_yaml_path: str) -> None:
2298
2314
  """Updates the metadata for all Kubernetes objects created by SkyPilot with
2299
- fields from the ~/.sky/config.yaml's kubernetes.custom_metadata dict.
2315
+ fields from the ~/.sky/skyconfig.yaml's kubernetes.custom_metadata dict.
2300
2316
 
2301
2317
  Obeys the same add or update semantics as combine_pod_config_fields().
2302
2318
  """
@@ -47,6 +47,8 @@ def request_body_env_vars() -> dict:
47
47
  # Remove the path to config file, as the config content is included in the
48
48
  # request body and will be merged with the config on the server side.
49
49
  env_vars.pop(skypilot_config.ENV_VAR_SKYPILOT_CONFIG, None)
50
+ env_vars.pop(skypilot_config.ENV_VAR_USER_CONFIG, None)
51
+ env_vars.pop(skypilot_config.ENV_VAR_PROJECT_CONFIG, None)
50
52
  return env_vars
51
53
 
52
54
 
@@ -131,7 +131,7 @@ extras_require: Dict[str, List[str]] = {
131
131
  'scp': local_ray,
132
132
  'oci': ['oci'] + local_ray,
133
133
  # Kubernetes 32.0.0 has an authentication bug: https://github.com/kubernetes-client/python/issues/2333 # pylint: disable=line-too-long
134
- 'kubernetes': ['kubernetes>=20.0.0,!=32.0.0'],
134
+ 'kubernetes': ['kubernetes>=20.0.0,!=32.0.0', 'websockets'],
135
135
  'remote': remote,
136
136
  # For the container registry auth api. Reference:
137
137
  # https://github.com/runpod/runpod-python/releases/tag/1.6.1
sky/skylet/constants.py CHANGED
@@ -117,7 +117,7 @@ RUNPOD_DOCKER_USERNAME_ENV_VAR = 'SKYPILOT_RUNPOD_DOCKER_USERNAME'
117
117
 
118
118
  # Commands for disabling GPU ECC, which can improve the performance of the GPU
119
119
  # for some workloads by 30%. This will only be applied when a user specifies
120
- # `nvidia_gpus.disable_ecc: true` in ~/.sky/config.yaml.
120
+ # `nvidia_gpus.disable_ecc: true` in ~/.sky/skyconfig.yaml.
121
121
  # Running this command will reboot the machine, introducing overhead for
122
122
  # provisioning the machine.
123
123
  # https://portal.nutanix.com/page/documents/kbs/details?targetId=kA00e000000LKjOCAW
@@ -337,7 +337,7 @@ RCLONE_LOG_DIR = '~/.sky/rclone_log'
337
337
  RCLONE_CACHE_DIR = '~/.cache/rclone'
338
338
  RCLONE_CACHE_REFRESH_INTERVAL = 10
339
339
 
340
- # The keys that can be overridden in the `~/.sky/config.yaml` file. The
340
+ # The keys that can be overridden in the `~/.sky/skyconfig.yaml` file. The
341
341
  # overrides are specified in task YAMLs.
342
342
  OVERRIDEABLE_CONFIG_KEYS_IN_TASK: List[Tuple[str, ...]] = [
343
343
  ('docker', 'run_options'),