skypilot-nightly 1.0.0.dev20250521__py3-none-any.whl → 1.0.0.dev20250523__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (90)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +46 -16
  3. sky/backends/cloud_vm_ray_backend.py +16 -4
  4. sky/check.py +109 -44
  5. sky/cli.py +261 -90
  6. sky/client/cli.py +261 -90
  7. sky/client/sdk.py +122 -3
  8. sky/clouds/__init__.py +5 -0
  9. sky/clouds/aws.py +4 -2
  10. sky/clouds/azure.py +4 -2
  11. sky/clouds/cloud.py +30 -6
  12. sky/clouds/cudo.py +2 -1
  13. sky/clouds/do.py +2 -1
  14. sky/clouds/fluidstack.py +2 -1
  15. sky/clouds/gcp.py +160 -23
  16. sky/clouds/ibm.py +4 -2
  17. sky/clouds/kubernetes.py +66 -22
  18. sky/clouds/lambda_cloud.py +2 -1
  19. sky/clouds/nebius.py +18 -2
  20. sky/clouds/oci.py +4 -2
  21. sky/clouds/paperspace.py +2 -1
  22. sky/clouds/runpod.py +2 -1
  23. sky/clouds/scp.py +2 -1
  24. sky/clouds/service_catalog/__init__.py +3 -0
  25. sky/clouds/service_catalog/common.py +9 -2
  26. sky/clouds/service_catalog/constants.py +2 -1
  27. sky/clouds/service_catalog/ssh_catalog.py +167 -0
  28. sky/clouds/ssh.py +203 -0
  29. sky/clouds/vast.py +2 -1
  30. sky/clouds/vsphere.py +2 -1
  31. sky/core.py +59 -17
  32. sky/dashboard/out/404.html +1 -1
  33. sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_buildManifest.js +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +1 -0
  35. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  36. sky/dashboard/out/clusters/[cluster].html +1 -1
  37. sky/dashboard/out/clusters.html +1 -1
  38. sky/dashboard/out/index.html +1 -1
  39. sky/dashboard/out/infra.html +1 -1
  40. sky/dashboard/out/jobs/[job].html +1 -1
  41. sky/dashboard/out/jobs.html +1 -1
  42. sky/data/storage.py +1 -0
  43. sky/execution.py +56 -7
  44. sky/jobs/server/core.py +4 -2
  45. sky/optimizer.py +29 -15
  46. sky/provision/__init__.py +1 -0
  47. sky/provision/aws/instance.py +17 -1
  48. sky/provision/gcp/constants.py +147 -4
  49. sky/provision/gcp/instance_utils.py +10 -0
  50. sky/provision/gcp/volume_utils.py +247 -0
  51. sky/provision/kubernetes/instance.py +16 -5
  52. sky/provision/kubernetes/utils.py +37 -19
  53. sky/provision/nebius/instance.py +3 -1
  54. sky/provision/nebius/utils.py +14 -2
  55. sky/provision/ssh/__init__.py +18 -0
  56. sky/resources.py +177 -4
  57. sky/serve/server/core.py +2 -4
  58. sky/server/common.py +46 -9
  59. sky/server/constants.py +2 -0
  60. sky/server/html/token_page.html +154 -0
  61. sky/server/requests/executor.py +3 -6
  62. sky/server/requests/payloads.py +7 -0
  63. sky/server/server.py +80 -8
  64. sky/setup_files/dependencies.py +1 -0
  65. sky/skypilot_config.py +117 -31
  66. sky/task.py +24 -1
  67. sky/templates/gcp-ray.yml.j2 +44 -1
  68. sky/templates/nebius-ray.yml.j2 +12 -2
  69. sky/utils/admin_policy_utils.py +26 -22
  70. sky/utils/context.py +36 -6
  71. sky/utils/context_utils.py +15 -0
  72. sky/utils/infra_utils.py +21 -1
  73. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  74. sky/utils/kubernetes/create_cluster.sh +1 -0
  75. sky/utils/kubernetes/deploy_remote_cluster.py +1437 -0
  76. sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
  77. sky/utils/kubernetes/ssh-tunnel.sh +387 -0
  78. sky/utils/log_utils.py +214 -1
  79. sky/utils/resources_utils.py +14 -0
  80. sky/utils/schemas.py +67 -0
  81. sky/utils/ux_utils.py +2 -1
  82. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/METADATA +6 -1
  83. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/RECORD +88 -81
  84. sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
  85. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  86. /sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_ssgManifest.js +0 -0
  87. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/WHEEL +0 -0
  88. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/entry_points.txt +0 -0
  89. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/licenses/LICENSE +0 -0
  90. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/top_level.txt +0 -0
@@ -109,12 +109,27 @@ available_node_types:
109
109
  {%- if tpu_vm %}
110
110
  acceleratorType: {{tpu_type}}
111
111
  runtimeVersion: {{runtime_version}}
112
+ {%- if volumes %}
113
+ dataDisks:
114
+ {%- for volume in volumes %}
115
+ {%- if volume.source %}
116
+ - sourceDisk: {{volume.source}}
117
+ {%- endif %}
118
+ {%- if volume.attach_mode %}
119
+ mode: {{volume.attach_mode}}
120
+ {%- endif %}
121
+ {%- endfor %}
122
+ {%- endif %}
112
123
  metadata:
113
124
  # TPU VM metadata has a different format from that of normal VMs.
114
125
  # After replacing the variables, this will become username:ssh_public_key_content.
115
126
  # This is a specific syntax required by GCP https://cloud.google.com/compute/docs/connect/add-ssh-keys
116
127
  ssh-keys: |-
117
128
  skypilot:ssh_user:skypilot:ssh_public_key_content
129
+ {%- if user_data is not none %}
130
+ startup-script: |-
131
+ {{ user_data | indent(10) }}
132
+ {%- endif %}
118
133
  {%- if use_spot %}
119
134
  schedulingConfig:
120
135
  preemptible: true
@@ -138,6 +153,34 @@ available_node_types:
138
153
  {%- if disk_iops %}
139
154
  provisionedIops: {{disk_iops}}
140
155
  {%- endif %}
156
+ {%- for volume in volumes %}
157
+ - boot: false
158
+ autoDelete: {{volume.auto_delete}}
159
+ type: {{volume.storage_type}}
160
+ deviceName: {{volume.device_name}}
161
+ {%- if volume.source %}
162
+ source: {{volume.source}}
163
+ {%- endif %}
164
+ {%- if volume.attach_mode %}
165
+ mode: {{volume.attach_mode}}
166
+ {%- endif %}
167
+ {%- if volume.interface_type %}
168
+ interface: {{volume.interface_type}}
169
+ {%- endif %}
170
+ {%- if volume.disk_tier %}
171
+ initializeParams:
172
+ diskType: zones/{{zones}}/diskTypes/{{volume.disk_tier}}
173
+ {%- endif %}
174
+ {%- if volume.disk_name %}
175
+ diskName: {{volume.disk_name}}
176
+ {%- endif %}
177
+ {%- if volume.disk_size %}
178
+ diskSizeGb: {{volume.disk_size}}
179
+ {%- endif %}
180
+ {%- if volume.iops %}
181
+ provisionedIops: {{volume.iops}}
182
+ {%- endif %}
183
+ {%- endfor %}
141
184
  {%- if gpu is not none %}
142
185
  guestAccelerators:
143
186
  - acceleratorType: projects/{{gcp_project_id}}/zones/{{zones}}/acceleratorTypes/{{gpu}}
@@ -157,7 +200,7 @@ available_node_types:
157
200
  {%- if user_data is not none %}
158
201
  - key: user-data
159
202
  value: |-
160
- {{ user_data | indent(10) }}
203
+ {{ user_data | indent(14) }}
161
204
  {%- endif %}
162
205
  {%- if use_spot or gpu is not none %}
163
206
  scheduling:
@@ -46,12 +46,17 @@ available_node_types:
46
46
  InstanceType: {{instance_type}}
47
47
  ImageId: {{image_id}}
48
48
  DiskSize: {{disk_size}}
49
+ filesystems:
50
+ {%- for fs in filesystems %}
51
+ - filesystem_id: {{ fs.filesystem_id }}
52
+ filesystem_mount_tag: {{ fs.filesystem_mount_tag }}
53
+ filesystem_attach_mode: {{ fs.filesystem_attach_mode }}
54
+ filesystem_mount_path: {{ fs.filesystem_mount_path }}
55
+ {%- endfor %}
49
56
  UserData: |
50
- {%- if docker_image is not none %}
51
57
  runcmd:
52
58
  - sudo sed -i 's/^#\?AllowTcpForwarding.*/AllowTcpForwarding yes/' /etc/ssh/sshd_config
53
59
  - systemctl restart sshd
54
- {%- endif %}
55
60
 
56
61
  {# Two available OS images:
57
62
  1. ubuntu22.04-driverless - requires Docker installation
@@ -132,6 +137,11 @@ setup_commands:
132
137
  - {%- for initial_setup_command in initial_setup_commands %}
133
138
  {{ initial_setup_command }}
134
139
  {%- endfor %}
140
+ {%- for fs in filesystems %}
141
+ sudo mkdir {{ fs.filesystem_mount_path }};
142
+ sudo mount -t virtiofs {{ fs.filesystem_mount_tag }} {{ fs.filesystem_mount_path }};
143
+ sudo chmod a+w {{ fs.filesystem_mount_path }};
144
+ {%- endfor %}
135
145
  sudo systemctl stop unattended-upgrades || true;
136
146
  sudo systemctl disable unattended-upgrades || true;
137
147
  sudo sed -i 's/Unattended-Upgrade "1"/Unattended-Upgrade "0"/g' /etc/apt/apt.conf.d/20auto-upgrades || true;
@@ -1,9 +1,8 @@
1
1
  """Admin policy utils."""
2
+ import contextlib
2
3
  import copy
3
4
  import importlib
4
- import os
5
- import tempfile
6
- from typing import Optional, Tuple, Union
5
+ from typing import Iterator, Optional, Tuple, Union
7
6
 
8
7
  import colorama
9
8
 
@@ -52,9 +51,31 @@ def _get_policy_cls(
52
51
  return policy_cls
53
52
 
54
53
 
54
+ @contextlib.contextmanager
55
+ def apply_and_use_config_in_current_request(
56
+ entrypoint: Union['dag_lib.Dag', 'task_lib.Task'],
57
+ request_options: Optional[admin_policy.RequestOptions] = None,
58
+ ) -> Iterator['dag_lib.Dag']:
59
+ """Applies an admin policy and overrides the SkyPilot config for the current request.
60
+
61
+ This is a helper function of `apply()` that applies an admin policy and
62
+ overrides the SkyPilot config for the current request as a context manager.
63
+ The original SkyPilot config will be restored when the context manager is
64
+ exited.
65
+
66
+ Refer to `apply()` for more details.
67
+ """
68
+ original_config = skypilot_config.to_dict()
69
+ dag, mutated_config = apply(entrypoint, request_options)
70
+ if mutated_config != original_config:
71
+ with skypilot_config.replace_skypilot_config(mutated_config):
72
+ yield dag
73
+ else:
74
+ yield dag
75
+
76
+
55
77
  def apply(
56
78
  entrypoint: Union['dag_lib.Dag', 'task_lib.Task'],
57
- use_mutated_config_in_current_request: bool = True,
58
79
  request_options: Optional[admin_policy.RequestOptions] = None,
59
80
  ) -> Tuple['dag_lib.Dag', config_utils.Config]:
60
81
  """Applies an admin policy (if registered) to a DAG or a task.
@@ -85,8 +106,7 @@ def apply(
85
106
  return dag, skypilot_config.to_dict()
86
107
 
87
108
  logger.info(f'Applying policy: {policy}')
88
- original_config = skypilot_config.to_dict()
89
- config = copy.deepcopy(original_config)
109
+ config = copy.deepcopy(skypilot_config.to_dict())
90
110
  mutated_dag = dag_lib.Dag()
91
111
  mutated_dag.name = dag.name
92
112
 
@@ -126,22 +146,6 @@ def apply(
126
146
  mutated_dag.graph.add_edge(mutated_dag.tasks[u_idx],
127
147
  mutated_dag.tasks[v_idx])
128
148
 
129
- if (use_mutated_config_in_current_request and
130
- original_config != mutated_config):
131
- with tempfile.NamedTemporaryFile(
132
- delete=False,
133
- mode='w',
134
- prefix='policy-mutated-skypilot-config-',
135
- suffix='.yaml') as temp_file:
136
-
137
- common_utils.dump_yaml(temp_file.name, dict(**mutated_config))
138
- os.environ[skypilot_config.ENV_VAR_SKYPILOT_CONFIG] = temp_file.name
139
- logger.debug(f'Updated SkyPilot config: {temp_file.name}')
140
- # TODO(zhwu): This is not a clean way to update the SkyPilot config,
141
- # because we are resetting the global context for a single DAG,
142
- # which is conceptually weird.
143
- importlib.reload(skypilot_config)
144
-
145
149
  logger.debug(f'Mutated user request: {mutated_user_request}')
146
150
  mutated_dag.policy_applied = True
147
151
  return mutated_dag, mutated_config
sky/utils/context.py CHANGED
@@ -57,6 +57,7 @@ class Context(object):
57
57
  self._log_file = None
58
58
  self._log_file_handle = None
59
59
  self.env_overrides = {}
60
+ self.config_context = None
60
61
 
61
62
  def cancel(self):
62
63
  """Cancel the context."""
@@ -159,17 +160,25 @@ class ContextualEnviron(MutableMapping):
159
160
  ctx = get()
160
161
  if ctx is not None:
161
162
  if key in ctx.env_overrides:
162
- return ctx.env_overrides[key]
163
+ value = ctx.env_overrides[key]
164
+ # None is used to indicate that the key is deleted in the
165
+ # context.
166
+ if value is None:
167
+ raise KeyError(key)
168
+ return value
163
169
  return self._environ[key]
164
170
 
165
171
  def __iter__(self):
166
172
  ctx = get()
173
+ deleted_keys = set()
167
174
  if ctx is not None:
168
- for key in ctx.env_overrides:
175
+ for key, value in ctx.env_overrides.items():
176
+ if value is None:
177
+ deleted_keys.add(key)
169
178
  yield key
170
179
  for key in self._environ:
171
180
  # Deduplicate the keys
172
- if key not in ctx.env_overrides:
181
+ if key not in ctx.env_overrides and key not in deleted_keys:
173
182
  yield key
174
183
  else:
175
184
  return self._environ.__iter__()
@@ -178,10 +187,27 @@ class ContextualEnviron(MutableMapping):
178
187
  return len(dict(self))
179
188
 
180
189
  def __setitem__(self, key, value):
181
- return self._environ.__setitem__(key, value)
190
+ ctx = get()
191
+ if ctx is not None:
192
+ ctx.env_overrides[key] = value
193
+ else:
194
+ self._environ.__setitem__(key, value)
182
195
 
183
196
  def __delitem__(self, key):
184
- return self._environ.__delitem__(key)
197
+ ctx = get()
198
+ if ctx is not None:
199
+ if key in ctx.env_overrides:
200
+ del ctx.env_overrides[key]
201
+ elif key in self._environ:
202
+ # If the key is not set in the context but set in the environ
203
+ # of the process, we mark it as deleted in the context by
204
+ # setting the value to None.
205
+ ctx.env_overrides[key] = None
206
+ else:
207
+ # The key is not set in the context nor the process.
208
+ raise KeyError(key)
209
+ else:
210
+ self._environ.__delitem__(key)
185
211
 
186
212
  def __repr__(self):
187
213
  return self._environ.__repr__()
@@ -190,7 +216,11 @@ class ContextualEnviron(MutableMapping):
190
216
  copied = self._environ.copy()
191
217
  ctx = get()
192
218
  if ctx is not None:
193
- copied.update(ctx.env_overrides)
219
+ for key in ctx.env_overrides:
220
+ if ctx.env_overrides[key] is None:
221
+ copied.pop(key)
222
+ else:
223
+ copied[key] = ctx.env_overrides[key]
194
224
  return copied
195
225
 
196
226
  def setdefault(self, key, default=None):
@@ -1,5 +1,6 @@
1
1
  """Utilities for SkyPilot context."""
2
2
  import asyncio
3
+ import contextvars
3
4
  import functools
4
5
  import io
5
6
  import multiprocessing
@@ -170,3 +171,17 @@ def cancellation_guard(func: F) -> F:
170
171
  return func(*args, **kwargs)
171
172
 
172
173
  return typing.cast(F, wrapper)
174
+
175
+
176
+ # TODO(aylei): replace this with asyncio.to_thread once we drop support for
177
+ # python 3.8
178
+ def to_thread(func, /, *args, **kwargs):
179
+ """Asynchronously run function *func* in a separate thread.
180
+
181
+ This is the same as asyncio.to_thread, which was added in Python 3.9.
182
+ """
183
+ loop = asyncio.get_running_loop()
184
+ # This is critical to pass the current coroutine context to the new thread
185
+ pyctx = contextvars.copy_context()
186
+ func_call = functools.partial(pyctx.run, func, *args, **kwargs)
187
+ return loop.run_in_executor(None, func_call)
sky/utils/infra_utils.py CHANGED
@@ -86,6 +86,16 @@ class InfraInfo:
86
86
  cloud_name = 'kubernetes' # Normalize k8s to kubernetes
87
87
  region = '/'.join(parts[1:]) if len(parts) >= 2 else None
88
88
  zone = None
89
+ elif cloud_name == 'ssh':
90
+ # For SSH, the entire string after "ssh/" is the
91
+ # node pool name. We prepend 'ssh-' for the internal implementation
92
+ # which reuses the context name.
93
+ # TODO(romilb): This is a workaround while we use the global
94
+ # kubeconfig to store the ssh contexts.
95
+ region = '/'.join(parts[1:]) if len(parts) >= 2 else None
96
+ if region:
97
+ region = f'ssh-{region}'
98
+ zone = None
89
99
  else:
90
100
  # For non-Kubernetes clouds, continue with regular parsing
91
101
  # but be careful to only split into max 3 parts
@@ -133,6 +143,12 @@ class InfraInfo:
133
143
  if zone is None:
134
144
  zone = '*'
135
145
 
146
+ # Strip the internal 'ssh-' prefix from the region if present. Note that
+ # this check looks only at the region string, not at the cloud.
147
+ # TODO(romilb): This is a workaround while we use the global
148
+ # kubeconfig to store the ssh contexts.
149
+ if region and region.startswith('ssh-'):
150
+ region = region[4:]
151
+
136
152
  # Build the parts list and filter out trailing wildcards
137
153
  parts = [cloud.lower(), region, zone]
138
154
  while parts and parts[-1] == '*':
@@ -160,7 +176,11 @@ class InfraInfo:
160
176
  if self.zone is not None and self.zone != '*':
161
177
  region_or_zone = self.zone
162
178
  elif self.region is not None and self.region != '*':
163
- region_or_zone = self.region
179
+ # If using region, we remove the ssh- prefix if it exists for SSH
180
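+ # Node Pools.
+ # NOTE(review): str.lstrip('ssh-') strips any leading 's', 'h' or '-'
+ # characters rather than the literal 'ssh-' prefix, so a region such as
+ # 'ssh-shared' becomes 'ared'. Slicing off the prefix is likely intended.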
181
+ # TODO(romilb): This is a workaround while we use the global
182
+ # kubeconfig to store the ssh contexts.
183
+ region_or_zone = self.region.lstrip('ssh-')
164
184
 
165
185
  if region_or_zone is not None and truncate:
166
186
  region_or_zone = common_utils.truncate_long_string(
@@ -0,0 +1,62 @@
1
+ #!/bin/bash
2
+ # cleanup-tunnel.sh - Script to clean up SSH tunnels for a Kubernetes context
3
+
4
+ # Usage: cleanup-tunnel.sh CONTEXT_NAME
5
+
6
+ CONTEXT="${1:-default}"
7
+ TUNNEL_DIR="$HOME/.sky/ssh_node_pools_info"
8
+ PID_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.pid"
9
+ LOG_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.log"
10
+ LOCK_FILE="$TUNNEL_DIR/$CONTEXT-tunnel.lock"
11
+
12
+ # Get the port from kubeconfig if available
13
+ KUBE_PORT=$(kubectl config view --minify --context="$CONTEXT" -o jsonpath='{.clusters[0].cluster.server}' 2>/dev/null | grep -o ":[0-9]\+" | tr -d ":" || echo "")
14
+
15
+ if [[ -z "$KUBE_PORT" ]]; then
16
+ # Default to 6443 if we can't determine the port
17
+ KUBE_PORT=6443
18
+ echo "$(date): Could not determine port from kubeconfig, using default port $KUBE_PORT" >> "$LOG_FILE"
19
+ else
20
+ echo "$(date): Found port $KUBE_PORT in kubeconfig for context $CONTEXT" >> "$LOG_FILE"
21
+ fi
22
+
23
+ # Check if PID file exists
24
+ if [[ -f "$PID_FILE" ]]; then
25
+ OLD_PID=$(cat "$PID_FILE")
26
+
27
+ # Log the cleanup attempt
28
+ echo "$(date): Attempting to clean up tunnel for context $CONTEXT (PID: $OLD_PID, Port: $KUBE_PORT)" >> "$LOG_FILE"
29
+
30
+ # Try to kill the process
31
+ if kill -0 "$OLD_PID" 2>/dev/null; then
32
+ # Process exists, kill it
33
+ kill "$OLD_PID" 2>/dev/null
34
+
35
+ # Wait a moment and check if it's really gone
36
+ sleep 1
37
+ if kill -0 "$OLD_PID" 2>/dev/null; then
38
+ # Still running, force kill
39
+ kill -9 "$OLD_PID" 2>/dev/null
40
+ echo "$(date): Forcefully terminated tunnel process $OLD_PID" >> "$LOG_FILE"
41
+ else
42
+ echo "$(date): Successfully terminated tunnel process $OLD_PID" >> "$LOG_FILE"
43
+ fi
44
+ else
45
+ echo "$(date): No running process found with PID $OLD_PID" >> "$LOG_FILE"
46
+ fi
47
+
48
+ # Remove PID file
49
+ rm -f "$PID_FILE"
50
+ else
51
+ echo "$(date): No PID file found for context $CONTEXT. Nothing to clean up." >> "$LOG_FILE"
52
+ fi
53
+
54
+ # Clean up lock file if it exists
55
+ rm -f "$LOCK_FILE"
56
+
57
+ # Check if port is still in use
58
+ if nc -z localhost "$KUBE_PORT" 2>/dev/null; then
59
+ echo "$(date): Warning: Port $KUBE_PORT is still in use after cleanup. Another process might be using it." >> "$LOG_FILE"
60
+ fi
61
+
62
+ echo "$(date): Cleanup complete for context $CONTEXT" >> "$LOG_FILE"
@@ -85,6 +85,7 @@ fi
85
85
  if kind get clusters | grep -q skypilot; then
86
86
  echo "Local cluster already exists. Exiting."
87
87
  # Switch context to the local cluster
88
+ kind export kubeconfig --name skypilot
88
89
  kubectl config use-context kind-skypilot
89
90
  exit 100
90
91
  fi