skypilot-nightly 1.0.0.dev20250919__py3-none-any.whl → 1.0.0.dev20250925__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +2 -2
- sky/backends/backend.py +10 -0
- sky/backends/backend_utils.py +200 -78
- sky/backends/cloud_vm_ray_backend.py +37 -13
- sky/backends/local_docker_backend.py +9 -0
- sky/client/cli/command.py +104 -53
- sky/client/sdk.py +13 -5
- sky/client/sdk_async.py +4 -2
- sky/clouds/kubernetes.py +2 -1
- sky/clouds/runpod.py +20 -7
- sky/core.py +7 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{VvaUqYDvHOcHZRnvMBmax → bn-NHt5qTzeTN2PefXuDA}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-b2a3938c22b6647b.js → webpack-16ba1d7187d2e3b1.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +19 -10
- sky/execution.py +4 -2
- sky/global_user_state.py +224 -38
- sky/jobs/client/sdk.py +10 -1
- sky/jobs/controller.py +7 -7
- sky/jobs/server/core.py +3 -3
- sky/jobs/server/server.py +15 -11
- sky/jobs/utils.py +1 -1
- sky/logs/agent.py +30 -3
- sky/logs/aws.py +9 -19
- sky/provision/__init__.py +2 -1
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/cudo/instance.py +2 -2
- sky/provision/do/instance.py +2 -2
- sky/provision/docker_utils.py +41 -19
- sky/provision/fluidstack/instance.py +2 -2
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +134 -8
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -2
- sky/provision/primeintellect/instance.py +2 -2
- sky/provision/provisioner.py +1 -0
- sky/provision/runpod/instance.py +2 -2
- sky/provision/scp/instance.py +2 -2
- sky/provision/seeweb/instance.py +2 -1
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +6 -5
- sky/schemas/api/responses.py +2 -1
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +45 -19
- sky/serve/replica_managers.py +12 -5
- sky/serve/serve_utils.py +5 -11
- sky/serve/server/core.py +9 -6
- sky/serve/server/impl.py +78 -25
- sky/serve/server/server.py +4 -5
- sky/serve/service_spec.py +33 -0
- sky/server/auth/oauth2_proxy.py +2 -2
- sky/server/constants.py +1 -1
- sky/server/daemons.py +2 -3
- sky/server/requests/executor.py +56 -6
- sky/server/requests/payloads.py +31 -8
- sky/server/requests/preconditions.py +2 -3
- sky/server/rest.py +2 -0
- sky/server/server.py +28 -19
- sky/server/stream_utils.py +34 -12
- sky/setup_files/dependencies.py +12 -2
- sky/setup_files/setup.py +44 -44
- sky/skylet/constants.py +2 -3
- sky/templates/kubernetes-ray.yml.j2 +16 -15
- sky/usage/usage_lib.py +3 -0
- sky/utils/cli_utils/status_utils.py +4 -5
- sky/utils/context.py +104 -29
- sky/utils/controller_utils.py +7 -6
- sky/utils/kubernetes/create_cluster.sh +13 -28
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/kubernetes_deploy_utils.py +170 -37
- sky/utils/kubernetes_enums.py +5 -0
- sky/utils/ux_utils.py +35 -1
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +44 -8
- sky/volumes/server/server.py +33 -7
- sky/volumes/volume.py +22 -14
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/METADATA +38 -33
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/RECORD +109 -109
- sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
- sky/dashboard/out/_next/static/{VvaUqYDvHOcHZRnvMBmax → bn-NHt5qTzeTN2PefXuDA}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250919.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/top_level.txt +0 -0
sky/utils/context.py
CHANGED
|
@@ -2,15 +2,21 @@
|
|
|
2
2
|
|
|
3
3
|
import asyncio
|
|
4
4
|
from collections.abc import Mapping
|
|
5
|
-
from collections.abc import MutableMapping
|
|
6
5
|
import contextvars
|
|
6
|
+
import copy
|
|
7
7
|
import functools
|
|
8
|
+
import inspect
|
|
8
9
|
import os
|
|
9
10
|
import pathlib
|
|
10
11
|
import subprocess
|
|
11
12
|
import sys
|
|
12
|
-
import
|
|
13
|
-
|
|
13
|
+
from typing import (Callable, Dict, Iterator, MutableMapping, Optional, TextIO,
|
|
14
|
+
TYPE_CHECKING, TypeVar)
|
|
15
|
+
|
|
16
|
+
from typing_extensions import ParamSpec
|
|
17
|
+
|
|
18
|
+
if TYPE_CHECKING:
|
|
19
|
+
from sky.skypilot_config import ConfigContext
|
|
14
20
|
|
|
15
21
|
|
|
16
22
|
class Context(object):
|
|
@@ -88,7 +94,7 @@ class Context(object):
|
|
|
88
94
|
else:
|
|
89
95
|
self._log_file_handle = open(log_file, 'a', encoding='utf-8')
|
|
90
96
|
self._log_file = log_file
|
|
91
|
-
if
|
|
97
|
+
if original_log_handle is not None:
|
|
92
98
|
original_log_handle.close()
|
|
93
99
|
return original_log_file
|
|
94
100
|
|
|
@@ -102,8 +108,30 @@ class Context(object):
|
|
|
102
108
|
for k, v in envs.items():
|
|
103
109
|
self.env_overrides[k] = v
|
|
104
110
|
|
|
111
|
+
def cleanup(self):
|
|
112
|
+
"""Clean up the context."""
|
|
113
|
+
if self._log_file_handle is not None:
|
|
114
|
+
self._log_file_handle.close()
|
|
115
|
+
self._log_file_handle = None
|
|
116
|
+
|
|
117
|
+
def copy(self) -> 'Context':
|
|
118
|
+
"""Create a copy of the context.
|
|
119
|
+
|
|
120
|
+
Changes to the current context after this call will not affect the copy.
|
|
121
|
+
The new context will get its own handle/fd for the log file.
|
|
122
|
+
The new context will get an independent copy of the env var overrides.
|
|
123
|
+
The new context will get an independent copy of the config context.
|
|
124
|
+
Cancellation of the current context will not be propagated to the copy.
|
|
125
|
+
"""
|
|
126
|
+
new_context = Context()
|
|
127
|
+
new_context.redirect_log(self._log_file)
|
|
128
|
+
new_context.env_overrides = self.env_overrides.copy()
|
|
129
|
+
new_context.config_context = copy.deepcopy(self.config_context)
|
|
130
|
+
return new_context
|
|
105
131
|
|
|
106
|
-
|
|
132
|
+
|
|
133
|
+
_CONTEXT = contextvars.ContextVar[Optional[Context]]('sky_context',
|
|
134
|
+
default=None)
|
|
107
135
|
|
|
108
136
|
|
|
109
137
|
def get() -> Optional[Context]:
|
|
@@ -116,7 +144,7 @@ def get() -> Optional[Context]:
|
|
|
116
144
|
return _CONTEXT.get()
|
|
117
145
|
|
|
118
146
|
|
|
119
|
-
class ContextualEnviron(MutableMapping):
|
|
147
|
+
class ContextualEnviron(MutableMapping[str, str]):
|
|
120
148
|
"""Environment variables wrapper with contextual overrides.
|
|
121
149
|
|
|
122
150
|
An instance of ContextualEnviron will typically be used to replace
|
|
@@ -155,10 +183,10 @@ class ContextualEnviron(MutableMapping):
|
|
|
155
183
|
assert os.environ['FOO'] == 'BAR1'
|
|
156
184
|
"""
|
|
157
185
|
|
|
158
|
-
def __init__(self, environ):
|
|
186
|
+
def __init__(self, environ: 'os._Environ[str]') -> None:
|
|
159
187
|
self._environ = environ
|
|
160
188
|
|
|
161
|
-
def __getitem__(self, key):
|
|
189
|
+
def __getitem__(self, key: str) -> str:
|
|
162
190
|
ctx = get()
|
|
163
191
|
if ctx is not None:
|
|
164
192
|
if key in ctx.env_overrides:
|
|
@@ -170,10 +198,10 @@ class ContextualEnviron(MutableMapping):
|
|
|
170
198
|
return value
|
|
171
199
|
return self._environ[key]
|
|
172
200
|
|
|
173
|
-
def __iter__(self):
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
201
|
+
def __iter__(self) -> Iterator[str]:
|
|
202
|
+
|
|
203
|
+
def iter_from_context(ctx: Context) -> Iterator[str]:
|
|
204
|
+
deleted_keys = set()
|
|
177
205
|
for key, value in ctx.env_overrides.items():
|
|
178
206
|
if value is None:
|
|
179
207
|
deleted_keys.add(key)
|
|
@@ -182,20 +210,24 @@ class ContextualEnviron(MutableMapping):
|
|
|
182
210
|
# Deduplicate the keys
|
|
183
211
|
if key not in ctx.env_overrides and key not in deleted_keys:
|
|
184
212
|
yield key
|
|
213
|
+
|
|
214
|
+
ctx = get()
|
|
215
|
+
if ctx is not None:
|
|
216
|
+
return iter_from_context(ctx)
|
|
185
217
|
else:
|
|
186
218
|
return self._environ.__iter__()
|
|
187
219
|
|
|
188
|
-
def __len__(self):
|
|
220
|
+
def __len__(self) -> int:
|
|
189
221
|
return len(dict(self))
|
|
190
222
|
|
|
191
|
-
def __setitem__(self, key, value):
|
|
223
|
+
def __setitem__(self, key: str, value: str) -> None:
|
|
192
224
|
ctx = get()
|
|
193
225
|
if ctx is not None:
|
|
194
226
|
ctx.env_overrides[key] = value
|
|
195
227
|
else:
|
|
196
228
|
self._environ.__setitem__(key, value)
|
|
197
229
|
|
|
198
|
-
def __delitem__(self, key):
|
|
230
|
+
def __delitem__(self, key: str) -> None:
|
|
199
231
|
ctx = get()
|
|
200
232
|
if ctx is not None:
|
|
201
233
|
if key in ctx.env_overrides:
|
|
@@ -211,10 +243,13 @@ class ContextualEnviron(MutableMapping):
|
|
|
211
243
|
else:
|
|
212
244
|
self._environ.__delitem__(key)
|
|
213
245
|
|
|
214
|
-
def __repr__(self):
|
|
215
|
-
|
|
246
|
+
def __repr__(self) -> str:
|
|
247
|
+
# Adapted from os._Environ.__repr__
|
|
248
|
+
formatted_items = ', '.join(
|
|
249
|
+
f'{key!r}: {value!r}' for key, value in self.items())
|
|
250
|
+
return f'ctx_environ({{{formatted_items}}})'
|
|
216
251
|
|
|
217
|
-
def copy(self):
|
|
252
|
+
def copy(self) -> Dict[str, str]:
|
|
218
253
|
copied = self._environ.copy()
|
|
219
254
|
ctx = get()
|
|
220
255
|
if ctx is not None:
|
|
@@ -225,7 +260,7 @@ class ContextualEnviron(MutableMapping):
|
|
|
225
260
|
copied[key] = ctx.env_overrides[key]
|
|
226
261
|
return copied
|
|
227
262
|
|
|
228
|
-
def setdefault(self, key, default
|
|
263
|
+
def setdefault(self, key: str, default: str) -> str:
|
|
229
264
|
return self._environ.setdefault(key, default)
|
|
230
265
|
|
|
231
266
|
def __ior__(self, other):
|
|
@@ -260,27 +295,67 @@ class Popen(subprocess.Popen):
|
|
|
260
295
|
super().__init__(*args, env=env, **kwargs)
|
|
261
296
|
|
|
262
297
|
|
|
263
|
-
|
|
298
|
+
P = ParamSpec('P')
|
|
299
|
+
T = TypeVar('T')
|
|
264
300
|
|
|
265
301
|
|
|
266
|
-
def contextual(func:
|
|
302
|
+
def contextual(func: Callable[P, T]) -> Callable[P, T]:
|
|
267
303
|
"""Decorator to initialize a context before executing the function.
|
|
268
304
|
|
|
269
|
-
If a context is already initialized, this decorator will
|
|
270
|
-
|
|
305
|
+
If a context is already initialized, this decorator will create a new
|
|
306
|
+
context that inherits the values from the existing context.
|
|
271
307
|
"""
|
|
272
308
|
|
|
273
309
|
@functools.wraps(func)
|
|
274
|
-
def wrapper(*args, **kwargs):
|
|
275
|
-
|
|
276
|
-
|
|
310
|
+
def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
|
|
311
|
+
original_ctx = get()
|
|
312
|
+
initialize(original_ctx)
|
|
313
|
+
ctx = get()
|
|
314
|
+
cleanup_after_await = False
|
|
315
|
+
|
|
316
|
+
def cleanup():
|
|
317
|
+
try:
|
|
318
|
+
if ctx is not None:
|
|
319
|
+
ctx.cleanup()
|
|
320
|
+
finally:
|
|
321
|
+
# Note: _CONTEXT.reset() is not reliable - may fail with
|
|
322
|
+
# ValueError: <Token ... at ...> was created in a different
|
|
323
|
+
# Context
|
|
324
|
+
# We must make sure this happens because otherwise we may try to
|
|
325
|
+
# write to the wrong log.
|
|
326
|
+
_CONTEXT.set(original_ctx)
|
|
327
|
+
|
|
328
|
+
# There are two cases:
|
|
329
|
+
# 1. The function is synchronous (that is, return type is not awaitable)
|
|
330
|
+
# In this case, we use a finally block to cleanup the context.
|
|
331
|
+
# 2. The function is asynchronous (that is, return type is awaitable)
|
|
332
|
+
# In this case, we need to construct an async def wrapper and await
|
|
333
|
+
# the value, then call the cleanup function in the finally block.
|
|
334
|
+
|
|
335
|
+
async def await_with_cleanup(awaitable):
|
|
336
|
+
try:
|
|
337
|
+
return await awaitable
|
|
338
|
+
finally:
|
|
339
|
+
cleanup()
|
|
340
|
+
|
|
341
|
+
try:
|
|
342
|
+
ret = func(*args, **kwargs)
|
|
343
|
+
if inspect.isawaitable(ret):
|
|
344
|
+
cleanup_after_await = True
|
|
345
|
+
return await_with_cleanup(ret)
|
|
346
|
+
else:
|
|
347
|
+
return ret
|
|
348
|
+
finally:
|
|
349
|
+
if not cleanup_after_await:
|
|
350
|
+
cleanup()
|
|
277
351
|
|
|
278
|
-
return
|
|
352
|
+
return wrapper
|
|
279
353
|
|
|
280
354
|
|
|
281
|
-
def initialize():
|
|
355
|
+
def initialize(base_context: Optional[Context] = None) -> None:
|
|
282
356
|
"""Initialize the current SkyPilot context."""
|
|
283
|
-
|
|
357
|
+
new_context = base_context.copy() if base_context is not None else Context()
|
|
358
|
+
_CONTEXT.set(new_context)
|
|
284
359
|
|
|
285
360
|
|
|
286
361
|
class _ContextualStream:
|
sky/utils/controller_utils.py
CHANGED
|
@@ -620,15 +620,16 @@ def get_controller_resources(
|
|
|
620
620
|
controller_resources_to_use: resources.Resources = list(
|
|
621
621
|
controller_resources)[0]
|
|
622
622
|
|
|
623
|
-
|
|
623
|
+
controller_handle = global_user_state.get_handle_from_cluster_name(
|
|
624
624
|
controller.value.cluster_name)
|
|
625
|
-
if
|
|
626
|
-
|
|
627
|
-
if handle is not None:
|
|
625
|
+
if controller_handle is not None:
|
|
626
|
+
if controller_handle is not None:
|
|
628
627
|
# Use the existing resources, but override the autostop config with
|
|
629
628
|
# the one currently specified in the config.
|
|
630
|
-
controller_resources_to_use =
|
|
631
|
-
|
|
629
|
+
controller_resources_to_use = (
|
|
630
|
+
controller_handle.launched_resources.copy(
|
|
631
|
+
autostop=controller_resources_config_copied.get('autostop'))
|
|
632
|
+
)
|
|
632
633
|
|
|
633
634
|
# If the controller and replicas are from the same cloud (and region/zone),
|
|
634
635
|
# it should provide better connectivity. We will let the controller choose
|
|
sky/utils/kubernetes/create_cluster.sh
CHANGED
|
@@ -1,22 +1,19 @@
|
|
|
1
1
|
#!/bin/bash
|
|
2
2
|
# Creates a local Kubernetes cluster using kind with optional GPU support
|
|
3
|
-
# Usage: ./create_cluster.sh [--gpus]
|
|
4
|
-
# Invokes generate_kind_config.py to generate a kind-cluster.yaml with NodePort mappings
|
|
3
|
+
# Usage: ./create_cluster.sh [name] [yaml_path] [--gpus]
|
|
5
4
|
set -e
|
|
6
5
|
|
|
7
6
|
# Images
|
|
8
7
|
IMAGE="us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot:latest"
|
|
9
8
|
IMAGE_GPU="us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot-gpu:latest"
|
|
10
9
|
|
|
11
|
-
#
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
USER_HASH=$1
|
|
10
|
+
# Arguments
|
|
11
|
+
NAME=$1
|
|
12
|
+
YAML_PATH=$2
|
|
16
13
|
|
|
17
14
|
# Check for GPU flag
|
|
18
15
|
ENABLE_GPUS=false
|
|
19
|
-
if [[ "$
|
|
16
|
+
if [[ "$3" == "--gpus" ]]; then
|
|
20
17
|
ENABLE_GPUS=true
|
|
21
18
|
fi
|
|
22
19
|
|
|
@@ -82,28 +79,16 @@ fi
|
|
|
82
79
|
# ====== End of dependency checks =======
|
|
83
80
|
|
|
84
81
|
# Check if the local cluster already exists
|
|
85
|
-
if kind get clusters | grep -q
|
|
86
|
-
echo "Local cluster already exists. Exiting."
|
|
82
|
+
if kind get clusters | grep -q $NAME; then
|
|
83
|
+
echo "Local cluster $NAME already exists. Exiting."
|
|
87
84
|
# Switch context to the local cluster
|
|
88
|
-
kind export kubeconfig --name
|
|
89
|
-
kubectl config use-context kind
|
|
85
|
+
kind export kubeconfig --name $NAME
|
|
86
|
+
kubectl config use-context kind-$NAME
|
|
90
87
|
exit 100
|
|
91
88
|
fi
|
|
92
89
|
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
echo "Generating $YAML_PATH"
|
|
96
|
-
|
|
97
|
-
# Add GPUs flag to the generate_kind_config.py command if GPUs are enabled
|
|
98
|
-
if $ENABLE_GPUS; then
|
|
99
|
-
python -m sky.utils.kubernetes.generate_kind_config --path $YAML_PATH --port-start ${PORT_RANGE_START} --port-end ${PORT_RANGE_END} --gpus
|
|
100
|
-
else
|
|
101
|
-
python -m sky.utils.kubernetes.generate_kind_config --path $YAML_PATH --port-start ${PORT_RANGE_START} --port-end ${PORT_RANGE_END}
|
|
102
|
-
fi
|
|
103
|
-
|
|
104
|
-
kind create cluster --config $YAML_PATH --name skypilot
|
|
105
|
-
|
|
106
|
-
echo "Kind cluster created."
|
|
90
|
+
kind create cluster --config $YAML_PATH --name $NAME
|
|
91
|
+
echo "Kind cluster $NAME created."
|
|
107
92
|
|
|
108
93
|
# Function to wait for GPU operator to be correctly installed
|
|
109
94
|
wait_for_gpu_operator_installation() {
|
|
@@ -157,7 +142,7 @@ if $ENABLE_GPUS; then
|
|
|
157
142
|
echo "Enabling GPU support..."
|
|
158
143
|
# Run patch for missing ldconfig.real
|
|
159
144
|
# https://github.com/NVIDIA/nvidia-docker/issues/614#issuecomment-423991632
|
|
160
|
-
docker exec -ti
|
|
145
|
+
docker exec -ti $NAME-control-plane /bin/bash -c '[ ! -f /sbin/ldconfig.real ] && ln -s /sbin/ldconfig /sbin/ldconfig.real || echo "/sbin/ldconfig.real already exists"'
|
|
161
146
|
|
|
162
147
|
echo "Installing NVIDIA GPU operator..."
|
|
163
148
|
# Install the NVIDIA GPU operator
|
|
@@ -185,4 +170,4 @@ if $ENABLE_GPUS; then
|
|
|
185
170
|
echo "GPU support is enabled. Run 'sky show-gpus --cloud kubernetes' to see the GPUs available on the cluster."
|
|
186
171
|
fi
|
|
187
172
|
fi
|
|
188
|
-
echo "Number of CPUs available on the local cluster: $NUM_CPUS"
|
|
173
|
+
echo "Number of CPUs available on the local cluster $NAME: $NUM_CPUS"
|
|
sky/utils/kubernetes/delete_cluster.sh
CHANGED
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
#!/bin/bash
|
|
2
|
-
# Deletes the local kind cluster
|
|
3
|
-
# Usage: ./delete_cluster.sh
|
|
4
|
-
# Raises error code 100 if the local cluster does not exist
|
|
2
|
+
# Deletes the local kind cluster of [name]
|
|
3
|
+
# Usage: ./delete_cluster.sh [name]
|
|
4
|
+
# Raises error code 100 if the specified local cluster does not exist
|
|
5
5
|
|
|
6
6
|
set -e
|
|
7
|
+
|
|
8
|
+
NAME="${1:-skypilot}"
|
|
9
|
+
|
|
7
10
|
# Check if docker is running
|
|
8
11
|
if ! docker info > /dev/null 2>&1; then
|
|
9
12
|
>&2 echo "Docker is not running. Please start Docker and try again."
|
|
@@ -17,13 +20,13 @@ if ! kind version > /dev/null 2>&1; then
|
|
|
17
20
|
fi
|
|
18
21
|
|
|
19
22
|
# Check if the local cluster exists
|
|
20
|
-
if ! kind get clusters | grep -q
|
|
21
|
-
echo "Local cluster does not exist. Exiting."
|
|
23
|
+
if ! kind get clusters | grep -q $NAME; then
|
|
24
|
+
echo "Local cluster $NAME does not exist. Exiting."
|
|
22
25
|
exit 100
|
|
23
26
|
fi
|
|
24
27
|
|
|
25
|
-
kind delete cluster --name
|
|
26
|
-
echo "Local cluster deleted!"
|
|
28
|
+
kind delete cluster --name $NAME
|
|
29
|
+
echo "Local cluster $NAME deleted!"
|
|
27
30
|
|
|
28
31
|
# Switch to the first available context
|
|
29
32
|
AVAILABLE_CONTEXT=$(kubectl config get-contexts -o name | head -n 1)
|
|
sky/utils/kubernetes/generate_kind_config.py
CHANGED
|
@@ -3,67 +3,8 @@
|
|
|
3
3
|
Maps specified ports from host to cluster container.
|
|
4
4
|
"""
|
|
5
5
|
import argparse
|
|
6
|
-
import textwrap
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
def generate_kind_config(path: str,
|
|
10
|
-
port_start: int = 30000,
|
|
11
|
-
port_end: int = 32768,
|
|
12
|
-
num_nodes: int = 1,
|
|
13
|
-
gpus: bool = False) -> None:
|
|
14
|
-
"""Generate a kind cluster config with ports mapped from host to container
|
|
15
|
-
|
|
16
|
-
Args:
|
|
17
|
-
path: Path to generate the config file at
|
|
18
|
-
port_start: Port range start
|
|
19
|
-
port_end: Port range end
|
|
20
|
-
num_nodes: Number of nodes in the cluster
|
|
21
|
-
gpus: If true, initialize kind cluster with GPU support
|
|
22
|
-
"""
|
|
23
|
-
|
|
24
|
-
preamble = textwrap.dedent(f"""
|
|
25
|
-
apiVersion: kind.x-k8s.io/v1alpha4
|
|
26
|
-
kind: Cluster
|
|
27
|
-
kubeadmConfigPatches:
|
|
28
|
-
- |
|
|
29
|
-
kind: ClusterConfiguration
|
|
30
|
-
apiServer:
|
|
31
|
-
extraArgs:
|
|
32
|
-
"service-node-port-range": {port_start}-{port_end}
|
|
33
|
-
nodes:
|
|
34
|
-
- role: control-plane
|
|
35
|
-
kubeadmConfigPatches:
|
|
36
|
-
- |
|
|
37
|
-
kind: InitConfiguration
|
|
38
|
-
nodeRegistration:
|
|
39
|
-
kubeletExtraArgs:
|
|
40
|
-
node-labels: "ingress-ready=true"
|
|
41
|
-
""")
|
|
42
|
-
if gpus:
|
|
43
|
-
preamble += textwrap.indent(
|
|
44
|
-
textwrap.dedent("""
|
|
45
|
-
extraMounts:
|
|
46
|
-
- hostPath: /dev/null
|
|
47
|
-
containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
|
|
48
|
-
preamble += textwrap.indent(
|
|
49
|
-
textwrap.dedent("""
|
|
50
|
-
extraPortMappings:"""), ' ' * 2)
|
|
51
|
-
suffix = ''
|
|
52
|
-
if num_nodes > 1:
|
|
53
|
-
for _ in range(1, num_nodes):
|
|
54
|
-
suffix += """- role: worker\n"""
|
|
55
|
-
with open(path, 'w', encoding='utf-8') as f:
|
|
56
|
-
f.write(preamble)
|
|
57
|
-
for port in range(port_start, port_end + 1):
|
|
58
|
-
f.write(f"""
|
|
59
|
-
- containerPort: {port}
|
|
60
|
-
hostPort: {port}
|
|
61
|
-
listenAddress: "0.0.0.0"
|
|
62
|
-
protocol: tcp""")
|
|
63
|
-
f.write('\n')
|
|
64
|
-
if suffix:
|
|
65
|
-
f.write(suffix)
|
|
66
6
|
|
|
7
|
+
from sky.utils.kubernetes import kubernetes_deploy_utils
|
|
67
8
|
|
|
68
9
|
if __name__ == '__main__':
|
|
69
10
|
parser = argparse.ArgumentParser(description='Generate a kind cluster '
|
|
@@ -77,10 +18,6 @@ if __name__ == '__main__':
|
|
|
77
18
|
type=int,
|
|
78
19
|
default=30000,
|
|
79
20
|
help='Port range start')
|
|
80
|
-
parser.add_argument('--port-end',
|
|
81
|
-
type=int,
|
|
82
|
-
default=32768,
|
|
83
|
-
help='Port range end')
|
|
84
21
|
parser.add_argument('--num-nodes',
|
|
85
22
|
type=int,
|
|
86
23
|
default=1,
|
|
@@ -90,5 +27,8 @@ if __name__ == '__main__':
|
|
|
90
27
|
action='store_true',
|
|
91
28
|
help='Initialize kind cluster with GPU support')
|
|
92
29
|
args = parser.parse_args()
|
|
93
|
-
|
|
94
|
-
|
|
30
|
+
|
|
31
|
+
with open(args.path, 'w', encoding='utf-8') as f:
|
|
32
|
+
f.write(
|
|
33
|
+
kubernetes_deploy_utils.generate_kind_config(
|
|
34
|
+
args.port_start, args.num_nodes, args.gpus))
|