skypilot-nightly 1.0.0.dev20250922__py3-none-any.whl → 1.0.0.dev20250925__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/backends/backend.py +10 -0
- sky/backends/backend_utils.py +194 -69
- sky/backends/cloud_vm_ray_backend.py +37 -13
- sky/backends/local_docker_backend.py +9 -0
- sky/client/cli/command.py +104 -53
- sky/client/sdk.py +13 -5
- sky/client/sdk_async.py +4 -2
- sky/clouds/kubernetes.py +2 -1
- sky/clouds/runpod.py +20 -7
- sky/core.py +7 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → bn-NHt5qTzeTN2PefXuDA}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/1121-b911fc0a0b4742f0.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-2b3600ff2854d066.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-d8bc3a2b9cf839a9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-2cb9b15e09cda628.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-9525660179df3605.js → [cluster]-e052384df65ef200.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-26167a9e6d91fa51.js → webpack-16ba1d7187d2e3b1.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/mounting_utils.py +19 -10
- sky/execution.py +4 -2
- sky/global_user_state.py +217 -36
- sky/jobs/client/sdk.py +10 -1
- sky/jobs/controller.py +7 -7
- sky/jobs/server/core.py +3 -3
- sky/jobs/server/server.py +15 -11
- sky/jobs/utils.py +1 -1
- sky/logs/agent.py +30 -3
- sky/logs/aws.py +9 -19
- sky/provision/__init__.py +2 -1
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/cudo/instance.py +2 -2
- sky/provision/do/instance.py +2 -2
- sky/provision/docker_utils.py +41 -19
- sky/provision/fluidstack/instance.py +2 -2
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +134 -8
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -2
- sky/provision/primeintellect/instance.py +2 -2
- sky/provision/provisioner.py +1 -0
- sky/provision/runpod/instance.py +2 -2
- sky/provision/scp/instance.py +2 -2
- sky/provision/seeweb/instance.py +2 -1
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +6 -5
- sky/schemas/api/responses.py +2 -1
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +45 -19
- sky/serve/replica_managers.py +12 -5
- sky/serve/serve_utils.py +5 -7
- sky/serve/server/core.py +9 -6
- sky/serve/server/impl.py +78 -25
- sky/serve/server/server.py +4 -5
- sky/serve/service_spec.py +33 -0
- sky/server/constants.py +1 -1
- sky/server/daemons.py +2 -3
- sky/server/requests/executor.py +56 -6
- sky/server/requests/payloads.py +31 -8
- sky/server/requests/preconditions.py +2 -3
- sky/server/rest.py +2 -0
- sky/server/server.py +28 -19
- sky/server/stream_utils.py +34 -12
- sky/setup_files/dependencies.py +4 -1
- sky/setup_files/setup.py +44 -44
- sky/templates/kubernetes-ray.yml.j2 +16 -15
- sky/usage/usage_lib.py +3 -0
- sky/utils/cli_utils/status_utils.py +4 -5
- sky/utils/context.py +104 -29
- sky/utils/controller_utils.py +7 -6
- sky/utils/kubernetes/create_cluster.sh +13 -28
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/kubernetes_deploy_utils.py +170 -37
- sky/utils/kubernetes_enums.py +5 -0
- sky/utils/ux_utils.py +35 -1
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +44 -8
- sky/volumes/server/server.py +33 -7
- sky/volumes/volume.py +22 -14
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/METADATA +40 -35
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/RECORD +107 -107
- sky/dashboard/out/_next/static/chunks/1121-4ff1ec0dbc5792ab.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-9a2538f38c004652.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-a39efbadcd9fde80.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1e9248ddbddcd122.js +0 -16
- /sky/dashboard/out/_next/static/{KP6HCNMqb_bnJB17oplgW → bn-NHt5qTzeTN2PefXuDA}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250922.dist-info → skypilot_nightly-1.0.0.dev20250925.dist-info}/top_level.txt +0 -0
sky/utils/controller_utils.py
CHANGED
@@ -620,15 +620,16 @@ def get_controller_resources(
         controller_resources_to_use: resources.Resources = list(
             controller_resources)[0]

-        handle = global_user_state.get_handle_from_cluster_name(
+        controller_handle = global_user_state.get_handle_from_cluster_name(
             controller.value.cluster_name)
-        if handle is not None:
+        if controller_handle is not None:
             # Use the existing resources, but override the autostop config with
             # the one currently specified in the config.
-            controller_resources_to_use = handle.launched_resources.copy(
-                autostop=controller_resources_config_copied.get('autostop'))
+            controller_resources_to_use = (
+                controller_handle.launched_resources.copy(
+                    autostop=controller_resources_config_copied.get('autostop'))
+            )

     # If the controller and replicas are from the same cloud (and region/zone),
     # it should provide better connectivity. We will let the controller choose
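
The hunk renames `handle` to `controller_handle` and reuses an existing controller's launched resources while refreshing only the autostop setting from the current config. A minimal sketch of the pattern, assuming a `Resources.copy` that accepts per-field overrides (as the hunk itself uses); the function name is illustrative:

    # Illustrative only: reuse previously launched resources, overriding
    # just the autostop field with the value currently in the config.
    def refresh_autostop(launched_resources, resources_config: dict):
        return launched_resources.copy(
            autostop=resources_config.get('autostop'))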
sky/utils/kubernetes/create_cluster.sh
CHANGED

@@ -1,22 +1,19 @@
 #!/bin/bash
 # Creates a local Kubernetes cluster using kind with optional GPU support
-# Usage: ./create_cluster.sh [--gpus]
-# Invokes generate_kind_config.py to generate a kind-cluster.yaml with NodePort mappings
+# Usage: ./create_cluster.sh [name] [yaml_path] [--gpus]
 set -e

 # Images
 IMAGE="us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot:latest"
 IMAGE_GPU="us-central1-docker.pkg.dev/skypilot-375900/skypilotk8s/skypilot-gpu:latest"

-#
-
-
-
-USER_HASH=$1
+# Arguments
+NAME=$1
+YAML_PATH=$2

 # Check for GPU flag
 ENABLE_GPUS=false
-if [[ "$2" == "--gpus" ]]; then
+if [[ "$3" == "--gpus" ]]; then
   ENABLE_GPUS=true
 fi

@@ -82,28 +79,16 @@ fi
 # ====== End of dependency checks =======

 # Check if the local cluster already exists
-if kind get clusters | grep -q skypilot; then
-  echo "Local cluster already exists. Exiting."
+if kind get clusters | grep -q $NAME; then
+  echo "Local cluster $NAME already exists. Exiting."
   # Switch context to the local cluster
-  kind export kubeconfig --name skypilot
-  kubectl config use-context kind-skypilot
+  kind export kubeconfig --name $NAME
+  kubectl config use-context kind-$NAME
   exit 100
 fi

-echo "Generating $YAML_PATH"
-
-# Add GPUs flag to the generate_kind_config.py command if GPUs are enabled
-if $ENABLE_GPUS; then
-  python -m sky.utils.kubernetes.generate_kind_config --path $YAML_PATH --port-start ${PORT_RANGE_START} --port-end ${PORT_RANGE_END} --gpus
-else
-  python -m sky.utils.kubernetes.generate_kind_config --path $YAML_PATH --port-start ${PORT_RANGE_START} --port-end ${PORT_RANGE_END}
-fi
-
-kind create cluster --config $YAML_PATH --name skypilot
-
-echo "Kind cluster created."
+kind create cluster --config $YAML_PATH --name $NAME
+echo "Kind cluster $NAME created."

@@ -157,7 +142,7 @@ if $ENABLE_GPUS; then
   echo "Enabling GPU support..."
   # Run patch for missing ldconfig.real
   # https://github.com/NVIDIA/nvidia-docker/issues/614#issuecomment-423991632
-  docker exec -ti skypilot-control-plane /bin/bash -c '[ ! -f /sbin/ldconfig.real ] && ln -s /sbin/ldconfig /sbin/ldconfig.real || echo "/sbin/ldconfig.real already exists"'
+  docker exec -ti $NAME-control-plane /bin/bash -c '[ ! -f /sbin/ldconfig.real ] && ln -s /sbin/ldconfig /sbin/ldconfig.real || echo "/sbin/ldconfig.real already exists"'

   echo "Installing NVIDIA GPU operator..."
   # Install the NVIDIA GPU operator

@@ -185,4 +170,4 @@ if $ENABLE_GPUS; then
   echo "GPU support is enabled. Run 'sky show-gpus --cloud kubernetes' to see the GPUs available on the cluster."
   fi
 fi
-echo "Number of CPUs available on the local cluster: $NUM_CPUS"
+echo "Number of CPUs available on the local cluster $NAME: $NUM_CPUS"
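
The script now takes the cluster name and a pre-generated kind config path as positional arguments, and no longer invokes generate_kind_config.py itself. A sketch of how a caller might invoke it, mirroring the `{up_script_path} {name} {f.name}` call in kubernetes_deploy_utils further below; the name and path here are hypothetical:

    import subprocess

    # Positional args per the new usage line: [name] [yaml_path] [--gpus].
    subprocess.run(
        ['./create_cluster.sh', 'mykind', '/tmp/kind-cluster.yaml', '--gpus'],
        check=True)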
sky/utils/kubernetes/delete_cluster.sh
CHANGED

@@ -1,9 +1,12 @@
 #!/bin/bash
-# Deletes the local kind cluster
-# Usage: ./delete_cluster.sh
-# Raises error code 100 if the local cluster does not exist
+# Deletes the local kind cluster of [name]
+# Usage: ./delete_cluster.sh [name]
+# Raises error code 100 if the specified local cluster does not exist

 set -e
+
+NAME="${1:-skypilot}"
+
 # Check if docker is running
 if ! docker info > /dev/null 2>&1; then
   >&2 echo "Docker is not running. Please start Docker and try again."

@@ -17,13 +20,13 @@ if ! kind version > /dev/null 2>&1; then
 fi

 # Check if the local cluster exists
-if ! kind get clusters | grep -q skypilot; then
-  echo "Local cluster does not exist. Exiting."
+if ! kind get clusters | grep -q $NAME; then
+  echo "Local cluster $NAME does not exist. Exiting."
   exit 100
 fi

-kind delete cluster --name skypilot
-echo "Local cluster deleted!"
+kind delete cluster --name $NAME
+echo "Local cluster $NAME deleted!"

 # Switch to the first available context
 AVAILABLE_CONTEXT=$(kubectl config get-contexts -o name | head -n 1)
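
A matching sketch for teardown: the name defaults to 'skypilot' via `NAME="${1:-skypilot}"`, and exit code 100 signals that the named cluster does not exist (the name below is hypothetical):

    import subprocess

    result = subprocess.run(['./delete_cluster.sh', 'mykind'], check=False)
    if result.returncode == 100:
        print('Cluster mykind does not exist.')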
sky/utils/kubernetes/generate_kind_config.py
CHANGED

@@ -3,67 +3,8 @@
 Maps specified ports from host to cluster container.
 """
 import argparse
-import textwrap
-
-
-def generate_kind_config(path: str,
-                         port_start: int = 30000,
-                         port_end: int = 32768,
-                         num_nodes: int = 1,
-                         gpus: bool = False) -> None:
-    """Generate a kind cluster config with ports mapped from host to container
-
-    Args:
-        path: Path to generate the config file at
-        port_start: Port range start
-        port_end: Port range end
-        num_nodes: Number of nodes in the cluster
-        gpus: If true, initialize kind cluster with GPU support
-    """
-
-    preamble = textwrap.dedent(f"""
-    apiVersion: kind.x-k8s.io/v1alpha4
-    kind: Cluster
-    kubeadmConfigPatches:
-    - |
-      kind: ClusterConfiguration
-      apiServer:
-        extraArgs:
-          "service-node-port-range": {port_start}-{port_end}
-    nodes:
-    - role: control-plane
-      kubeadmConfigPatches:
-      - |
-        kind: InitConfiguration
-        nodeRegistration:
-          kubeletExtraArgs:
-            node-labels: "ingress-ready=true"
-    """)
-    if gpus:
-        preamble += textwrap.indent(
-            textwrap.dedent("""
-            extraMounts:
-              - hostPath: /dev/null
-                containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
-    preamble += textwrap.indent(
-        textwrap.dedent("""
-        extraPortMappings:"""), ' ' * 2)
-    suffix = ''
-    if num_nodes > 1:
-        for _ in range(1, num_nodes):
-            suffix += """- role: worker\n"""
-    with open(path, 'w', encoding='utf-8') as f:
-        f.write(preamble)
-        for port in range(port_start, port_end + 1):
-            f.write(f"""
-            - containerPort: {port}
-              hostPort: {port}
-              listenAddress: "0.0.0.0"
-              protocol: tcp""")
-        f.write('\n')
-        if suffix:
-            f.write(suffix)

+from sky.utils.kubernetes import kubernetes_deploy_utils

 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description='Generate a kind cluster '

@@ -77,10 +18,6 @@ if __name__ == '__main__':
                         type=int,
                         default=30000,
                         help='Port range start')
-    parser.add_argument('--port-end',
-                        type=int,
-                        default=32768,
-                        help='Port range end')
     parser.add_argument('--num-nodes',
                         type=int,
                         default=1,

@@ -90,5 +27,8 @@ if __name__ == '__main__':
                         action='store_true',
                         help='Initialize kind cluster with GPU support')
     args = parser.parse_args()
-
-
+
+    with open(args.path, 'w', encoding='utf-8') as f:
+        f.write(
+            kubernetes_deploy_utils.generate_kind_config(
+                args.port_start, args.num_nodes, args.gpus))
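
With the generation logic moved into kubernetes_deploy_utils, the same config can be produced programmatically. A small sketch against the helper's signature as shown in the diff (the output path is hypothetical):

    from sky.utils.kubernetes import kubernetes_deploy_utils

    # Host ports start at port_start; internally they map to the
    # 30000-30100 NodePort range.
    config_yaml = kubernetes_deploy_utils.generate_kind_config(
        port_start=30000, num_nodes=2, gpus=False)
    with open('/tmp/kind-cluster.yaml', 'w', encoding='utf-8') as f:
        f.write(config_yaml)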
sky/utils/kubernetes/kubernetes_deploy_utils.py
CHANGED

@@ -4,6 +4,7 @@ import shlex
 import subprocess
 import sys
 import tempfile
+import textwrap
 from typing import List, Optional

 import colorama

@@ -24,6 +25,9 @@ logger = sky_logging.init_logger(__name__)

 # Default path for Kubernetes configuration file
 DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
+DEFAULT_LOCAL_CLUSTER_NAME = 'skypilot'
+LOCAL_CLUSTER_PORT_RANGE = 101
+LOCAL_CLUSTER_INTERNAL_PORT_START = 30000


 def check_ssh_cluster_dependencies(

@@ -252,7 +256,68 @@ def deploy_remote_cluster(ip_list: List[str],
                                            is_local=True))


-def deploy_local_cluster(gpus: bool):
+def generate_kind_config(port_start: int,
+                         num_nodes: int = 1,
+                         gpus: bool = False) -> str:
+    """Generate a kind cluster config with ports mapped from host to container
+
+    Port range will be [port_start, port_start + LOCAL_CLUSTER_PORT_RANGE)
+    Internally, this will map to ports 30000 - 30100
+
+    Args:
+        path: Path to generate the config file at
+        port_start: Port range start for mappings
+        num_nodes: Number of nodes in the cluster
+        gpus: If true, initialize kind cluster with GPU support
+
+    Returns:
+        The kind cluster config
+    """
+    internal_start = LOCAL_CLUSTER_INTERNAL_PORT_START
+    internal_end = internal_start + LOCAL_CLUSTER_PORT_RANGE - 1
+
+    config = textwrap.dedent(f"""
+    apiVersion: kind.x-k8s.io/v1alpha4
+    kind: Cluster
+    kubeadmConfigPatches:
+    - |
+      kind: ClusterConfiguration
+      apiServer:
+        extraArgs:
+          "service-node-port-range": {internal_start}-{internal_end}
+    nodes:
+    - role: control-plane
+      kubeadmConfigPatches:
+      - |
+        kind: InitConfiguration
+        nodeRegistration:
+          kubeletExtraArgs:
+            node-labels: "ingress-ready=true"
+    """)
+    if gpus:
+        config += textwrap.indent(
+            textwrap.dedent("""
+            extraMounts:
+              - hostPath: /dev/null
+                containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
+    config += textwrap.indent(textwrap.dedent("""
+        extraPortMappings:"""), ' ' * 2)
+    for offset in range(LOCAL_CLUSTER_PORT_RANGE):
+        config += textwrap.indent(
+            textwrap.dedent(f"""
+            - containerPort: {internal_start + offset}
+              hostPort: {port_start + offset}
+              listenAddress: "0.0.0.0"
+              protocol: tcp
+            """), ' ' * 2)
+    if num_nodes > 1:
+        config += '- role: worker\n' * (num_nodes - 1)
+    return config
+
+
+def deploy_local_cluster(name: Optional[str], gpus: bool):
+    name = name or DEFAULT_LOCAL_CLUSTER_NAME
+    context_name = f'kind-{name}'
     cluster_created = False

     # Check if GPUs are available on the host

@@ -262,41 +327,57 @@ def deploy_local_cluster(gpus: bool):
     # Check if ~/.kube/config exists:
     if os.path.exists(os.path.expanduser('~/.kube/config')):
         curr_context = kubernetes_utils.get_current_kube_config_context_name()
-        if curr_context is not None and curr_context != skypilot_context:
+        if curr_context is not None and curr_context != context_name:
             logger.info(
                 f'Current context in kube config: {curr_context}'
-                '\nWill automatically switch to
-                'cluster is created.')
-    message_str = 'Creating local cluster{}...'
-    message_str = message_str.format(
+                f'\nWill automatically switch to {context_name} after the '
+                'local cluster is created.')
+    message_str = 'Creating local cluster {}{}...'
+    message_str = message_str.format(
+        name,
+        ' with GPU support (this may take up to 15 minutes)' if gpus else '')
+
+    with tempfile.NamedTemporaryFile(mode='w+', suffix='.yaml',
+                                     delete=True) as f:
+        # Choose random port range to use on the host machine.
+        # Port range is port_start - port_start + 99 (exactly 100 ports).
+        # port_start = random.randint(300, 399) * 100
+        # TODO (kyuds): hard coding to pass smoketests. Need to figure out
+        # how to deal with this later.
+        port_start = LOCAL_CLUSTER_INTERNAL_PORT_START
+        port_end = port_start + LOCAL_CLUSTER_PORT_RANGE - 1
+        logger.debug(f'Using port range {port_start}-{port_end}')
+        f.write(generate_kind_config(port_start, gpus=gpus))
+        f.flush()
+
+        path_to_package = os.path.dirname(__file__)
+        up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
+
+        # Get directory of script and run it from there
+        cwd = os.path.dirname(os.path.abspath(up_script_path))
+        run_command = f'{up_script_path} {name} {f.name}'
+        if gpus:
+            run_command += ' --gpus'
+        run_command = shlex.split(run_command)

+        # Setup logging paths
+        run_timestamp = sky_logging.get_run_timestamp()
+        log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
+                                'local_up.log')
+        logger.info(message_str)

+        with rich_utils.safe_status(
+                ux_utils.spinner_message(message_str,
+                                         log_path=log_path,
+                                         is_local=True)):
+            returncode, _, stderr = log_lib.run_with_log(
+                cmd=run_command,
+                log_path=log_path,
+                require_outputs=True,
+                stream_logs=False,
+                line_processor=log_utils.SkyLocalUpLineProcessor(
+                    log_path=log_path, is_local=True),
+                cwd=cwd)

     # Kind always writes to stderr even if it succeeds.
     # If the failure happens after the cluster is created, we need

@@ -309,11 +390,11 @@ def deploy_local_cluster(gpus: bool):
     elif returncode == 100:
         logger.info(
             ux_utils.finishing_message(
-                'Local cluster already exists.\n',
+                f'Local cluster {name} already exists.\n',
                 log_path=log_path,
                 is_local=True,
                 follow_up_message=
-                'If you want to delete it instead, run: sky local down'))
+                'If you want to delete it instead, run: `sky local down --name {name}`'))  # pylint: disable=line-too-long
     else:
         with ux_utils.print_exception_no_traceback():
             log_hint = ux_utils.log_path_hint(log_path, is_local=True)

@@ -339,7 +420,7 @@ def deploy_local_cluster(gpus: bool):
     if gpus:
         # Get GPU model by querying the node labels
         label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
-        gpu_type_cmd = f'kubectl get node skypilot-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"'  # pylint: disable=line-too-long
+        gpu_type_cmd = f'kubectl get node {name}-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"'  # pylint: disable=line-too-long
         try:
             # Run the command and capture the output
             gpu_count_output = subprocess.check_output(gpu_type_cmd,

@@ -375,8 +456,9 @@ def deploy_local_cluster(gpus: bool):
                 'This may cause issues with running tasks.')
     logger.info(
         ux_utils.finishing_message(
-            message=(
+            message=(
+                f'Local Kubernetes cluster {name} created successfully '
+                f'with {num_cpus} CPUs{gpu_message}.'),
             log_path=log_path,
             is_local=True,
             follow_up_message=(

@@ -384,3 +466,54 @@ def deploy_local_cluster(gpus: bool):
                 'Hint: To change the number of CPUs, change your docker '
                 'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.'  # pylint: disable=line-too-long
                 f'{gpu_hint}')))
+
+
+def teardown_local_cluster(name: Optional[str] = None):
+    name = name or DEFAULT_LOCAL_CLUSTER_NAME
+    cluster_removed = False
+
+    path_to_package = os.path.dirname(__file__)
+    down_script_path = os.path.join(path_to_package, 'delete_cluster.sh')
+
+    cwd = os.path.dirname(os.path.abspath(down_script_path))
+    run_command = f'{down_script_path} {name}'
+    run_command = shlex.split(run_command)
+
+    # Setup logging paths
+    run_timestamp = sky_logging.get_run_timestamp()
+    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
+                            'local_down.log')
+
+    with rich_utils.safe_status(
+            ux_utils.spinner_message(f'Removing local cluster {name}',
+                                     log_path=log_path,
+                                     is_local=True)):
+
+        returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
+                                                          log_path=log_path,
+                                                          require_outputs=True,
+                                                          stream_logs=False,
+                                                          cwd=cwd)
+        stderr = stderr.replace('No kind clusters found.\n', '')
+
+    if returncode == 0:
+        cluster_removed = True
+    elif returncode == 100:
+        logger.info(
+            ux_utils.error_message(f'Local cluster {name} does not exist.'))
+    else:
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(f'Failed to down local cluster {name}. '
+                               f'Stdout: {stdout}'
+                               f'\nError: {stderr}')
+    if cluster_removed:
+        # Run sky check
+        with rich_utils.safe_status(
+                ux_utils.spinner_message('Running sky check...')):
+            sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
+                                       clouds=['kubernetes'],
+                                       quiet=True)
+        logger.info(
+            ux_utils.finishing_message(f'Local cluster {name} removed.',
+                                       log_path=log_path,
+                                       is_local=True))
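
Together, deploy_local_cluster and the new teardown_local_cluster give named local kind clusters a full lifecycle. A minimal usage sketch (passing None falls back to DEFAULT_LOCAL_CLUSTER_NAME, i.e. 'skypilot'; the name 'dev' is hypothetical):

    from sky.utils.kubernetes import kubernetes_deploy_utils

    # Create a named local cluster, then tear it down again.
    kubernetes_deploy_utils.deploy_local_cluster(name='dev', gpus=False)
    kubernetes_deploy_utils.teardown_local_cluster(name='dev')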
sky/utils/kubernetes_enums.py
CHANGED

@@ -44,3 +44,8 @@ class KubernetesAutoscalerType(enum.Enum):
     KARPENTER = 'karpenter'
     COREWEAVE = 'coreweave'
     GENERIC = 'generic'
+
+    def emits_autoscale_event(self) -> bool:
+        """Returns whether specific autoscaler emits the event reason
+        TriggeredScaleUp."""
+        return self not in {self.KARPENTER}
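
A quick sketch of the new helper: Karpenter is the one autoscaler in the set that does not emit the TriggeredScaleUp event reason:

    from sky.utils.kubernetes_enums import KubernetesAutoscalerType

    assert not KubernetesAutoscalerType.KARPENTER.emits_autoscale_event()
    assert KubernetesAutoscalerType.GENERIC.emits_autoscale_event()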
sky/utils/ux_utils.py
CHANGED

@@ -1,11 +1,12 @@
 """Utility functions for UX."""
 import contextlib
 import enum
+import fnmatch
 import os
 import sys
 import traceback
 import typing
-from typing import Callable, Optional, Union
+from typing import Callable, Iterable, List, Optional, Union

 import colorama

@@ -288,3 +289,36 @@ def command_hint_messages(hint_type: CommandHintType,
                           f'{BOLD}sky jobs queue{RESET_BOLD}')
     else:
         raise ValueError(f'Invalid hint type: {hint_type}')
+
+
+def is_glob_pattern(pattern: str) -> bool:
+    """Checks if a string contains common glob pattern wildcards."""
+    glob_chars = {'*', '?', '[', ']'}
+    # Also check for '**' as a specific globstar pattern
+    if '**' in pattern:
+        return True
+    for char in pattern:
+        if char in glob_chars:
+            return True
+    return False
+
+
+def get_non_matched_query(query_clusters: Iterable[str],
+                          cluster_names: Iterable[str]) -> List[str]:
+    """Gets the non-matched query clusters."""
+    glob_query_clusters = []
+    non_glob_query_clusters = []
+    for cluster_name in query_clusters:
+        if is_glob_pattern(cluster_name):
+            glob_query_clusters.append(cluster_name)
+        else:
+            non_glob_query_clusters.append(cluster_name)
+    not_found_clusters = [
+        query_cluster for query_cluster in non_glob_query_clusters
+        if query_cluster not in cluster_names
+    ]
+    not_found_clusters.extend([
+        query_cluster for query_cluster in glob_query_clusters
+        if not fnmatch.filter(cluster_names, query_cluster)
+    ])
+    return not_found_clusters
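
A short sketch of the matching behavior (values illustrative): non-glob queries must appear verbatim in cluster_names, while a glob query only needs at least one fnmatch hit:

    from sky.utils import ux_utils

    existing = ['train-1', 'train-2', 'eval']
    queries = ['train-*', 'dev', 'eval']
    # 'train-*' matches via glob and 'eval' matches exactly; only 'dev'
    # goes unmatched.
    print(ux_utils.get_non_matched_query(queries, existing))  # ['dev']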
sky/utils/yaml_utils.py
CHANGED

@@ -44,6 +44,15 @@ def read_yaml(path: Optional[str]) -> Dict[str, Any]:
     return config


+def read_yaml_str(yaml_str: str) -> Dict[str, Any]:
+    stream = io.StringIO(yaml_str)
+    parsed_yaml = safe_load(stream)
+    if not parsed_yaml:
+        # Empty dict
+        return {}
+    return parsed_yaml
+
+
 def read_yaml_all_str(yaml_str: str) -> List[Dict[str, Any]]:
     stream = io.StringIO(yaml_str)
     config = safe_load_all(stream)
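
A usage sketch for the new single-document reader, which mirrors read_yaml_all_str but returns one dict and normalizes empty input to {}:

    from sky.utils import yaml_utils

    print(yaml_utils.read_yaml_str('a: 1\nb: [2, 3]'))  # {'a': 1, 'b': [2, 3]}
    print(yaml_utils.read_yaml_str(''))                 # {}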
sky/volumes/client/sdk.py
CHANGED

@@ -3,13 +3,16 @@ import json
 import typing
 from typing import Any, Dict, List

+from sky import exceptions
 from sky import sky_logging
 from sky.adaptors import common as adaptors_common
 from sky.server import common as server_common
+from sky.server import versions
 from sky.server.requests import payloads
 from sky.usage import usage_lib
 from sky.utils import annotations
 from sky.utils import context
+from sky.utils import ux_utils
 from sky.volumes import volume as volume_lib

 if typing.TYPE_CHECKING:

@@ -71,12 +74,44 @@ def apply(volume: volume_lib.Volume) -> server_common.RequestId[None]:
         config=volume.config,
         labels=volume.labels,
     )
-    response =
-
-        cookies=server_common.get_api_cookie_jar())
+    response = server_common.make_authenticated_request(
+        'POST', '/volumes/apply', json=json.loads(body.model_dump_json()))
     return server_common.get_request_id(response)


+@context.contextual
+@usage_lib.entrypoint
+@server_common.check_server_healthy_or_start
+@annotations.client_api
+@versions.minimal_api_version(20)
+def validate(volume: volume_lib.Volume) -> None:
+    """Validates the volume.
+
+    All validation is done on the server side.
+
+    Args:
+        volume: The volume to validate.
+
+    Raises:
+        ValueError: If the volume is invalid.
+    """
+    body = payloads.VolumeValidateBody(
+        name=volume.name,
+        volume_type=volume.type,
+        infra=volume.infra,
+        resource_name=volume.resource_name,
+        size=volume.size,
+        config=volume.config,
+        labels=volume.labels,
+    )
+    response = server_common.make_authenticated_request(
+        'POST', '/volumes/validate', json=json.loads(body.model_dump_json()))
+    if response.status_code == 400:
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.deserialize_exception(
+                response.json().get('detail'))
+
+
 @context.contextual
 @usage_lib.entrypoint
 @server_common.check_server_healthy_or_start

@@ -87,8 +122,10 @@ def ls() -> server_common.RequestId[List[Dict[str, Any]]]:
     Returns:
         The request ID of the list request.
     """
-    response =
-
+    response = server_common.make_authenticated_request(
+        'GET',
+        '/volumes',
+    )
     return server_common.get_request_id(response)


@@ -106,7 +143,6 @@ def delete(names: List[str]) -> server_common.RequestId[None]:
         The request ID of the delete request.
     """
     body = payloads.VolumeDeleteBody(names=names)
-    response =
-
-        cookies=server_common.get_api_cookie_jar())
+    response = server_common.make_authenticated_request(
+        'POST', '/volumes/delete', json=json.loads(body.model_dump_json()))
     return server_common.get_request_id(response)