skypilot-nightly 1.0.0.dev20250521__py3-none-any.whl → 1.0.0.dev20250523__py3-none-any.whl
This diff compares the contents of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/kubernetes.py +46 -16
- sky/backends/cloud_vm_ray_backend.py +16 -4
- sky/check.py +109 -44
- sky/cli.py +261 -90
- sky/client/cli.py +261 -90
- sky/client/sdk.py +122 -3
- sky/clouds/__init__.py +5 -0
- sky/clouds/aws.py +4 -2
- sky/clouds/azure.py +4 -2
- sky/clouds/cloud.py +30 -6
- sky/clouds/cudo.py +2 -1
- sky/clouds/do.py +2 -1
- sky/clouds/fluidstack.py +2 -1
- sky/clouds/gcp.py +160 -23
- sky/clouds/ibm.py +4 -2
- sky/clouds/kubernetes.py +66 -22
- sky/clouds/lambda_cloud.py +2 -1
- sky/clouds/nebius.py +18 -2
- sky/clouds/oci.py +4 -2
- sky/clouds/paperspace.py +2 -1
- sky/clouds/runpod.py +2 -1
- sky/clouds/scp.py +2 -1
- sky/clouds/service_catalog/__init__.py +3 -0
- sky/clouds/service_catalog/common.py +9 -2
- sky/clouds/service_catalog/constants.py +2 -1
- sky/clouds/service_catalog/ssh_catalog.py +167 -0
- sky/clouds/ssh.py +203 -0
- sky/clouds/vast.py +2 -1
- sky/clouds/vsphere.py +2 -1
- sky/core.py +59 -17
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_buildManifest.js +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/data/storage.py +1 -0
- sky/execution.py +56 -7
- sky/jobs/server/core.py +4 -2
- sky/optimizer.py +29 -15
- sky/provision/__init__.py +1 -0
- sky/provision/aws/instance.py +17 -1
- sky/provision/gcp/constants.py +147 -4
- sky/provision/gcp/instance_utils.py +10 -0
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/kubernetes/instance.py +16 -5
- sky/provision/kubernetes/utils.py +37 -19
- sky/provision/nebius/instance.py +3 -1
- sky/provision/nebius/utils.py +14 -2
- sky/provision/ssh/__init__.py +18 -0
- sky/resources.py +177 -4
- sky/serve/server/core.py +2 -4
- sky/server/common.py +46 -9
- sky/server/constants.py +2 -0
- sky/server/html/token_page.html +154 -0
- sky/server/requests/executor.py +3 -6
- sky/server/requests/payloads.py +7 -0
- sky/server/server.py +80 -8
- sky/setup_files/dependencies.py +1 -0
- sky/skypilot_config.py +117 -31
- sky/task.py +24 -1
- sky/templates/gcp-ray.yml.j2 +44 -1
- sky/templates/nebius-ray.yml.j2 +12 -2
- sky/utils/admin_policy_utils.py +26 -22
- sky/utils/context.py +36 -6
- sky/utils/context_utils.py +15 -0
- sky/utils/infra_utils.py +21 -1
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/create_cluster.sh +1 -0
- sky/utils/kubernetes/deploy_remote_cluster.py +1437 -0
- sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
- sky/utils/kubernetes/ssh-tunnel.sh +387 -0
- sky/utils/log_utils.py +214 -1
- sky/utils/resources_utils.py +14 -0
- sky/utils/schemas.py +67 -0
- sky/utils/ux_utils.py +2 -1
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/METADATA +6 -1
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/RECORD +88 -81
- sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- /sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/top_level.txt +0 -0
sky/utils/kubernetes/deploy_remote_cluster.sh
DELETED
@@ -1,308 +0,0 @@
-#!/bin/bash
-# Refer to https://docs.skypilot.co/en/latest/reservations/existing-machines.html for details on how to use this script.
-set -e
-
-# Colors for nicer UX
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-NC='\033[0m' # No color
-
-# Variables
-CLEANUP=false
-INSTALL_GPU=false
-POSITIONAL_ARGS=()
-PASSWORD=""
-
-# Process all arguments
-while [[ $# -gt 0 ]]; do
-  case $1 in
-    --cleanup)
-      CLEANUP=true
-      shift
-      ;;
-    --password)
-      PASSWORD=$2
-      shift
-      shift
-      ;;
-    *)
-      POSITIONAL_ARGS+=("$1")
-      shift
-      ;;
-  esac
-done
-
-# Restore positional arguments in correct order
-set -- "${POSITIONAL_ARGS[@]}"
-
-# Assign positional arguments to variables
-IPS_FILE=$1
-USER=$2
-SSH_KEY=$3
-CONTEXT_NAME=${4:-default}
-K3S_TOKEN=mytoken # Any string can be used as the token
-# Create temporary askpass script for sudo
-ASKPASS_BLOCK="# Create temporary askpass script
-ASKPASS_SCRIPT=\$(mktemp)
-trap 'rm -f \$ASKPASS_SCRIPT' EXIT INT TERM ERR QUIT
-cat > \$ASKPASS_SCRIPT << EOF
-#!/bin/bash
-echo $PASSWORD
-EOF
-chmod 700 \$ASKPASS_SCRIPT
-# Use askpass
-export SUDO_ASKPASS=\$ASKPASS_SCRIPT
-"
-
-# Basic argument checks
-if [ -z "$IPS_FILE" ] || [ -z "$USER" ] || [ -z "$SSH_KEY" ]; then
-  >&2 echo -e "${RED}Error: Missing required arguments.${NC}"
-  >&2 echo "Usage: ./deploy_remote_cluster.sh ips.txt username path/to/ssh/key [context-name] [--cleanup] [--password password]"
-  exit 1
-fi
-
-# Check if SSH key exists
-if [ ! -f "$SSH_KEY" ]; then
-  >&2 echo -e "${RED}Error: SSH key not found: $SSH_KEY${NC}"
-  exit 1
-fi
-
-# Check if IPs file exists
-if [ ! -f "$IPS_FILE" ]; then
-  >&2 echo -e "${RED}Error: IPs file not found: $IPS_FILE${NC}"
-  exit 1
-fi
-
-# Get head node and worker nodes from the IPs file
-HEAD_NODE=$(head -n 1 "$IPS_FILE")
-WORKER_NODES=$(tail -n +2 "$IPS_FILE")
-
-# Check if the IPs file is empty or not formatted correctly
-if [ -z "$HEAD_NODE" ]; then
-  >&2 echo -e "${RED}Error: IPs file is empty or not formatted correctly.${NC}"
-  exit 1
-fi
-
-# Function to show a progress message
-progress_message() {
-  echo -e "${YELLOW}➜ $1${NC}"
-}
-
-# Step to display success
-success_message() {
-  echo -e "${GREEN}✔ $1${NC}"
-}
-
-# Function to run a command on a remote machine via SSH
-run_remote() {
-  local NODE_IP=$1
-  local CMD=$2
-  # echo -e "${YELLOW}Running command on $NODE_IP...${NC}"
-  ssh -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY" "$USER@$NODE_IP" "$CMD"
-}
-
-# Function to uninstall k3s and clean up the state on a remote machine
-cleanup_server_node() {
-  local NODE_IP=$1
-  echo -e "${YELLOW}Cleaning up head node $NODE_IP...${NC}"
-  run_remote "$NODE_IP" "
-    $ASKPASS_BLOCK
-    echo 'Uninstalling k3s...' &&
-    sudo -A /usr/local/bin/k3s-uninstall.sh || true &&
-    sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
-  "
-  echo -e "${GREEN}Node $NODE_IP cleaned up successfully.${NC}"
-}
-
-# Function to uninstall k3s and clean up the state on a remote machine
-cleanup_agent_node() {
-  local NODE_IP=$1
-  echo -e "${YELLOW}Cleaning up node $NODE_IP...${NC}"
-  run_remote "$NODE_IP" "
-    $ASKPASS_BLOCK
-    echo 'Uninstalling k3s...' &&
-    sudo -A /usr/local/bin/k3s-agent-uninstall.sh || true &&
-    sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
-  "
-  echo -e "${GREEN}Node $NODE_IP cleaned up successfully.${NC}"
-}
-
-check_gpu() {
-  local NODE_IP=$1
-  if run_remote "$NODE_IP" "command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null"; then
-    return 0 # GPU detected
-  else
-    return 1 # No GPU detected
-  fi
-}
-
-# Pre-flight checks
-run_remote "$HEAD_NODE" "echo 'SSH connection successful'"
-# TODO: Add more pre-flight checks here, including checking if port 6443 is accessible
-
-# If --cleanup flag is set, uninstall k3s and exit
-if [ "$CLEANUP" == "true" ]; then
-  echo -e "${YELLOW}Starting cleanup...${NC}"
-
-  # Clean up head node
-  cleanup_server_node "$HEAD_NODE"
-
-  # Clean up worker nodes
-  for NODE in $WORKER_NODES; do
-    cleanup_agent_node "$NODE"
-  done
-
-  # Remove the context from local kubeconfig if it exists
-  if [ -f "$HOME/.kube/config" ]; then
-    progress_message "Removing context '$CONTEXT_NAME' from local kubeconfig..."
-    kubectl config delete-context "$CONTEXT_NAME" 2>/dev/null || true
-    kubectl config delete-cluster "$CONTEXT_NAME" 2>/dev/null || true
-    kubectl config delete-user "$CONTEXT_NAME" 2>/dev/null || true
-    # Update the current context to the first available context
-    kubectl config use-context $(kubectl config view -o jsonpath='{.contexts[0].name}') 2>/dev/null || true
-    success_message "Context '$CONTEXT_NAME' removed from local kubeconfig."
-  fi
-
-  echo -e "${GREEN}Cleanup completed successfully.${NC}"
-  exit 0
-fi
-
-# Step 1: Install k3s on the head node
-progress_message "Deploying Kubernetes on head node ($HEAD_NODE)..."
-run_remote "$HEAD_NODE" "
-  $ASKPASS_BLOCK
-  curl -sfL https://get.k3s.io | K3S_TOKEN=$K3S_TOKEN sudo -E -A sh - &&
-  mkdir -p ~/.kube &&
-  sudo -A cp /etc/rancher/k3s/k3s.yaml ~/.kube/config &&
-  sudo -A chown \$(id -u):\$(id -g) ~/.kube/config &&
-  for i in {1..3}; do
-    if kubectl wait --for=condition=ready node --all --timeout=2m --kubeconfig ~/.kube/config; then
-      break
-    else
-      echo 'Waiting for nodes to be ready...'
-      sleep 5
-    fi
-  done
-  if [ \$i -eq 3 ]; then
-    echo 'Failed to wait for nodes to be ready after 3 attempts'
-    exit 1
-  fi"
-success_message "K3s deployed on head node."
-
-# Check if head node has a GPU
-if check_gpu "$HEAD_NODE"; then
-  echo -e "${YELLOW}GPU detected on head node ($HEAD_NODE).${NC}"
-  INSTALL_GPU=true
-fi
-
-# Fetch the head node's internal IP (this will be passed to worker nodes)
-MASTER_ADDR=$(run_remote "$HEAD_NODE" "hostname -I | awk '{print \$1}'")
-
-echo -e "${GREEN}Master node internal IP: $MASTER_ADDR${NC}"
-
-# Step 2: Install k3s on worker nodes and join them to the master node
-for NODE in $WORKER_NODES; do
-  progress_message "Deploying Kubernetes on worker node ($NODE)..."
-  run_remote "$NODE" "
-    $ASKPASS_BLOCK
-    curl -sfL https://get.k3s.io | K3S_URL=https://$MASTER_ADDR:6443 K3S_TOKEN=$K3S_TOKEN sudo -E -A sh -"
-  success_message "Kubernetes deployed on worker node ($NODE)."
-
-  # Check if worker node has a GPU
-  if check_gpu "$NODE"; then
-    echo -e "${YELLOW}GPU detected on worker node ($NODE).${NC}"
-    INSTALL_GPU=true
-  fi
-done
-# Step 3: Configure local kubectl to connect to the cluster
-progress_message "Configuring local kubectl to connect to the cluster..."
-
-# Create temporary directory for kubeconfig operations
-TEMP_DIR=$(mktemp -d)
-TEMP_KUBECONFIG="$TEMP_DIR/kubeconfig"
-
-# Get the kubeconfig from remote server
-scp -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY" "$USER@$HEAD_NODE":~/.kube/config "$TEMP_KUBECONFIG"
-
-# Create .kube directory if it doesn't exist
-mkdir -p "$HOME/.kube"
-
-# Create empty kubeconfig if it doesn't exist
-KUBECONFIG_FILE="$HOME/.kube/config"
-if [[ ! -f "$KUBECONFIG_FILE" ]]; then
-  touch "$KUBECONFIG_FILE"
-fi
-
-# Modify the temporary kubeconfig to update server address and context name
-awk -v context="$CONTEXT_NAME" '
-  /^clusters:/ { in_cluster = 1 }
-  /^users:/ { in_cluster = 0 }
-  in_cluster && /^ *certificate-authority-data:/ { next }
-  in_cluster && /^ *server:/ {
-    print "    server: https://'${HEAD_NODE}:6443'"
-    print "    insecure-skip-tls-verify: true"
-    next
-  }
-  /name: default/ { sub("name: default", "name: " context) }
-  /cluster: default/ { sub("cluster: default", "cluster: " context) }
-  /user: default/ { sub("user: default", "user: " context) }
-  /current-context: default/ { sub("current-context: default", "current-context: " context) }
-  { print }
-' "$TEMP_KUBECONFIG" > "$TEMP_DIR/modified_config"
-
-# Merge the configurations using kubectl
-KUBECONFIG="$KUBECONFIG_FILE:$TEMP_DIR/modified_config" kubectl config view --flatten > "$TEMP_DIR/merged_config"
-mv "$TEMP_DIR/merged_config" "$KUBECONFIG_FILE"
-
-# Set the new context as the current context
-kubectl config use-context "$CONTEXT_NAME"
-
-# Clean up temporary files
-rm -rf "$TEMP_DIR"
-
-success_message "kubectl configured with new context '$CONTEXT_NAME'."
-
-echo "Cluster deployment completed. You can now run 'kubectl get nodes' to verify the setup."
-
-# Install GPU operator if a GPU was detected on any node
-if [ "$INSTALL_GPU" == "true" ]; then
-  echo -e "${YELLOW}GPU detected in the cluster. Installing Nvidia GPU Operator...${NC}"
-  run_remote "$HEAD_NODE" "
-    $ASKPASS_BLOCK
-    curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
-    chmod 700 get_helm.sh &&
-    ./get_helm.sh &&
-    helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update &&
-    kubectl create namespace gpu-operator --kubeconfig ~/.kube/config || true &&
-    sudo -A ln -s /sbin/ldconfig /sbin/ldconfig.real || true &&
-    helm install gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator \
-      --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \
-      --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \
-      --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \
-      --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \
-      --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \
-      --set 'toolkit.env[2].value=nvidia' &&
-    echo 'Waiting for GPU operator installation...' &&
-    while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:'; do
-      echo 'Waiting for GPU operator...'
-      sleep 5
-    done
-    echo 'GPU operator installed successfully.'"
-  success_message "GPU Operator installed."
-else
-  echo -e "${YELLOW}No GPUs detected. Skipping GPU Operator installation.${NC}"
-fi
-
-# Configure SkyPilot
-progress_message "Configuring SkyPilot..."
-sky check kubernetes
-success_message "SkyPilot configured successfully."
-
-# Display final success message
-echo -e "${GREEN}==== 🎉 Kubernetes cluster deployment completed successfully 🎉 ====${NC}"
-echo "You can now interact with your Kubernetes cluster through SkyPilot: "
-echo "  • List available GPUs: sky show-gpus --cloud kubernetes"
-echo "  • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes --gpus A100:1"
-echo "  • Connect to pod with SSH: ssh devbox"
-echo "  • Connect to pod with VSCode: code --remote ssh-remote+devbox '/'"
/sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_ssgManifest.js
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/WHEEL
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/entry_points.txt
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/licenses/LICENSE
RENAMED
File without changes
{skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/top_level.txt
RENAMED
File without changes