skypilot-nightly 1.0.0.dev20250521__py3-none-any.whl → 1.0.0.dev20250523__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Files changed (90)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/kubernetes.py +46 -16
  3. sky/backends/cloud_vm_ray_backend.py +16 -4
  4. sky/check.py +109 -44
  5. sky/cli.py +261 -90
  6. sky/client/cli.py +261 -90
  7. sky/client/sdk.py +122 -3
  8. sky/clouds/__init__.py +5 -0
  9. sky/clouds/aws.py +4 -2
  10. sky/clouds/azure.py +4 -2
  11. sky/clouds/cloud.py +30 -6
  12. sky/clouds/cudo.py +2 -1
  13. sky/clouds/do.py +2 -1
  14. sky/clouds/fluidstack.py +2 -1
  15. sky/clouds/gcp.py +160 -23
  16. sky/clouds/ibm.py +4 -2
  17. sky/clouds/kubernetes.py +66 -22
  18. sky/clouds/lambda_cloud.py +2 -1
  19. sky/clouds/nebius.py +18 -2
  20. sky/clouds/oci.py +4 -2
  21. sky/clouds/paperspace.py +2 -1
  22. sky/clouds/runpod.py +2 -1
  23. sky/clouds/scp.py +2 -1
  24. sky/clouds/service_catalog/__init__.py +3 -0
  25. sky/clouds/service_catalog/common.py +9 -2
  26. sky/clouds/service_catalog/constants.py +2 -1
  27. sky/clouds/service_catalog/ssh_catalog.py +167 -0
  28. sky/clouds/ssh.py +203 -0
  29. sky/clouds/vast.py +2 -1
  30. sky/clouds/vsphere.py +2 -1
  31. sky/core.py +59 -17
  32. sky/dashboard/out/404.html +1 -1
  33. sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_buildManifest.js +1 -1
  34. sky/dashboard/out/_next/static/chunks/pages/infra-abf08c4384190a39.js +1 -0
  35. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  36. sky/dashboard/out/clusters/[cluster].html +1 -1
  37. sky/dashboard/out/clusters.html +1 -1
  38. sky/dashboard/out/index.html +1 -1
  39. sky/dashboard/out/infra.html +1 -1
  40. sky/dashboard/out/jobs/[job].html +1 -1
  41. sky/dashboard/out/jobs.html +1 -1
  42. sky/data/storage.py +1 -0
  43. sky/execution.py +56 -7
  44. sky/jobs/server/core.py +4 -2
  45. sky/optimizer.py +29 -15
  46. sky/provision/__init__.py +1 -0
  47. sky/provision/aws/instance.py +17 -1
  48. sky/provision/gcp/constants.py +147 -4
  49. sky/provision/gcp/instance_utils.py +10 -0
  50. sky/provision/gcp/volume_utils.py +247 -0
  51. sky/provision/kubernetes/instance.py +16 -5
  52. sky/provision/kubernetes/utils.py +37 -19
  53. sky/provision/nebius/instance.py +3 -1
  54. sky/provision/nebius/utils.py +14 -2
  55. sky/provision/ssh/__init__.py +18 -0
  56. sky/resources.py +177 -4
  57. sky/serve/server/core.py +2 -4
  58. sky/server/common.py +46 -9
  59. sky/server/constants.py +2 -0
  60. sky/server/html/token_page.html +154 -0
  61. sky/server/requests/executor.py +3 -6
  62. sky/server/requests/payloads.py +7 -0
  63. sky/server/server.py +80 -8
  64. sky/setup_files/dependencies.py +1 -0
  65. sky/skypilot_config.py +117 -31
  66. sky/task.py +24 -1
  67. sky/templates/gcp-ray.yml.j2 +44 -1
  68. sky/templates/nebius-ray.yml.j2 +12 -2
  69. sky/utils/admin_policy_utils.py +26 -22
  70. sky/utils/context.py +36 -6
  71. sky/utils/context_utils.py +15 -0
  72. sky/utils/infra_utils.py +21 -1
  73. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  74. sky/utils/kubernetes/create_cluster.sh +1 -0
  75. sky/utils/kubernetes/deploy_remote_cluster.py +1437 -0
  76. sky/utils/kubernetes/kubernetes_deploy_utils.py +117 -10
  77. sky/utils/kubernetes/ssh-tunnel.sh +387 -0
  78. sky/utils/log_utils.py +214 -1
  79. sky/utils/resources_utils.py +14 -0
  80. sky/utils/schemas.py +67 -0
  81. sky/utils/ux_utils.py +2 -1
  82. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/METADATA +6 -1
  83. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/RECORD +88 -81
  84. sky/dashboard/out/_next/static/chunks/pages/infra-9180cd91cee64b96.js +0 -1
  85. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  86. /sky/dashboard/out/_next/static/{hvWzC5E6Q4CcKzXcWbgig → ECKwDNS9v9y3_IKFZ2lpp}/_ssgManifest.js +0 -0
  87. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/WHEEL +0 -0
  88. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/entry_points.txt +0 -0
  89. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/licenses/LICENSE +0 -0
  90. {skypilot_nightly-1.0.0.dev20250521.dist-info → skypilot_nightly-1.0.0.dev20250523.dist-info}/top_level.txt +0 -0
sky/utils/kubernetes/deploy_remote_cluster.sh (deleted)
@@ -1,308 +0,0 @@
- #!/bin/bash
- # Refer to https://docs.skypilot.co/en/latest/reservations/existing-machines.html for details on how to use this script.
- set -e
-
- # Colors for nicer UX
- RED='\033[0;31m'
- GREEN='\033[0;32m'
- YELLOW='\033[1;33m'
- NC='\033[0m' # No color
-
- # Variables
- CLEANUP=false
- INSTALL_GPU=false
- POSITIONAL_ARGS=()
- PASSWORD=""
-
- # Process all arguments
- while [[ $# -gt 0 ]]; do
- case $1 in
- --cleanup)
- CLEANUP=true
- shift
- ;;
- --password)
- PASSWORD=$2
- shift
- shift
- ;;
- *)
- POSITIONAL_ARGS+=("$1")
- shift
- ;;
- esac
- done
-
- # Restore positional arguments in correct order
- set -- "${POSITIONAL_ARGS[@]}"
-
- # Assign positional arguments to variables
- IPS_FILE=$1
- USER=$2
- SSH_KEY=$3
- CONTEXT_NAME=${4:-default}
- K3S_TOKEN=mytoken # Any string can be used as the token
- # Create temporary askpass script for sudo
- ASKPASS_BLOCK="# Create temporary askpass script
- ASKPASS_SCRIPT=\$(mktemp)
- trap 'rm -f \$ASKPASS_SCRIPT' EXIT INT TERM ERR QUIT
- cat > \$ASKPASS_SCRIPT << EOF
- #!/bin/bash
- echo $PASSWORD
- EOF
- chmod 700 \$ASKPASS_SCRIPT
- # Use askpass
- export SUDO_ASKPASS=\$ASKPASS_SCRIPT
- "
-
- # Basic argument checks
- if [ -z "$IPS_FILE" ] || [ -z "$USER" ] || [ -z "$SSH_KEY" ]; then
- >&2 echo -e "${RED}Error: Missing required arguments.${NC}"
- >&2 echo "Usage: ./deploy_remote_cluster.sh ips.txt username path/to/ssh/key [context-name] [--cleanup] [--password password]"
- exit 1
- fi
-
- # Check if SSH key exists
- if [ ! -f "$SSH_KEY" ]; then
- >&2 echo -e "${RED}Error: SSH key not found: $SSH_KEY${NC}"
- exit 1
- fi
-
- # Check if IPs file exists
- if [ ! -f "$IPS_FILE" ]; then
- >&2 echo -e "${RED}Error: IPs file not found: $IPS_FILE${NC}"
- exit 1
- fi
-
- # Get head node and worker nodes from the IPs file
- HEAD_NODE=$(head -n 1 "$IPS_FILE")
- WORKER_NODES=$(tail -n +2 "$IPS_FILE")
-
- # Check if the IPs file is empty or not formatted correctly
- if [ -z "$HEAD_NODE" ]; then
- >&2 echo -e "${RED}Error: IPs file is empty or not formatted correctly.${NC}"
- exit 1
- fi
-
- # Function to show a progress message
- progress_message() {
- echo -e "${YELLOW}➜ $1${NC}"
- }
-
- # Step to display success
- success_message() {
- echo -e "${GREEN}✔ $1${NC}"
- }
-
- # Function to run a command on a remote machine via SSH
- run_remote() {
- local NODE_IP=$1
- local CMD=$2
- # echo -e "${YELLOW}Running command on $NODE_IP...${NC}"
- ssh -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY" "$USER@$NODE_IP" "$CMD"
- }
-
- # Function to uninstall k3s and clean up the state on a remote machine
- cleanup_server_node() {
- local NODE_IP=$1
- echo -e "${YELLOW}Cleaning up head node $NODE_IP...${NC}"
- run_remote "$NODE_IP" "
- $ASKPASS_BLOCK
- echo 'Uninstalling k3s...' &&
- sudo -A /usr/local/bin/k3s-uninstall.sh || true &&
- sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
- "
- echo -e "${GREEN}Node $NODE_IP cleaned up successfully.${NC}"
- }
-
- # Function to uninstall k3s and clean up the state on a remote machine
- cleanup_agent_node() {
- local NODE_IP=$1
- echo -e "${YELLOW}Cleaning up node $NODE_IP...${NC}"
- run_remote "$NODE_IP" "
- $ASKPASS_BLOCK
- echo 'Uninstalling k3s...' &&
- sudo -A /usr/local/bin/k3s-agent-uninstall.sh || true &&
- sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
- "
- echo -e "${GREEN}Node $NODE_IP cleaned up successfully.${NC}"
- }
-
- check_gpu() {
- local NODE_IP=$1
- if run_remote "$NODE_IP" "command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null"; then
- return 0 # GPU detected
- else
- return 1 # No GPU detected
- fi
- }
-
- # Pre-flight checks
- run_remote "$HEAD_NODE" "echo 'SSH connection successful'"
- # TODO: Add more pre-flight checks here, including checking if port 6443 is accessible
-
- # If --cleanup flag is set, uninstall k3s and exit
- if [ "$CLEANUP" == "true" ]; then
- echo -e "${YELLOW}Starting cleanup...${NC}"
-
- # Clean up head node
- cleanup_server_node "$HEAD_NODE"
-
- # Clean up worker nodes
- for NODE in $WORKER_NODES; do
- cleanup_agent_node "$NODE"
- done
-
- # Remove the context from local kubeconfig if it exists
- if [ -f "$HOME/.kube/config" ]; then
- progress_message "Removing context '$CONTEXT_NAME' from local kubeconfig..."
- kubectl config delete-context "$CONTEXT_NAME" 2>/dev/null || true
- kubectl config delete-cluster "$CONTEXT_NAME" 2>/dev/null || true
- kubectl config delete-user "$CONTEXT_NAME" 2>/dev/null || true
- # Update the current context to the first available context
- kubectl config use-context $(kubectl config view -o jsonpath='{.contexts[0].name}') 2>/dev/null || true
- success_message "Context '$CONTEXT_NAME' removed from local kubeconfig."
- fi
-
- echo -e "${GREEN}Cleanup completed successfully.${NC}"
- exit 0
- fi
-
- # Step 1: Install k3s on the head node
- progress_message "Deploying Kubernetes on head node ($HEAD_NODE)..."
- run_remote "$HEAD_NODE" "
- $ASKPASS_BLOCK
- curl -sfL https://get.k3s.io | K3S_TOKEN=$K3S_TOKEN sudo -E -A sh - &&
- mkdir -p ~/.kube &&
- sudo -A cp /etc/rancher/k3s/k3s.yaml ~/.kube/config &&
- sudo -A chown \$(id -u):\$(id -g) ~/.kube/config &&
- for i in {1..3}; do
- if kubectl wait --for=condition=ready node --all --timeout=2m --kubeconfig ~/.kube/config; then
- break
- else
- echo 'Waiting for nodes to be ready...'
- sleep 5
- fi
- done
- if [ \$i -eq 3 ]; then
- echo 'Failed to wait for nodes to be ready after 3 attempts'
- exit 1
- fi"
- success_message "K3s deployed on head node."
-
- # Check if head node has a GPU
- if check_gpu "$HEAD_NODE"; then
- echo -e "${YELLOW}GPU detected on head node ($HEAD_NODE).${NC}"
- INSTALL_GPU=true
- fi
-
- # Fetch the head node's internal IP (this will be passed to worker nodes)
- MASTER_ADDR=$(run_remote "$HEAD_NODE" "hostname -I | awk '{print \$1}'")
-
- echo -e "${GREEN}Master node internal IP: $MASTER_ADDR${NC}"
-
- # Step 2: Install k3s on worker nodes and join them to the master node
- for NODE in $WORKER_NODES; do
- progress_message "Deploying Kubernetes on worker node ($NODE)..."
- run_remote "$NODE" "
- $ASKPASS_BLOCK
- curl -sfL https://get.k3s.io | K3S_URL=https://$MASTER_ADDR:6443 K3S_TOKEN=$K3S_TOKEN sudo -E -A sh -"
- success_message "Kubernetes deployed on worker node ($NODE)."
-
- # Check if worker node has a GPU
- if check_gpu "$NODE"; then
- echo -e "${YELLOW}GPU detected on worker node ($NODE).${NC}"
- INSTALL_GPU=true
- fi
- done
- # Step 3: Configure local kubectl to connect to the cluster
- progress_message "Configuring local kubectl to connect to the cluster..."
-
- # Create temporary directory for kubeconfig operations
- TEMP_DIR=$(mktemp -d)
- TEMP_KUBECONFIG="$TEMP_DIR/kubeconfig"
-
- # Get the kubeconfig from remote server
- scp -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY" "$USER@$HEAD_NODE":~/.kube/config "$TEMP_KUBECONFIG"
-
- # Create .kube directory if it doesn't exist
- mkdir -p "$HOME/.kube"
-
- # Create empty kubeconfig if it doesn't exist
- KUBECONFIG_FILE="$HOME/.kube/config"
- if [[ ! -f "$KUBECONFIG_FILE" ]]; then
- touch "$KUBECONFIG_FILE"
- fi
-
- # Modify the temporary kubeconfig to update server address and context name
- awk -v context="$CONTEXT_NAME" '
- /^clusters:/ { in_cluster = 1 }
- /^users:/ { in_cluster = 0 }
- in_cluster && /^ *certificate-authority-data:/ { next }
- in_cluster && /^ *server:/ {
- print " server: https://'${HEAD_NODE}:6443'"
- print " insecure-skip-tls-verify: true"
- next
- }
- /name: default/ { sub("name: default", "name: " context) }
- /cluster: default/ { sub("cluster: default", "cluster: " context) }
- /user: default/ { sub("user: default", "user: " context) }
- /current-context: default/ { sub("current-context: default", "current-context: " context) }
- { print }
- ' "$TEMP_KUBECONFIG" > "$TEMP_DIR/modified_config"
-
- # Merge the configurations using kubectl
- KUBECONFIG="$KUBECONFIG_FILE:$TEMP_DIR/modified_config" kubectl config view --flatten > "$TEMP_DIR/merged_config"
- mv "$TEMP_DIR/merged_config" "$KUBECONFIG_FILE"
-
- # Set the new context as the current context
- kubectl config use-context "$CONTEXT_NAME"
-
- # Clean up temporary files
- rm -rf "$TEMP_DIR"
-
- success_message "kubectl configured with new context '$CONTEXT_NAME'."
-
- echo "Cluster deployment completed. You can now run 'kubectl get nodes' to verify the setup."
-
- # Install GPU operator if a GPU was detected on any node
- if [ "$INSTALL_GPU" == "true" ]; then
- echo -e "${YELLOW}GPU detected in the cluster. Installing Nvidia GPU Operator...${NC}"
- run_remote "$HEAD_NODE" "
- $ASKPASS_BLOCK
- curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
- chmod 700 get_helm.sh &&
- ./get_helm.sh &&
- helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update &&
- kubectl create namespace gpu-operator --kubeconfig ~/.kube/config || true &&
- sudo -A ln -s /sbin/ldconfig /sbin/ldconfig.real || true &&
- helm install gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator \
- --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \
- --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \
- --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \
- --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \
- --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \
- --set 'toolkit.env[2].value=nvidia' &&
- echo 'Waiting for GPU operator installation...' &&
- while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:'; do
- echo 'Waiting for GPU operator...'
- sleep 5
- done
- echo 'GPU operator installed successfully.'"
- success_message "GPU Operator installed."
- else
- echo -e "${YELLOW}No GPUs detected. Skipping GPU Operator installation.${NC}"
- fi
-
- # Configure SkyPilot
- progress_message "Configuring SkyPilot..."
- sky check kubernetes
- success_message "SkyPilot configured successfully."
-
- # Display final success message
- echo -e "${GREEN}==== 🎉 Kubernetes cluster deployment completed successfully 🎉 ====${NC}"
- echo "You can now interact with your Kubernetes cluster through SkyPilot: "
- echo " • List available GPUs: sky show-gpus --cloud kubernetes"
- echo " • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes --gpus A100:1"
- echo " • Connect to pod with SSH: ssh devbox"
- echo " • Connect to pod with VSCode: code --remote ssh-remote+devbox '/'"