skypilot-nightly 1.0.0.dev20240927__py3-none-any.whl → 1.0.0.dev20240928__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/cli.py +119 -9
- sky/utils/kubernetes/deploy_remote_cluster.sh +243 -0
- sky/utils/log_utils.py +88 -10
- {skypilot_nightly-1.0.0.dev20240927.dist-info → skypilot_nightly-1.0.0.dev20240928.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20240927.dist-info → skypilot_nightly-1.0.0.dev20240928.dist-info}/RECORD +10 -9
- {skypilot_nightly-1.0.0.dev20240927.dist-info → skypilot_nightly-1.0.0.dev20240928.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20240927.dist-info → skypilot_nightly-1.0.0.dev20240928.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20240927.dist-info → skypilot_nightly-1.0.0.dev20240928.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20240927.dist-info → skypilot_nightly-1.0.0.dev20240928.dist-info}/top_level.txt +0 -0
sky/__init__.py
CHANGED
@@ -5,7 +5,7 @@ from typing import Optional
 import urllib.request
 
 # Replaced with the current commit when building the wheels.
-_SKYPILOT_COMMIT_SHA = '…'
+_SKYPILOT_COMMIT_SHA = 'dacf27348ae1446c3c93d0ee2fc57702c5366eac'
 
 
 def _get_git_commit():
@@ -35,7 +35,7 @@ def _get_git_commit():
 
 
 __commit__ = _get_git_commit()
-__version__ = '1.0.0.dev20240927'
+__version__ = '1.0.0.dev20240928'
 __root_dir__ = os.path.dirname(os.path.abspath(__file__))
 
 
sky/cli.py
CHANGED
@@ -5072,15 +5072,7 @@ def local():
     pass
 
 
-@click.option('--gpus/--no-gpus',
-              default=True,
-              is_flag=True,
-              help='Launch cluster without GPU support even '
-              'if GPUs are detected on the host.')
-@local.command('up', cls=_DocumentedCodeCommand)
-@usage_lib.entrypoint
-def local_up(gpus: bool):
-    """Creates a local cluster."""
+def _deploy_local_cluster(gpus: bool):
     cluster_created = False
 
     # Check if GPUs are available on the host
@@ -5206,6 +5198,124 @@ def local_up(gpus: bool):
                    f'{gpu_hint}')
 
 
+def _deploy_remote_cluster(ip_file: str, ssh_user: str, ssh_key_path: str,
+                           cleanup: bool):
+    success = False
+    path_to_package = os.path.dirname(os.path.dirname(__file__))
+    up_script_path = os.path.join(path_to_package, 'sky/utils/kubernetes',
+                                  'deploy_remote_cluster.sh')
+    # Get directory of script and run it from there
+    cwd = os.path.dirname(os.path.abspath(up_script_path))
+
+    deploy_command = f'{up_script_path} {ip_file} {ssh_user} {ssh_key_path}'
+    if cleanup:
+        deploy_command += ' --cleanup'
+
+    # Convert the command to a format suitable for subprocess
+    deploy_command = shlex.split(deploy_command)
+
+    # Setup logging paths
+    run_timestamp = backend_utils.get_run_timestamp()
+    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
+                            'local_up.log')
+    tail_cmd = 'tail -n100 -f ' + log_path
+
+    # Check if ~/.kube/config exists:
+    if os.path.exists(os.path.expanduser('~/.kube/config')):
+        click.echo('Found existing kube config. '
+                   'It will be backed up to ~/.kube/config.bak.')
+    style = colorama.Style
+    click.echo('To view detailed progress: '
+               f'{style.BRIGHT}{tail_cmd}{style.RESET_ALL}')
+    if cleanup:
+        msg_str = 'Cleaning up remote cluster...'
+    else:
+        msg_str = 'Deploying remote cluster...'
+    with rich_utils.safe_status(f'[bold cyan]{msg_str}'):
+        returncode, _, stderr = log_lib.run_with_log(
+            cmd=deploy_command,
+            log_path=log_path,
+            require_outputs=True,
+            stream_logs=False,
+            line_processor=log_utils.SkyRemoteUpLineProcessor(),
+            cwd=cwd)
+        if returncode == 0:
+            success = True
+        else:
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(
+                    'Failed to deploy remote cluster. '
+                    f'Full log: {log_path}'
+                    f'\nError: {style.BRIGHT}{stderr}{style.RESET_ALL}')
+
+    if success:
+        if cleanup:
+            click.echo(f'{colorama.Fore.GREEN}'
+                       '🎉 Remote cluster cleaned up successfully.'
+                       f'{style.RESET_ALL}')
+        else:
+            click.echo('Cluster deployment done. You can now run tasks on '
+                       'this cluster.\nE.g., run a task with: '
+                       'sky launch --cloud kubernetes -- echo hello world.'
+                       f'\n{colorama.Fore.GREEN}🎉 Remote cluster deployed '
+                       f'successfully. {style.RESET_ALL}')
+
+
+@click.option('--gpus/--no-gpus',
+              default=True,
+              is_flag=True,
+              help='Launch cluster without GPU support even '
+              'if GPUs are detected on the host.')
+@click.option(
+    '--ips',
+    type=str,
+    required=False,
+    help='Path to the file containing IP addresses of remote machines.')
+@click.option('--ssh-user',
+              type=str,
+              required=False,
+              help='SSH username for accessing remote machines.')
+@click.option('--ssh-key-path',
+              type=str,
+              required=False,
+              help='Path to the SSH private key.')
+@click.option('--cleanup',
+              is_flag=True,
+              help='Clean up the remote cluster instead of deploying it.')
+@local.command('up', cls=_DocumentedCodeCommand)
+@usage_lib.entrypoint
+def local_up(gpus: bool, ips: str, ssh_user: str, ssh_key_path: str,
+             cleanup: bool):
+    """Creates a local or remote cluster."""
+
+    def _validate_args(ips, ssh_user, ssh_key_path, cleanup):
+        # If any of --ips, --ssh-user, or --ssh-key-path is specified,
+        # all must be specified
+        if bool(ips) or bool(ssh_user) or bool(ssh_key_path):
+            if not (ips and ssh_user and ssh_key_path):
+                raise click.BadParameter(
+                    'All --ips, --ssh-user, and --ssh-key-path '
+                    'must be specified together.')
+
+        # --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
+        # are all provided
+        if cleanup and not (ips and ssh_user and ssh_key_path):
+            raise click.BadParameter('--cleanup can only be used with '
+                                     '--ips, --ssh-user and --ssh-key-path.')
+
+    _validate_args(ips, ssh_user, ssh_key_path, cleanup)
+
+    # If remote deployment arguments are specified, run remote up script
+    if ips and ssh_user and ssh_key_path:
+        # Convert ips and ssh_key_path to absolute paths
+        ips = os.path.abspath(ips)
+        ssh_key_path = os.path.abspath(ssh_key_path)
+        _deploy_remote_cluster(ips, ssh_user, ssh_key_path, cleanup)
+    else:
+        # Run local deployment (kind) if no remote args are specified
+        _deploy_local_cluster(gpus)
+
+
 @local.command('down', cls=_DocumentedCodeCommand)
 @usage_lib.entrypoint
 def local_down():
sky/utils/kubernetes/deploy_remote_cluster.sh
ADDED
@@ -0,0 +1,243 @@
+#!/bin/bash
+# Refer to https://skypilot.readthedocs.io/en/latest/reservations/existing-machines.html for details on how to use this script.
+set -e
+
+# Colors for nicer UX
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No color
+
+# Variables
+IPS_FILE=$1
+USER=$2
+SSH_KEY=$3
+K3S_TOKEN=mytoken # Any string can be used as the token
+CLEANUP=false
+INSTALL_GPU=false
+
+if [[ "$4" == "--cleanup" ]]; then
+  CLEANUP=true
+fi
+
+# Basic argument checks
+if [ -z "$IPS_FILE" ] || [ -z "$USER" ] || [ -z "$SSH_KEY" ]; then
+  >&2 echo -e "${RED}Error: Missing required arguments.${NC}"
+  >&2 echo "Usage: ./deploy_remote_cluster.sh ips.txt username path/to/ssh/key [--cleanup]"
+  exit 1
+fi
+
+# Check if SSH key exists
+if [ ! -f "$SSH_KEY" ]; then
+  >&2 echo -e "${RED}Error: SSH key not found: $SSH_KEY${NC}"
+  exit 1
+fi
+
+# Check if IPs file exists
+if [ ! -f "$IPS_FILE" ]; then
+  >&2 echo -e "${RED}Error: IPs file not found: $IPS_FILE${NC}"
+  exit 1
+fi
+
+# Get head node and worker nodes from the IPs file
+HEAD_NODE=$(head -n 1 "$IPS_FILE")
+WORKER_NODES=$(tail -n +2 "$IPS_FILE")
+
+# Check if the IPs file is empty or not formatted correctly
+if [ -z "$HEAD_NODE" ]; then
+  >&2 echo -e "${RED}Error: IPs file is empty or not formatted correctly.${NC}"
+  exit 1
+fi
+
+# Function to show a progress message
+progress_message() {
+  echo -e "${YELLOW}➜ $1${NC}"
+}
+
+# Step to display success
+success_message() {
+  echo -e "${GREEN}✔ $1${NC}"
+}
+
+# Function to run a command on a remote machine via SSH
+run_remote() {
+  local NODE_IP=$1
+  local CMD=$2
+  # echo -e "${YELLOW}Running command on $NODE_IP...${NC}"
+  ssh -o StrictHostKeyChecking=no -i "$SSH_KEY" "$USER@$NODE_IP" "$CMD"
+}
+
+# Function to uninstall k3s and clean up the state on a remote machine
+cleanup_server_node() {
+  local NODE_IP=$1
+  echo -e "${YELLOW}Cleaning up head node $NODE_IP...${NC}"
+  run_remote "$NODE_IP" "
+    echo 'Uninstalling k3s...' &&
+    /usr/local/bin/k3s-uninstall.sh || true &&
+    sudo rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
+  "
+  echo -e "${GREEN}Node $NODE_IP cleaned up successfully.${NC}"
+}
+
+# Function to uninstall k3s and clean up the state on a remote machine
+cleanup_agent_node() {
+  local NODE_IP=$1
+  echo -e "${YELLOW}Cleaning up node $NODE_IP...${NC}"
+  run_remote "$NODE_IP" "
+    echo 'Uninstalling k3s...' &&
+    /usr/local/bin/k3s-agent-uninstall.sh || true &&
+    sudo rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
+  "
+  echo -e "${GREEN}Node $NODE_IP cleaned up successfully.${NC}"
+}
+
+check_gpu() {
+  local NODE_IP=$1
+  run_remote "$NODE_IP" "
+    if command -v nvidia-smi &> /dev/null; then
+      nvidia-smi --list-gpus | grep 'GPU 0'
+    fi
+  "
+}
+
+# Pre-flight checks
+run_remote "$HEAD_NODE" "echo 'SSH connection successful'"
+# TODO: Add more pre-flight checks here, including checking if port 6443 is accessible
+
+# If --cleanup flag is set, uninstall k3s and exit
+if [ "$CLEANUP" == "true" ]; then
+  echo -e "${YELLOW}Starting cleanup...${NC}"
+
+  # Clean up head node
+  cleanup_server_node "$HEAD_NODE"
+
+  # Clean up worker nodes
+  for NODE in $WORKER_NODES; do
+    cleanup_agent_node "$NODE"
+  done
+
+  echo -e "${GREEN}Cleanup completed successfully.${NC}"
+  exit 0
+fi
+
+# Step 1: Install k3s on the head node
+progress_message "Deploying Kubernetes on head node ($HEAD_NODE)..."
+run_remote "$HEAD_NODE" "
+  curl -sfL https://get.k3s.io | K3S_TOKEN=$K3S_TOKEN sh - &&
+  mkdir -p ~/.kube &&
+  sudo cp /etc/rancher/k3s/k3s.yaml ~/.kube/config &&
+  sudo chown \$(id -u):\$(id -g) ~/.kube/config &&
+  for i in {1..3}; do
+    if kubectl wait --for=condition=ready node --all --timeout=2m --kubeconfig ~/.kube/config; then
+      break
+    else
+      echo 'Waiting for nodes to be ready...'
+      sleep 5
+    fi
+  done
+  if [ $i -eq 3 ]; then
+    echo 'Failed to wait for nodes to be ready after 3 attempts'
+    exit 1
+  fi"
+success_message "K3s deployed on head node."
+
+# Check if head node has a GPU
+if check_gpu "$HEAD_NODE"; then
+  echo -e "${YELLOW}GPU detected on head node ($HEAD_NODE).${NC}"
+  INSTALL_GPU=true
+fi
+
+# Fetch the head node's internal IP (this will be passed to worker nodes)
+MASTER_ADDR=$(run_remote "$HEAD_NODE" "hostname -I | awk '{print \$1}'")
+
+echo -e "${GREEN}Master node internal IP: $MASTER_ADDR${NC}"
+
+# Step 2: Install k3s on worker nodes and join them to the master node
+for NODE in $WORKER_NODES; do
+  progress_message "Deploying Kubernetes on worker node ($NODE)..."
+  run_remote "$NODE" "
+    curl -sfL https://get.k3s.io | K3S_URL=https://$MASTER_ADDR:6443 K3S_TOKEN=$K3S_TOKEN sh -"
+  success_message "Kubernetes deployed on worker node ($NODE)."
+
+  # Check if worker node has a GPU
+  if check_gpu "$NODE"; then
+    echo -e "${YELLOW}GPU detected on worker node ($NODE).${NC}"
+    INSTALL_GPU=true
+  fi
+done
+# Step 3: Configure local kubectl to connect to the cluster
+progress_message "Configuring local kubectl to connect to the cluster..."
+scp -o StrictHostKeyChecking=no -i "$SSH_KEY" "$USER@$HEAD_NODE":~/.kube/config ~/.kube/config
+
+# Back up the original kubeconfig file if it exists
+KUBECONFIG_FILE="$HOME/.kube/config"
+if [[ -f "$KUBECONFIG_FILE" ]]; then
+  echo "Backing up existing kubeconfig to $KUBECONFIG_FILE.bak"
+  cp "$KUBECONFIG_FILE" "$KUBECONFIG_FILE.bak"
+fi
+
+# Update kubeconfig for the local machine to use the master node's IP
+# Temporary file to hold the modified kubeconfig
+TEMP_FILE=$(mktemp)
+
+# Remove the certificate-authority-data, and replace the server with the master address
+awk '
+  BEGIN { in_cluster = 0 }
+  /^clusters:/ { in_cluster = 1 }
+  /^users:/ { in_cluster = 0 }
+  in_cluster && /^ *certificate-authority-data:/ { next }
+  in_cluster && /^ *server:/ {
+    print "    server: https://'${HEAD_NODE}:6443'"
+    print "    insecure-skip-tls-verify: true"
+    next
+  }
+  { print }
+' "$KUBECONFIG_FILE" > "$TEMP_FILE"
+
+# Replace the original kubeconfig with the modified one
+mv "$TEMP_FILE" "$KUBECONFIG_FILE"
+
+success_message "kubectl configured to connect to the cluster."
+
+echo "Cluster deployment completed. You can now run 'kubectl get nodes' to verify the setup."
+
+# Install GPU operator if a GPU was detected on any node
+if [ "$INSTALL_GPU" == "true" ]; then
+  echo -e "${YELLOW}GPU detected in the cluster. Installing Nvidia GPU Operator...${NC}"
+  run_remote "$HEAD_NODE" "
+    curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
+    chmod 700 get_helm.sh &&
+    ./get_helm.sh &&
+    helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update &&
+    kubectl create namespace gpu-operator --kubeconfig ~/.kube/config || true &&
+    sudo ln -s /sbin/ldconfig /sbin/ldconfig.real || true &&
+    helm install gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator \
+      --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \
+      --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \
+      --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \
+      --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \
+      --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \
+      --set 'toolkit.env[2].value=nvidia' &&
+    echo 'Waiting for GPU operator installation...' &&
+    while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:'; do
+      echo 'Waiting for GPU operator...'
+      sleep 5
+    done
+    echo 'GPU operator installed successfully.'"
+  success_message "GPU Operator installed."
+else
+  echo -e "${YELLOW}No GPUs detected. Skipping GPU Operator installation.${NC}"
+fi
+
+# Configure SkyPilot
+progress_message "Configuring SkyPilot..."
+sky check kubernetes
+success_message "SkyPilot configured successfully."
+
+# Display final success message
+echo -e "${GREEN}==== 🎉 Kubernetes cluster deployment completed successfully 🎉 ====${NC}"
+echo "You can now interact with your Kubernetes cluster through SkyPilot: "
+echo "  • List available GPUs: sky show-gpus --cloud kubernetes"
+echo "  • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes --gpus A100:1"
+echo "  • Connect to pod with SSH: ssh devbox"
+echo "  • Connect to pod with VSCode: code --remote ssh-remote+devbox '/'"
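
The script assigns roles purely by line order in the IPs file: the first line becomes the k3s server (head) node, and every subsequent line joins it as an agent (worker) via the shared token on port 6443. A hypothetical ips.txt (addresses are placeholders; one IP per line, head node first):

    192.168.1.10
    192.168.1.11
    192.168.1.12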
sky/utils/log_utils.py
CHANGED
@@ -1,6 +1,7 @@
 """Logging utils."""
 import enum
-from typing import List, Optional
+import types
+from typing import List, Optional, Type
 
 import colorama
 import pendulum
@@ -15,13 +16,15 @@ logger = sky_logging.init_logger(__name__)
 class LineProcessor(object):
     """A processor for log lines."""
 
-    def __enter__(self):
+    def __enter__(self) -> None:
         pass
 
-    def process_line(self, log_line):
+    def process_line(self, log_line: str) -> None:
         pass
 
-    def __exit__(self, except_type, except_value, traceback):
+    def __exit__(self, except_type: Optional[Type[BaseException]],
+                 except_value: Optional[BaseException],
+                 traceback: Optional[types.TracebackType]) -> None:
         del except_type, except_value, traceback  # unused
         pass
 
@@ -34,12 +37,12 @@ class RayUpLineProcessor(LineProcessor):
     RUNTIME_SETUP = 1
     PULLING_DOCKER_IMAGES = 2
 
-    def __enter__(self):
+    def __enter__(self) -> None:
         self.state = self.ProvisionStatus.LAUNCH
         self.status_display = rich_utils.safe_status('[bold cyan]Launching')
         self.status_display.start()
 
-    def process_line(self, log_line):
+    def process_line(self, log_line: str) -> None:
         if ('Success.' in log_line and
                 self.state == self.ProvisionStatus.LAUNCH):
             logger.info(f'{colorama.Fore.GREEN}Head node is up.'
@@ -60,7 +63,9 @@ class RayUpLineProcessor(LineProcessor):
                 '[bold cyan]Launching - Preparing SkyPilot runtime')
             self.state = self.ProvisionStatus.RUNTIME_SETUP
 
-    def __exit__(self, except_type, except_value, traceback):
+    def __exit__(self, except_type: Optional[Type[BaseException]],
+                 except_value: Optional[BaseException],
+                 traceback: Optional[types.TracebackType]) -> None:
         del except_type, except_value, traceback  # unused
         self.status_display.stop()
 
@@ -68,13 +73,13 @@ class RayUpLineProcessor(LineProcessor):
 class SkyLocalUpLineProcessor(LineProcessor):
     """A processor for `sky local up` log lines."""
 
-    def __enter__(self):
+    def __enter__(self) -> None:
         status = rich_utils.safe_status('[bold cyan]Creating local cluster - '
                                         'initializing Kubernetes')
         self.status_display = status
         self.status_display.start()
 
-    def process_line(self, log_line):
+    def process_line(self, log_line: str) -> None:
         if 'Kind cluster created.' in log_line:
             logger.info(f'{colorama.Fore.GREEN}Kubernetes is running.'
                         f'{colorama.Style.RESET_ALL}')
@@ -124,7 +129,80 @@ class SkyLocalUpLineProcessor(LineProcessor):
             f'{colorama.Fore.GREEN}Nginx Ingress Controller installed.'
             f'{colorama.Style.RESET_ALL}')
 
-    def __exit__(self, except_type, except_value, traceback):
+    def __exit__(self, except_type: Optional[Type[BaseException]],
+                 except_value: Optional[BaseException],
+                 traceback: Optional[types.TracebackType]) -> None:
+        del except_type, except_value, traceback  # unused
+        self.status_display.stop()
+
+
+class SkyRemoteUpLineProcessor(LineProcessor):
+    """A processor for deploy_remote_cluster.sh log lines."""
+
+    def __enter__(self) -> None:
+        status = rich_utils.safe_status('[bold cyan]Creating remote cluster')
+        self.status_display = status
+        self.status_display.start()
+
+    def process_line(self, log_line: str) -> None:
+        # Pre-flight checks
+        if 'SSH connection successful' in log_line:
+            logger.info(f'{colorama.Fore.GREEN}SSH connection established.'
+                        f'{colorama.Style.RESET_ALL}')
+
+        # Kubernetes installation steps
+        if 'Deploying Kubernetes on head node' in log_line:
+            self.status_display.update('[bold cyan]Creating remote cluster - '
+                                       'deploying Kubernetes on head node')
+        if 'K3s deployed on head node.' in log_line:
+            logger.info(f'{colorama.Fore.GREEN}'
+                        '✔ K3s successfully deployed on head node.'
+                        f'{colorama.Style.RESET_ALL}')
+
+        # Worker nodes
+        if 'Deploying Kubernetes on worker node' in log_line:
+            self.status_display.update('[bold cyan]Creating remote cluster - '
+                                       'deploying Kubernetes on worker nodes')
+        if 'Kubernetes deployed on worker node' in log_line:
+            logger.info(f'{colorama.Fore.GREEN}'
+                        '✔ K3s successfully deployed on worker node.'
+                        f'{colorama.Style.RESET_ALL}')
+
+        # Cluster configuration
+        if 'Configuring local kubectl to connect to the cluster...' in log_line:
+            self.status_display.update('[bold cyan]Creating remote cluster - '
+                                       'configuring local kubectl')
+        if 'kubectl configured to connect to the cluster.' in log_line:
+            logger.info(f'{colorama.Fore.GREEN}'
+                        '✔ kubectl configured for the remote cluster.'
+                        f'{colorama.Style.RESET_ALL}')
+
+        # GPU operator installation
+        if 'Installing Nvidia GPU Operator...' in log_line:
+            self.status_display.update('[bold cyan]Creating remote cluster - '
+                                       'installing Nvidia GPU Operator')
+        if 'GPU Operator installed.' in log_line:
+            logger.info(f'{colorama.Fore.GREEN}'
+                        '✔ Nvidia GPU Operator installed successfully.'
+                        f'{colorama.Style.RESET_ALL}')
+
+        # Cleanup steps
+        if 'Cleaning up head node' in log_line:
+            self.status_display.update('[bold cyan]Cleaning up head node')
+        if 'Cleaning up node' in log_line:
+            self.status_display.update('[bold cyan]Cleaning up worker node')
+        if 'cleaned up successfully' in log_line:
+            logger.info(f'{colorama.Fore.GREEN}'
+                        f'{log_line.strip()}{colorama.Style.RESET_ALL}')
+
+        # Final status
+        if 'Cluster deployment completed.' in log_line:
+            logger.info(f'{colorama.Fore.GREEN}✔ Remote k3s is running.'
+                        f'{colorama.Style.RESET_ALL}')
+
+    def __exit__(self, except_type: Optional[Type[BaseException]],
+                 except_value: Optional[BaseException],
+                 traceback: Optional[types.TracebackType]) -> None:
         del except_type, except_value, traceback  # unused
         self.status_display.stop()
 
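
SkyRemoteUpLineProcessor follows the LineProcessor contract already used by RayUpLineProcessor and SkyLocalUpLineProcessor: enter to start the rich status spinner, feed each subprocess output line to process_line, and exit to stop the spinner. In the wheel it is driven by log_lib.run_with_log in sky/cli.py above; a minimal hand-driven sketch of the same protocol, using messages the deploy script actually prints:

    from sky.utils import log_utils

    processor = log_utils.SkyRemoteUpLineProcessor()
    with processor:
        # These strings are echoed by deploy_remote_cluster.sh.
        processor.process_line('SSH connection successful')
        processor.process_line('K3s deployed on head node.')
        processor.process_line('Cluster deployment completed.')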
{skypilot_nightly-1.0.0.dev20240927.dist-info → skypilot_nightly-1.0.0.dev20240928.dist-info}/RECORD
RENAMED
@@ -1,8 +1,8 @@
-sky/__init__.py,sha256=…
+sky/__init__.py,sha256=8BEk3x0IPkFli8tjp7axkkM5mwQ1GuCABWwTMppkPcc,5854
 sky/admin_policy.py,sha256=hPo02f_A32gCqhUueF0QYy1fMSSKqRwYEg_9FxScN_s,3248
 sky/authentication.py,sha256=o8ZhUf4VSN8WtjWcUUGYg-HVskaqaoMK4ZobHC-HVYU,20697
 sky/check.py,sha256=jLMIIJrseaZj1_o5WkbaD9XdyXIlCaT6pyAaIFdhdmA,9079
-sky/cli.py,sha256=…
+sky/cli.py,sha256=9h4yO8p962960qUjvQ-xSusrtdh8TXNNQ1sfV0OqgZc,206262
 sky/cloud_stores.py,sha256=RjFgmRhUh1Kk__f6g3KxzLp9s7dA0pFK4W1AukEuUaw,21153
 sky/core.py,sha256=YF_6kwj8Ja171Oycb8L25SZ7V_ylZYovFS_jpnjwGo0,34408
 sky/dag.py,sha256=WLFWr5hfrwjd31uYlNvI-zWUk7tLaT_gzJn4LzbVtkE,2780
@@ -252,7 +252,7 @@ sky/utils/dag_utils.py,sha256=gjGZiJj4_GYsraXX67e6ElvbmOByJcyjSfvVgYZiXvs,5588
 sky/utils/db_utils.py,sha256=AOvMmBEN9cF4I7CoXihPCtus4mU2VDGjBQSVMMgzKlA,2786
 sky/utils/env_options.py,sha256=1VXyd3bhiUgGfCpmmTqM9PagRo1ILBH4-pzIxmIeE6E,861
 sky/utils/kubernetes_enums.py,sha256=imGqHSa8O07zD_6xH1SDMM7dBU5lF5fzFFlQuQy00QM,1384
-sky/utils/log_utils.py,sha256=…
+sky/utils/log_utils.py,sha256=yVu3etgKhiVYX8UG-JFPWZujxWBT4kwxZ5oAPIdjtGs,12054
 sky/utils/resources_utils.py,sha256=snByBxgx3Hnjfch2uysdAA3D-OAwrnuzTDHug36s5H4,6515
 sky/utils/rich_utils.py,sha256=5ZVhzlFx-nhqMXwv00eO9xC4rz7ibDlfD2lmGhZrJEY,1581
 sky/utils/schemas.py,sha256=QT0Fxri2o0SiWkky1DlZhA1dzQRQoB5OdVaej0wJvhc,28787
@@ -265,6 +265,7 @@ sky/utils/cli_utils/status_utils.py,sha256=9odkfXiXLMD14XJsqve6sGvHpe7ThHXpC6ic9
 sky/utils/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 sky/utils/kubernetes/create_cluster.sh,sha256=rv5Lz6AR00yBJMRyScfMSQiGKptMhtHWRsvyG20-u9c,7764
 sky/utils/kubernetes/delete_cluster.sh,sha256=BSccHF43GyepDNf-FZcenzHzpXXATkVD92vgn1lWPgk,927
+sky/utils/kubernetes/deploy_remote_cluster.sh,sha256=vGj0mD0tejHDRy8ulwKOvOF2mfLyT5J8fp7GVqEe_EY,8478
 sky/utils/kubernetes/generate_kind_config.py,sha256=_TNLnifA_r7-CRq083IP1xjelYqiLjzQX9ohuqYpDH8,3187
 sky/utils/kubernetes/generate_kubeconfig.sh,sha256=AcYhuuG5jXWGHUmyRuH-oKy5qcn92gXhu6bXOt6eD6g,9274
 sky/utils/kubernetes/gpu_labeler.py,sha256=MEUv0U4ACDcNwtFVltlv017XJMjxx1Bndf6fL0i6eqg,6960
@@ -272,9 +273,9 @@ sky/utils/kubernetes/k8s_gpu_labeler_job.yaml,sha256=KPqp23B-zQ2SZK03jdHeF9fLTog
 sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml,sha256=VLKT2KKimZu1GDg_4AIlIt488oMQvhRZWwsj9vBbPUg,3812
 sky/utils/kubernetes/rsync_helper.sh,sha256=Ma-N9a271fTfdgP5-8XIQL7KPf8IPUo-uY004PCdUFo,747
 sky/utils/kubernetes/ssh_jump_lifecycle_manager.py,sha256=RFLJ3k7MR5UN4SKHykQ0lV9SgXumoULpKYIAt1vh-HU,6560
-skypilot_nightly-1.0.0.dev20240927.dist-info/LICENSE,sha256=…
-skypilot_nightly-1.0.0.dev20240927.dist-info/METADATA,sha256=…
-skypilot_nightly-1.0.0.dev20240927.dist-info/WHEEL,sha256=…
-skypilot_nightly-1.0.0.dev20240927.dist-info/entry_points.txt,sha256=…
-skypilot_nightly-1.0.0.dev20240927.dist-info/top_level.txt,sha256=…
-skypilot_nightly-1.0.0.dev20240927.dist-info/RECORD,,
+skypilot_nightly-1.0.0.dev20240928.dist-info/LICENSE,sha256=emRJAvE7ngL6x0RhQvlns5wJzGI3NEQ_WMjNmd9TZc4,12170
+skypilot_nightly-1.0.0.dev20240928.dist-info/METADATA,sha256=AT9cnsY7Uj7BK0COu8mOXiCtfyCFrjtk7OBQvqx-_Nk,18948
+skypilot_nightly-1.0.0.dev20240928.dist-info/WHEEL,sha256=GV9aMThwP_4oNCtvEC2ec3qUYutgWeAzklro_0m4WJQ,91
+skypilot_nightly-1.0.0.dev20240928.dist-info/entry_points.txt,sha256=StA6HYpuHj-Y61L2Ze-hK2IcLWgLZcML5gJu8cs6nU4,36
+skypilot_nightly-1.0.0.dev20240928.dist-info/top_level.txt,sha256=qA8QuiNNb6Y1OF-pCUtPEr6sLEwy2xJX06Bd_CrtrHY,4
+skypilot_nightly-1.0.0.dev20240928.dist-info/RECORD,,
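
For reference, each RECORD row has the form path,sha256=<digest>,<size-in-bytes>, where the digest is the urlsafe base64 SHA-256 of the file with the trailing '=' padding stripped. A minimal sketch for recomputing a row's digest (the helper name is ours, not part of the package):

    import base64
    import hashlib

    def record_sha256(path: str) -> str:
        # RECORD stores the urlsafe-base64 SHA-256 digest, '=' padding removed.
        with open(path, 'rb') as f:
            digest = hashlib.sha256(f.read()).digest()
        return base64.urlsafe_b64encode(digest).rstrip(b'=').decode('ascii')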
{skypilot_nightly-1.0.0.dev20240927.dist-info → skypilot_nightly-1.0.0.dev20240928.dist-info}/LICENSE
RENAMED
File without changes

{skypilot_nightly-1.0.0.dev20240927.dist-info → skypilot_nightly-1.0.0.dev20240928.dist-info}/WHEEL
RENAMED
File without changes

{skypilot_nightly-1.0.0.dev20240927.dist-info → skypilot_nightly-1.0.0.dev20240928.dist-info}/entry_points.txt
RENAMED
File without changes

{skypilot_nightly-1.0.0.dev20240927.dist-info → skypilot_nightly-1.0.0.dev20240928.dist-info}/top_level.txt
RENAMED
File without changes