skypilot-nightly 1.0.0.dev20241120__py3-none-any.whl → 1.0.0.dev20241122__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +20 -15
- sky/backends/cloud_vm_ray_backend.py +21 -3
- sky/clouds/aws.py +1 -0
- sky/clouds/azure.py +1 -0
- sky/clouds/cloud.py +1 -0
- sky/clouds/cudo.py +1 -0
- sky/clouds/fluidstack.py +1 -0
- sky/clouds/gcp.py +1 -0
- sky/clouds/ibm.py +1 -0
- sky/clouds/kubernetes.py +45 -3
- sky/clouds/lambda_cloud.py +1 -0
- sky/clouds/oci.py +1 -0
- sky/clouds/paperspace.py +1 -0
- sky/clouds/runpod.py +1 -0
- sky/clouds/scp.py +1 -0
- sky/clouds/vsphere.py +1 -0
- sky/provision/instance_setup.py +80 -83
- sky/provision/kubernetes/instance.py +108 -76
- sky/provision/kubernetes/utils.py +2 -0
- sky/provision/oci/instance.py +4 -2
- sky/provision/provisioner.py +95 -19
- sky/resources.py +2 -1
- sky/skylet/constants.py +31 -21
- sky/templates/kubernetes-ray.yml.j2 +169 -39
- sky/utils/subprocess_utils.py +49 -4
- {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/METADATA +65 -55
- {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/RECORD +32 -32
- {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/WHEEL +1 -1
- {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20241120.dist-info → skypilot_nightly-1.0.0.dev20241122.dist-info}/top_level.txt +0 -0
sky/skylet/constants.py
CHANGED
@@ -39,6 +39,7 @@ SKY_GET_PYTHON_PATH_CMD = (f'[ -s {SKY_PYTHON_PATH_FILE} ] && '
|
|
39
39
|
'which python3')
|
40
40
|
# Python executable, e.g., /opt/conda/bin/python3
|
41
41
|
SKY_PYTHON_CMD = f'$({SKY_GET_PYTHON_PATH_CMD})'
|
42
|
+
# Prefer SKY_UV_PIP_CMD, which is faster. TODO(cooper): remove all usages.
|
42
43
|
SKY_PIP_CMD = f'{SKY_PYTHON_CMD} -m pip'
|
43
44
|
# Ray executable, e.g., /opt/conda/bin/ray
|
44
45
|
# We need to add SKY_PYTHON_CMD before ray executable because:
|
@@ -50,6 +51,10 @@ SKY_RAY_CMD = (f'{SKY_PYTHON_CMD} $([ -s {SKY_RAY_PATH_FILE} ] && '
|
|
50
51
|
SKY_REMOTE_PYTHON_ENV_NAME = 'skypilot-runtime'
|
51
52
|
SKY_REMOTE_PYTHON_ENV = f'~/{SKY_REMOTE_PYTHON_ENV_NAME}'
|
52
53
|
ACTIVATE_SKY_REMOTE_PYTHON_ENV = f'source {SKY_REMOTE_PYTHON_ENV}/bin/activate'
|
54
|
+
# uv is used for venv and pip, much faster than python implementations.
|
55
|
+
SKY_UV_INSTALL_DIR = '"$HOME/.local/bin"'
|
56
|
+
SKY_UV_CMD = f'{SKY_UV_INSTALL_DIR}/uv'
|
57
|
+
SKY_UV_PIP_CMD = f'VIRTUAL_ENV={SKY_REMOTE_PYTHON_ENV} {SKY_UV_CMD} pip'
|
53
58
|
# Deleting the SKY_REMOTE_PYTHON_ENV_NAME from the PATH to deactivate the
|
54
59
|
# environment. `deactivate` command does not work when conda is used.
|
55
60
|
DEACTIVATE_SKY_REMOTE_PYTHON_ENV = (
|
@@ -148,31 +153,29 @@ CONDA_INSTALLATION_COMMANDS = (
|
|
148
153
|
'echo "Creating conda env with Python 3.10" && '
|
149
154
|
f'conda create -y -n {SKY_REMOTE_PYTHON_ENV_NAME} python=3.10 && '
|
150
155
|
f'conda activate {SKY_REMOTE_PYTHON_ENV_NAME};'
|
156
|
+
# Install uv for venv management and pip installation.
|
157
|
+
# Probe the exact binary that SKY_UV_CMD runs: a uv found elsewhere on PATH
# must not skip the install, since later commands call SKY_UV_CMD directly.
f'[ -x {SKY_UV_CMD} ] || '
|
158
|
+
'curl -LsSf https://astral.sh/uv/install.sh '
|
159
|
+
f'| UV_INSTALL_DIR={SKY_UV_INSTALL_DIR} sh;'
|
151
160
|
# Create a separate conda environment for SkyPilot dependencies.
|
152
161
|
f'[ -d {SKY_REMOTE_PYTHON_ENV} ] || '
|
153
162
|
# Do NOT use --system-site-packages here, because if users upgrade any
|
154
163
|
# packages in the base env, they interfere with skypilot dependencies.
|
155
164
|
# Reference: https://github.com/skypilot-org/skypilot/issues/4097
|
156
|
-
f'{
|
165
|
+
f'{SKY_UV_CMD} venv {SKY_REMOTE_PYTHON_ENV};'
|
157
166
|
f'echo "$(echo {SKY_REMOTE_PYTHON_ENV})/bin/python" > {SKY_PYTHON_PATH_FILE};'
|
158
167
|
)
|
159
168
|
|
160
169
|
_sky_version = str(version.parse(sky.__version__))
|
161
170
|
RAY_STATUS = f'RAY_ADDRESS=127.0.0.1:{SKY_REMOTE_RAY_PORT} {SKY_RAY_CMD} status'
|
162
|
-
|
163
|
-
# installed. {var} will be replaced with the actual value in
|
164
|
-
# backend_utils.write_cluster_config.
|
165
|
-
RAY_SKYPILOT_INSTALLATION_COMMANDS = (
|
171
|
+
RAY_INSTALLATION_COMMANDS = (
|
166
172
|
'mkdir -p ~/sky_workdir && mkdir -p ~/.sky/sky_app;'
|
167
|
-
# Disable the pip version check to avoid the warning message, which makes
|
168
|
-
# the output hard to read.
|
169
|
-
'export PIP_DISABLE_PIP_VERSION_CHECK=1;'
|
170
173
|
# Print the PATH in provision.log to help debug PATH issues.
|
171
174
|
'echo PATH=$PATH; '
|
172
175
|
# Install setuptools<=69.5.1 to avoid the issue with the latest setuptools
|
173
176
|
# causing the error:
|
174
177
|
# ImportError: cannot import name 'packaging' from 'pkg_resources'"
|
175
|
-
f'{
|
178
|
+
f'{SKY_UV_PIP_CMD} install "setuptools<70"; '
|
176
179
|
# Backward compatibility for ray upgrade (#3248): do not upgrade ray if the
|
177
180
|
# ray cluster is already running, to avoid the ray cluster being restarted.
|
178
181
|
#
|
@@ -186,10 +189,10 @@ RAY_SKYPILOT_INSTALLATION_COMMANDS = (
|
|
186
189
|
# latest ray port 6380, but those existing cluster launched before #1790
|
187
190
|
# that has ray cluster on the default port 6379 will be upgraded and
|
188
191
|
# restarted.
|
189
|
-
f'{
|
192
|
+
f'{SKY_UV_PIP_CMD} list | grep "ray " | '
|
190
193
|
f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null '
|
191
194
|
f'|| {RAY_STATUS} || '
|
192
|
-
f'{
|
195
|
+
f'{SKY_UV_PIP_CMD} install -U ray[default]=={SKY_REMOTE_RAY_VERSION}; ' # pylint: disable=line-too-long
|
193
196
|
# In some envs, e.g. pip does not have permission to write under /opt/conda
|
194
197
|
# ray package will be installed under ~/.local/bin. If the user's PATH does
|
195
198
|
# not include ~/.local/bin (the pip install will have the output: `WARNING:
|
@@ -202,24 +205,31 @@ RAY_SKYPILOT_INSTALLATION_COMMANDS = (
|
|
202
205
|
# Writes ray path to file if it does not exist or the file is empty.
|
203
206
|
f'[ -s {SKY_RAY_PATH_FILE} ] || '
|
204
207
|
f'{{ {ACTIVATE_SKY_REMOTE_PYTHON_ENV} && '
|
205
|
-
f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; '
|
206
|
-
|
207
|
-
|
208
|
+
f'which ray > {SKY_RAY_PATH_FILE} || exit 1; }}; ')
|
209
|
+
|
210
|
+
SKYPILOT_WHEEL_INSTALLATION_COMMANDS = (
|
211
|
+
f'{{ {SKY_UV_PIP_CMD} list | grep "skypilot " && '
|
208
212
|
'[ "$(cat ~/.sky/wheels/current_sky_wheel_hash)" == "{sky_wheel_hash}" ]; } || ' # pylint: disable=line-too-long
|
209
|
-
f'{{ {
|
210
|
-
f'{
|
213
|
+
f'{{ {SKY_UV_PIP_CMD} uninstall skypilot; '
|
214
|
+
f'{SKY_UV_PIP_CMD} install "$(echo ~/.sky/wheels/{{sky_wheel_hash}}/'
|
211
215
|
f'skypilot-{_sky_version}*.whl)[{{cloud}}, remote]" && '
|
212
216
|
'echo "{sky_wheel_hash}" > ~/.sky/wheels/current_sky_wheel_hash || '
|
213
|
-
'exit 1; }; '
|
214
|
-
# END SkyPilot package check and installation
|
217
|
+
'exit 1; }; ')
|
215
218
|
|
219
|
+
# Install ray and skypilot on the remote cluster if they are not already
|
220
|
+
# installed. {var} will be replaced with the actual value in
|
221
|
+
# backend_utils.write_cluster_config.
|
222
|
+
RAY_SKYPILOT_INSTALLATION_COMMANDS = (
|
223
|
+
f'{RAY_INSTALLATION_COMMANDS} '
|
224
|
+
f'{SKYPILOT_WHEEL_INSTALLATION_COMMANDS} '
|
216
225
|
# Only patch ray when the ray version is the same as the expected version.
|
217
226
|
# The ray installation above can be skipped due to the existing ray cluster
|
218
227
|
# for backward compatibility. In this case, we should not patch the ray
|
219
228
|
# files.
|
220
|
-
f'{
|
221
|
-
f'
|
222
|
-
'
|
229
|
+
f'{SKY_UV_PIP_CMD} list | grep "ray " | '
|
230
|
+
f'grep {SKY_REMOTE_RAY_VERSION} 2>&1 > /dev/null && '
|
231
|
+
f'{{ {SKY_PYTHON_CMD} -c '
|
232
|
+
'"from sky.skylet.ray_patches import patch; patch()" || exit 1; }; ')
|
223
233
|
|
224
234
|
# The name for the environment variable that stores SkyPilot user hash, which
|
225
235
|
# is mainly used to make sure sky commands runs on a VM launched by SkyPilot
|
@@ -222,7 +222,9 @@ provider:
|
|
222
222
|
- protocol: TCP
|
223
223
|
port: 22
|
224
224
|
targetPort: 22
|
225
|
-
# Service that maps to the head node of the Ray cluster
|
225
|
+
# Service that maps to the head node of the Ray cluster, so that the
|
226
|
+
# worker nodes can find the head node using
|
227
|
+
# {{cluster_name_on_cloud}}-head.{{k8s_namespace}}.svc.cluster.local
|
226
228
|
- apiVersion: v1
|
227
229
|
kind: Service
|
228
230
|
metadata:
|
@@ -235,18 +237,12 @@ provider:
|
|
235
237
|
# names.
|
236
238
|
name: {{cluster_name_on_cloud}}-head
|
237
239
|
spec:
|
240
|
+
# Create a headless service so that the head node can be reached by
|
241
|
+
# the worker nodes with any port number.
|
242
|
+
clusterIP: None
|
238
243
|
# This selector must match the head node pod's selector below.
|
239
244
|
selector:
|
240
245
|
component: {{cluster_name_on_cloud}}-head
|
241
|
-
ports:
|
242
|
-
- name: client
|
243
|
-
protocol: TCP
|
244
|
-
port: 10001
|
245
|
-
targetPort: 10001
|
246
|
-
- name: dashboard
|
247
|
-
protocol: TCP
|
248
|
-
port: 8265
|
249
|
-
targetPort: 8265
|
250
246
|
|
251
247
|
# Specify the pod type for the ray head node (as configured below).
|
252
248
|
head_node_type: ray_head_default
|
@@ -280,7 +276,6 @@ available_node_types:
|
|
280
276
|
# serviceAccountName: skypilot-service-account
|
281
277
|
serviceAccountName: {{k8s_service_account_name}}
|
282
278
|
automountServiceAccountToken: {{k8s_automount_sa_token}}
|
283
|
-
|
284
279
|
restartPolicy: Never
|
285
280
|
|
286
281
|
# Add node selector if GPU/TPUs are requested:
|
@@ -322,18 +317,147 @@ available_node_types:
|
|
322
317
|
- name: ray-node
|
323
318
|
imagePullPolicy: IfNotPresent
|
324
319
|
image: {{image_id}}
|
320
|
+
env:
|
321
|
+
- name: SKYPILOT_POD_NODE_TYPE
|
322
|
+
valueFrom:
|
323
|
+
fieldRef:
|
324
|
+
fieldPath: metadata.labels['ray-node-type']
|
325
325
|
# Do not change this command - it keeps the pod alive until it is
|
326
326
|
# explicitly killed.
|
327
327
|
command: ["/bin/bash", "-c", "--"]
|
328
328
|
args:
|
329
329
|
- |
|
330
330
|
# Helper function to conditionally use sudo
|
331
|
+
# TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
|
331
332
|
prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
|
333
|
+
[ $(id -u) -eq 0 ] && function sudo() { "$@"; } || true;
|
334
|
+
|
335
|
+
STEPS=("apt-ssh-setup" "runtime-setup" "env-setup")
|
332
336
|
|
333
|
-
# Run apt update
|
337
|
+
# STEP 1: Run apt update, install missing packages, and set up ssh.
|
334
338
|
(
|
339
|
+
(
|
335
340
|
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > /tmp/apt-update.log 2>&1 || \
|
336
341
|
echo "Warning: apt-get update failed. Continuing anyway..." >> /tmp/apt-update.log
|
342
|
+
PACKAGES="rsync curl netcat gcc patch pciutils fuse openssh-server";
|
343
|
+
|
344
|
+
# Separate packages into two groups: packages that are installed first
|
345
|
+
# so that curl and rsync are available sooner to unblock the following
|
346
|
+
# conda installation and rsync.
|
347
|
+
set -e
|
348
|
+
INSTALL_FIRST="";
|
349
|
+
MISSING_PACKAGES="";
|
350
|
+
for pkg in $PACKAGES; do
|
351
|
+
if [ "$pkg" == "netcat" ]; then
|
352
|
+
if ! dpkg -l | grep -q "^ii \(netcat\|netcat-openbsd\|netcat-traditional\) "; then
|
353
|
+
INSTALL_FIRST="$INSTALL_FIRST netcat-openbsd";
|
354
|
+
fi
|
355
|
+
elif ! dpkg -l | grep -q "^ii $pkg "; then
|
356
|
+
if [ "$pkg" == "curl" ] || [ "$pkg" == "rsync" ]; then
|
357
|
+
INSTALL_FIRST="$INSTALL_FIRST $pkg";
|
358
|
+
else
|
359
|
+
MISSING_PACKAGES="$MISSING_PACKAGES $pkg";
|
360
|
+
fi
|
361
|
+
fi
|
362
|
+
done;
|
363
|
+
if [ ! -z "$INSTALL_FIRST" ]; then
|
364
|
+
echo "Installing core packages: $INSTALL_FIRST";
|
365
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $INSTALL_FIRST;
|
366
|
+
fi;
|
367
|
+
# SSH and other packages are not necessary, so we disable set -e
|
368
|
+
set +e
|
369
|
+
|
370
|
+
if [ ! -z "$MISSING_PACKAGES" ]; then
|
371
|
+
echo "Installing missing packages: $MISSING_PACKAGES";
|
372
|
+
DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y $MISSING_PACKAGES;
|
373
|
+
fi;
|
374
|
+
$(prefix_cmd) mkdir -p /var/run/sshd;
|
375
|
+
$(prefix_cmd) sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config;
|
376
|
+
$(prefix_cmd) sed "s@session\s*required\s*pam_loginuid.so@session optional pam_loginuid.so@g" -i /etc/pam.d/sshd;
|
377
|
+
cd /etc/ssh/ && $(prefix_cmd) ssh-keygen -A;
|
378
|
+
$(prefix_cmd) mkdir -p ~/.ssh;
|
379
|
+
$(prefix_cmd) chown -R $(whoami) ~/.ssh;
|
380
|
+
$(prefix_cmd) chmod 700 ~/.ssh;
|
381
|
+
$(prefix_cmd) cat /etc/secret-volume/ssh-publickey* > ~/.ssh/authorized_keys;
|
382
|
+
$(prefix_cmd) chmod 644 ~/.ssh/authorized_keys;
|
383
|
+
$(prefix_cmd) service ssh restart;
|
384
|
+
$(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
|
385
|
+
) > /tmp/${STEPS[0]}.log 2>&1 || {
|
386
|
+
echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed
|
387
|
+
cat /tmp/${STEPS[0]}.log
|
388
|
+
exit 1
|
389
|
+
}
|
390
|
+
) &
|
391
|
+
|
392
|
+
# STEP 2: Install conda, ray and skypilot (for dependencies); start
|
393
|
+
# ray cluster.
|
394
|
+
(
|
395
|
+
(
|
396
|
+
set -e
|
397
|
+
mkdir -p ~/.sky
|
398
|
+
# Wait for `curl` package to be installed before installing conda
|
399
|
+
# and ray.
|
400
|
+
until dpkg -l | grep -q "^ii curl "; do
|
401
|
+
sleep 0.1
|
402
|
+
echo "Waiting for curl package to be installed..."
|
403
|
+
done
|
404
|
+
{{ conda_installation_commands }}
|
405
|
+
{{ ray_installation_commands }}
|
406
|
+
VIRTUAL_ENV=~/skypilot-runtime ~/.local/bin/uv pip install skypilot[kubernetes,remote]
|
407
|
+
touch /tmp/ray_skypilot_installation_complete
|
408
|
+
echo "=== Ray and skypilot installation completed ==="
|
409
|
+
|
410
|
+
# Disable set -e, as we have some commands that are ok to fail
|
411
|
+
# after the ray start.
|
412
|
+
# TODO(zhwu): this is a hack, we should fix the commands that are
|
413
|
+
# ok to fail.
|
414
|
+
if [ "$SKYPILOT_POD_NODE_TYPE" == "head" ]; then
|
415
|
+
set +e
|
416
|
+
{{ ray_head_start_command }}
|
417
|
+
else
|
418
|
+
# Start ray worker on the worker pod.
|
419
|
+
# Wait until the head pod is available with an IP address
|
420
|
+
export SKYPILOT_RAY_HEAD_IP="{{cluster_name_on_cloud}}-head.{{k8s_namespace}}.svc.cluster.local"
|
421
|
+
export SKYPILOT_RAY_PORT={{skypilot_ray_port}}
|
422
|
+
# Wait until the ray cluster is started on the head pod
|
423
|
+
until dpkg -l | grep -q "^ii \(netcat\|netcat-openbsd\|netcat-traditional\) "; do
|
424
|
+
sleep 0.1
|
425
|
+
echo "Waiting for netcat package to be installed..."
|
426
|
+
done
|
427
|
+
until nc -z -w 1 ${SKYPILOT_RAY_HEAD_IP} ${SKYPILOT_RAY_PORT}; do
|
428
|
+
sleep 0.1
|
429
|
+
done
|
430
|
+
|
431
|
+
set +e
|
432
|
+
{{ ray_worker_start_command }}
|
433
|
+
fi
|
434
|
+
) > /tmp/${STEPS[1]}.log 2>&1 || {
|
435
|
+
echo "Error: ${STEPS[1]} failed. Continuing anyway..." > /tmp/${STEPS[1]}.failed
|
436
|
+
cat /tmp/${STEPS[1]}.log
|
437
|
+
exit 1
|
438
|
+
}
|
439
|
+
) &
|
440
|
+
|
441
|
+
|
442
|
+
# STEP 3: Set up environment variables; this should be relatively fast.
|
443
|
+
(
|
444
|
+
(
|
445
|
+
set -e
|
446
|
+
if [ $(id -u) -eq 0 ]; then
|
447
|
+
echo 'alias sudo=""' >> ~/.bashrc; echo succeed;
|
448
|
+
else
|
449
|
+
if command -v sudo >/dev/null 2>&1; then
|
450
|
+
timeout 2 sudo -l >/dev/null 2>&1 && echo succeed || { echo 52; exit 52; };
|
451
|
+
else
|
452
|
+
{ echo 52; exit 52; };
|
453
|
+
fi;
|
454
|
+
fi;
|
455
|
+
printenv | while IFS='=' read -r key value; do echo "export $key=\"$value\""; done > ~/container_env_var.sh && $(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh
|
456
|
+
) > /tmp/${STEPS[2]}.log 2>&1 || {
|
457
|
+
echo "Error: ${STEPS[2]} failed. Continuing anyway..." > /tmp/${STEPS[2]}.failed
|
458
|
+
cat /tmp/${STEPS[2]}.log
|
459
|
+
exit 1
|
460
|
+
}
|
337
461
|
) &
|
338
462
|
|
339
463
|
function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
|
@@ -441,42 +565,48 @@ setup_commands:
|
|
441
565
|
# Line 'sudo grep ..': set the number of threads per process to unlimited to avoid ray job submit getting stuck when the number of running ray jobs increases.
|
442
566
|
# Line 'mkdir -p ..': disable host key check
|
443
567
|
# Line 'python3 -c ..': patch the buggy ray files and enable `-o allow_other` option for `goofys`
|
568
|
+
# Line 'for step in ..': check if any failure indicator exists for the setup done in pod args and print the error message. This is only a best effort, as the
|
569
|
+
# commands in pod args are asynchronous and we cannot guarantee the failure indicators are created before the setup commands finish.
|
444
570
|
- |
|
445
|
-
PACKAGES="gcc patch pciutils rsync fuse curl";
|
446
|
-
MISSING_PACKAGES="";
|
447
|
-
for pkg in $PACKAGES; do
|
448
|
-
if ! dpkg -l | grep -q "^ii $pkg "; then
|
449
|
-
MISSING_PACKAGES="$MISSING_PACKAGES $pkg";
|
450
|
-
fi
|
451
|
-
done;
|
452
|
-
if [ ! -z "$MISSING_PACKAGES" ]; then
|
453
|
-
echo "Installing missing packages: $MISSING_PACKAGES";
|
454
|
-
sudo DEBIAN_FRONTEND=noninteractive apt-get install -y $MISSING_PACKAGES;
|
455
|
-
fi;
|
456
571
|
mkdir -p ~/.ssh; touch ~/.ssh/config;
|
457
572
|
{%- for initial_setup_command in initial_setup_commands %}
|
458
573
|
{{ initial_setup_command }}
|
459
574
|
{%- endfor %}
|
460
|
-
|
461
|
-
|
575
|
+
STEPS=("apt-ssh-setup" "runtime-setup" "env-setup")
|
576
|
+
start_epoch=$(date +%s);
|
577
|
+
echo "=== Logs for asynchronous ray and skypilot installation ===";
|
578
|
+
[ -f /tmp/ray_skypilot_installation_complete ] && cat /tmp/${STEPS[1]}.log ||
|
579
|
+
{ tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/ray_skypilot_installation_complete ] || [ -f /tmp/${STEPS[1]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
|
580
|
+
[ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
|
581
|
+
end_epoch=$(date +%s);
|
582
|
+
echo "=== Ray and skypilot dependencies installation completed in $(($end_epoch - $start_epoch)) secs ===";
|
583
|
+
start_epoch=$(date +%s);
|
584
|
+
{{ skypilot_wheel_installation_commands }}
|
585
|
+
end_epoch=$(date +%s);
|
586
|
+
echo "=== Skypilot wheel installation completed in $(($end_epoch - $start_epoch)) secs ===";
|
587
|
+
start_epoch=$(date +%s);
|
462
588
|
sudo touch ~/.sudo_as_admin_successful;
|
463
589
|
sudo bash -c 'rm -rf /etc/security/limits.d; echo "* soft nofile 1048576" >> /etc/security/limits.conf; echo "* hard nofile 1048576" >> /etc/security/limits.conf';
|
464
|
-
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf');
|
590
|
+
sudo grep -e '^DefaultTasksMax' /etc/systemd/system.conf || (sudo bash -c 'echo "DefaultTasksMax=infinity" >> /etc/systemd/system.conf');
|
591
|
+
ulimit -n 1048576;
|
465
592
|
mkdir -p ~/.ssh; (grep -Pzo -q "Host \*\n StrictHostKeyChecking no" ~/.ssh/config) || printf "Host *\n StrictHostKeyChecking no\n" >> ~/.ssh/config;
|
466
593
|
[ -f /etc/fuse.conf ] && sudo sed -i 's/#user_allow_other/user_allow_other/g' /etc/fuse.conf || (sudo sh -c 'echo "user_allow_other" > /etc/fuse.conf'); # This is needed for `-o allow_other` option for `goofys`;
|
467
|
-
|
468
|
-
|
469
|
-
|
470
|
-
|
471
|
-
|
472
|
-
|
473
|
-
|
474
|
-
|
475
|
-
|
476
|
-
|
477
|
-
|
478
|
-
|
479
|
-
|
594
|
+
end_epoch=$(date +%s);
|
595
|
+
echo "=== Setup system configs and fuse completed in $(($end_epoch - $start_epoch)) secs ===";
|
596
|
+
# Iterate every element of STEPS; a bare $STEPS expands to the first element only.
for step in "${STEPS[@]}"; do [ -f "/tmp/${step}.failed" ] && { echo "Error: /tmp/${step}.failed found:"; cat /tmp/${step}.log; exit 1; } || true; done;
|
597
|
+
{% if tpu_requested %}
|
598
|
+
# The /tmp/tpu_logs directory is where TPU-related logs, such as logs from
|
599
|
+
# the TPU runtime, are written. These capture runtime information about the
|
600
|
+
# TPU execution, including any warnings, errors, or general activity of
|
601
|
+
# the TPU driver. By default, the /tmp/tpu_logs directory is created with
|
602
|
+
# 755 permissions, and the user of the provisioned pod is not necessarily
|
603
|
+
# a root. Hence, we need to update the write permission so the logs can be
|
604
|
+
# properly written.
|
605
|
+
# TODO(Doyoung): Investigate to see why TPU workload fails to run without
|
606
|
+
# execution permission, such as granting 766 to log file. Check if it's a
|
607
|
+
# must and see if there's a workaround to grant minimum permission.
|
608
|
+
sudo chmod 777 /tmp/tpu_logs;
|
609
|
+
{% endif %}
|
480
610
|
|
481
611
|
# Format: `REMOTE_PATH : LOCAL_PATH`
|
482
612
|
file_mounts: {
|
sky/utils/subprocess_utils.py
CHANGED
@@ -2,9 +2,10 @@
|
|
2
2
|
from multiprocessing import pool
|
3
3
|
import os
|
4
4
|
import random
|
5
|
+
import resource
|
5
6
|
import subprocess
|
6
7
|
import time
|
7
|
-
from typing import Any, Callable, Iterable, List, Optional, Tuple, Union
|
8
|
+
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union
|
8
9
|
|
9
10
|
import colorama
|
10
11
|
import psutil
|
@@ -18,6 +19,8 @@ from sky.utils import ux_utils
|
|
18
19
|
|
19
20
|
logger = sky_logging.init_logger(__name__)
|
20
21
|
|
22
|
+
_fd_limit_warning_shown = False
|
23
|
+
|
21
24
|
|
22
25
|
@timeline.event
|
23
26
|
def run(cmd, **kwargs):
|
@@ -43,12 +46,54 @@ def run_no_outputs(cmd, **kwargs):
|
|
43
46
|
**kwargs)
|
44
47
|
|
45
48
|
|
46
|
-
def
|
47
|
-
|
49
|
+
def _get_thread_multiplier(cloud_str: Optional[str] = None) -> int:
|
50
|
+
# If using Kubernetes, we use 4x the number of cores.
|
51
|
+
if cloud_str and cloud_str.lower() == 'kubernetes':
|
52
|
+
return 4
|
53
|
+
return 1
|
54
|
+
|
55
|
+
|
56
|
+
def get_max_workers_for_file_mounts(common_file_mounts: Dict[str, str],
|
57
|
+
cloud_str: Optional[str] = None) -> int:
|
58
|
+
global _fd_limit_warning_shown
|
59
|
+
fd_limit, _ = resource.getrlimit(resource.RLIMIT_NOFILE)
|
60
|
+
|
61
|
+
# Raise warning for low fd_limit (only once)
|
62
|
+
if fd_limit < 1024 and not _fd_limit_warning_shown:
|
63
|
+
logger.warning(
|
64
|
+
f'Open file descriptor limit ({fd_limit}) is low. File sync to '
|
65
|
+
'remote clusters may be slow. Consider increasing the limit using '
|
66
|
+
'`ulimit -n <number>` or modifying system limits.')
|
67
|
+
_fd_limit_warning_shown = True
|
68
|
+
|
69
|
+
fd_per_rsync = 5
|
70
|
+
for src in common_file_mounts.values():
|
71
|
+
if os.path.isdir(src):
|
72
|
+
# Assume that each file/folder under src takes 5 file descriptors
|
73
|
+
# on average.
|
74
|
+
fd_per_rsync = max(fd_per_rsync, len(os.listdir(src)) * 5)
|
75
|
+
|
76
|
+
# Reserve some file descriptors for the system and other processes
|
77
|
+
fd_reserve = 100
|
78
|
+
|
79
|
+
max_workers = (fd_limit - fd_reserve) // fd_per_rsync
|
80
|
+
# At least 1 worker, and avoid too many workers overloading the system.
|
81
|
+
num_threads = get_parallel_threads(cloud_str)
|
82
|
+
max_workers = min(max(max_workers, 1), num_threads)
|
83
|
+
logger.debug(f'Using {max_workers} workers for file mounts.')
|
84
|
+
return max_workers
|
85
|
+
|
86
|
+
|
87
|
+
def get_parallel_threads(cloud_str: Optional[str] = None) -> int:
|
88
|
+
"""Returns the number of threads to use for parallel execution.
|
89
|
+
|
90
|
+
Args:
|
91
|
+
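cloud_str: Optional name of the cloud. 'kubernetes' (case-insensitive) multiplies the thread count by 4; any other value or None leaves it unchanged.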
|
92
|
+
"""
|
48
93
|
cpu_count = os.cpu_count()
|
49
94
|
if cpu_count is None:
|
50
95
|
cpu_count = 1
|
51
|
-
return max(4, cpu_count - 1)
|
96
|
+
return max(4, cpu_count - 1) * _get_thread_multiplier(cloud_str)
|
52
97
|
|
53
98
|
|
54
99
|
def run_in_parallel(func: Callable,
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.1
|
2
2
|
Name: skypilot-nightly
|
3
|
-
Version: 1.0.0.
|
3
|
+
Version: 1.0.0.dev20241122
|
4
4
|
Summary: SkyPilot: An intercloud broker for the clouds
|
5
5
|
Author: SkyPilot Team
|
6
6
|
License: Apache 2.0
|
@@ -33,45 +33,13 @@ Requires-Dist: PrettyTable>=2.0.0
|
|
33
33
|
Requires-Dist: python-dotenv
|
34
34
|
Requires-Dist: rich
|
35
35
|
Requires-Dist: tabulate
|
36
|
-
Requires-Dist:
|
36
|
+
Requires-Dist: typing_extensions
|
37
37
|
Requires-Dist: filelock>=3.6.0
|
38
38
|
Requires-Dist: packaging
|
39
39
|
Requires-Dist: psutil
|
40
40
|
Requires-Dist: pulp
|
41
41
|
Requires-Dist: pyyaml!=5.4.*,>3.13
|
42
42
|
Requires-Dist: requests
|
43
|
-
Provides-Extra: all
|
44
|
-
Requires-Dist: urllib3<2; extra == "all"
|
45
|
-
Requires-Dist: awscli>=1.27.10; extra == "all"
|
46
|
-
Requires-Dist: botocore>=1.29.10; extra == "all"
|
47
|
-
Requires-Dist: boto3>=1.26.1; extra == "all"
|
48
|
-
Requires-Dist: colorama<0.4.5; extra == "all"
|
49
|
-
Requires-Dist: azure-cli>=2.65.0; extra == "all"
|
50
|
-
Requires-Dist: azure-core>=1.31.0; extra == "all"
|
51
|
-
Requires-Dist: azure-identity>=1.19.0; extra == "all"
|
52
|
-
Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
|
53
|
-
Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
|
54
|
-
Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
|
55
|
-
Requires-Dist: msgraph-sdk; extra == "all"
|
56
|
-
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
|
57
|
-
Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
|
58
|
-
Requires-Dist: google-cloud-storage; extra == "all"
|
59
|
-
Requires-Dist: ibm-cloud-sdk-core; extra == "all"
|
60
|
-
Requires-Dist: ibm-vpc; extra == "all"
|
61
|
-
Requires-Dist: ibm-platform-services; extra == "all"
|
62
|
-
Requires-Dist: ibm-cos-sdk; extra == "all"
|
63
|
-
Requires-Dist: docker; extra == "all"
|
64
|
-
Requires-Dist: oci; extra == "all"
|
65
|
-
Requires-Dist: kubernetes>=20.0.0; extra == "all"
|
66
|
-
Requires-Dist: protobuf!=3.19.5,>=3.15.3; extra == "all"
|
67
|
-
Requires-Dist: pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3; extra == "all"
|
68
|
-
Requires-Dist: runpod>=1.5.1; extra == "all"
|
69
|
-
Requires-Dist: cudo-compute>=0.1.10; extra == "all"
|
70
|
-
Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
|
71
|
-
Requires-Dist: grpcio!=1.48.0,<=1.51.3,>=1.32.0; (python_version < "3.10" and sys_platform != "darwin") and extra == "all"
|
72
|
-
Requires-Dist: grpcio!=1.48.0,<=1.49.1,>=1.32.0; (python_version < "3.10" and sys_platform == "darwin") and extra == "all"
|
73
|
-
Requires-Dist: grpcio!=1.48.0,<=1.51.3,>=1.42.0; (python_version >= "3.10" and sys_platform != "darwin") and extra == "all"
|
74
|
-
Requires-Dist: grpcio!=1.48.0,<=1.49.1,>=1.42.0; (python_version >= "3.10" and sys_platform == "darwin") and extra == "all"
|
75
43
|
Provides-Extra: aws
|
76
44
|
Requires-Dist: urllib3<2; extra == "aws"
|
77
45
|
Requires-Dist: awscli>=1.27.10; extra == "aws"
|
@@ -87,18 +55,6 @@ Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "azure"
|
|
87
55
|
Requires-Dist: azure-storage-blob>=12.23.1; extra == "azure"
|
88
56
|
Requires-Dist: msgraph-sdk; extra == "azure"
|
89
57
|
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "azure"
|
90
|
-
Provides-Extra: cloudflare
|
91
|
-
Requires-Dist: urllib3<2; extra == "cloudflare"
|
92
|
-
Requires-Dist: awscli>=1.27.10; extra == "cloudflare"
|
93
|
-
Requires-Dist: botocore>=1.29.10; extra == "cloudflare"
|
94
|
-
Requires-Dist: boto3>=1.26.1; extra == "cloudflare"
|
95
|
-
Requires-Dist: colorama<0.4.5; extra == "cloudflare"
|
96
|
-
Provides-Extra: cudo
|
97
|
-
Requires-Dist: cudo-compute>=0.1.10; extra == "cudo"
|
98
|
-
Provides-Extra: docker
|
99
|
-
Requires-Dist: docker; extra == "docker"
|
100
|
-
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "docker"
|
101
|
-
Provides-Extra: fluidstack
|
102
58
|
Provides-Extra: gcp
|
103
59
|
Requires-Dist: google-api-python-client>=2.69.0; extra == "gcp"
|
104
60
|
Requires-Dist: google-cloud-storage; extra == "gcp"
|
@@ -108,27 +64,81 @@ Requires-Dist: ibm-vpc; extra == "ibm"
|
|
108
64
|
Requires-Dist: ibm-platform-services; extra == "ibm"
|
109
65
|
Requires-Dist: ibm-cos-sdk; extra == "ibm"
|
110
66
|
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "ibm"
|
111
|
-
Provides-Extra:
|
112
|
-
Requires-Dist:
|
67
|
+
Provides-Extra: docker
|
68
|
+
Requires-Dist: docker; extra == "docker"
|
69
|
+
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "docker"
|
113
70
|
Provides-Extra: lambda
|
114
71
|
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "lambda"
|
72
|
+
Provides-Extra: cloudflare
|
73
|
+
Requires-Dist: urllib3<2; extra == "cloudflare"
|
74
|
+
Requires-Dist: awscli>=1.27.10; extra == "cloudflare"
|
75
|
+
Requires-Dist: botocore>=1.29.10; extra == "cloudflare"
|
76
|
+
Requires-Dist: boto3>=1.26.1; extra == "cloudflare"
|
77
|
+
Requires-Dist: colorama<0.4.5; extra == "cloudflare"
|
78
|
+
Provides-Extra: scp
|
79
|
+
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "scp"
|
115
80
|
Provides-Extra: oci
|
116
81
|
Requires-Dist: oci; extra == "oci"
|
117
82
|
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "oci"
|
118
|
-
Provides-Extra:
|
83
|
+
Provides-Extra: kubernetes
|
84
|
+
Requires-Dist: kubernetes>=20.0.0; extra == "kubernetes"
|
119
85
|
Provides-Extra: remote
|
120
|
-
Requires-Dist: protobuf!=3.19.5,>=3.15.3; extra == "remote"
|
121
|
-
Requires-Dist: pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3; extra == "remote"
|
122
|
-
Requires-Dist: grpcio!=1.48.0,<=1.51.3,>=1.32.0; (python_version < "3.10" and sys_platform != "darwin") and extra == "remote"
|
123
86
|
Requires-Dist: grpcio!=1.48.0,<=1.49.1,>=1.32.0; (python_version < "3.10" and sys_platform == "darwin") and extra == "remote"
|
124
|
-
Requires-Dist: grpcio!=1.48.0,<=1.51.3,>=1.42.0; (python_version >= "3.10" and sys_platform != "darwin") and extra == "remote"
|
125
87
|
Requires-Dist: grpcio!=1.48.0,<=1.49.1,>=1.42.0; (python_version >= "3.10" and sys_platform == "darwin") and extra == "remote"
|
88
|
+
Requires-Dist: grpcio!=1.48.0,<=1.51.3,>=1.32.0; (python_version < "3.10" and sys_platform != "darwin") and extra == "remote"
|
89
|
+
Requires-Dist: grpcio!=1.48.0,<=1.51.3,>=1.42.0; (python_version >= "3.10" and sys_platform != "darwin") and extra == "remote"
|
90
|
+
Requires-Dist: protobuf!=3.19.5,>=3.15.3; extra == "remote"
|
91
|
+
Requires-Dist: pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3; extra == "remote"
|
126
92
|
Provides-Extra: runpod
|
127
93
|
Requires-Dist: runpod>=1.5.1; extra == "runpod"
|
128
|
-
Provides-Extra:
|
129
|
-
|
94
|
+
Provides-Extra: fluidstack
|
95
|
+
Provides-Extra: cudo
|
96
|
+
Requires-Dist: cudo-compute>=0.1.10; extra == "cudo"
|
97
|
+
Provides-Extra: paperspace
|
130
98
|
Provides-Extra: vsphere
|
131
99
|
Requires-Dist: pyvmomi==8.0.1.0.2; extra == "vsphere"
|
100
|
+
Provides-Extra: all
|
101
|
+
Requires-Dist: urllib3<2; extra == "all"
|
102
|
+
Requires-Dist: awscli>=1.27.10; extra == "all"
|
103
|
+
Requires-Dist: botocore>=1.29.10; extra == "all"
|
104
|
+
Requires-Dist: boto3>=1.26.1; extra == "all"
|
105
|
+
Requires-Dist: colorama<0.4.5; extra == "all"
|
106
|
+
Requires-Dist: azure-cli>=2.65.0; extra == "all"
|
107
|
+
Requires-Dist: azure-core>=1.31.0; extra == "all"
|
108
|
+
Requires-Dist: azure-identity>=1.19.0; extra == "all"
|
109
|
+
Requires-Dist: azure-mgmt-network>=27.0.0; extra == "all"
|
110
|
+
Requires-Dist: azure-mgmt-compute>=33.0.0; extra == "all"
|
111
|
+
Requires-Dist: azure-storage-blob>=12.23.1; extra == "all"
|
112
|
+
Requires-Dist: msgraph-sdk; extra == "all"
|
113
|
+
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
|
114
|
+
Requires-Dist: google-api-python-client>=2.69.0; extra == "all"
|
115
|
+
Requires-Dist: google-cloud-storage; extra == "all"
|
116
|
+
Requires-Dist: ibm-cloud-sdk-core; extra == "all"
|
117
|
+
Requires-Dist: ibm-vpc; extra == "all"
|
118
|
+
Requires-Dist: ibm-platform-services; extra == "all"
|
119
|
+
Requires-Dist: ibm-cos-sdk; extra == "all"
|
120
|
+
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
|
121
|
+
Requires-Dist: docker; extra == "all"
|
122
|
+
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
|
123
|
+
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
|
124
|
+
Requires-Dist: urllib3<2; extra == "all"
|
125
|
+
Requires-Dist: awscli>=1.27.10; extra == "all"
|
126
|
+
Requires-Dist: botocore>=1.29.10; extra == "all"
|
127
|
+
Requires-Dist: boto3>=1.26.1; extra == "all"
|
128
|
+
Requires-Dist: colorama<0.4.5; extra == "all"
|
129
|
+
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
|
130
|
+
Requires-Dist: oci; extra == "all"
|
131
|
+
Requires-Dist: ray[default]!=2.6.0,>=2.2.0; extra == "all"
|
132
|
+
Requires-Dist: kubernetes>=20.0.0; extra == "all"
|
133
|
+
Requires-Dist: grpcio!=1.48.0,<=1.49.1,>=1.32.0; (python_version < "3.10" and sys_platform == "darwin") and extra == "all"
|
134
|
+
Requires-Dist: grpcio!=1.48.0,<=1.49.1,>=1.42.0; (python_version >= "3.10" and sys_platform == "darwin") and extra == "all"
|
135
|
+
Requires-Dist: grpcio!=1.48.0,<=1.51.3,>=1.32.0; (python_version < "3.10" and sys_platform != "darwin") and extra == "all"
|
136
|
+
Requires-Dist: grpcio!=1.48.0,<=1.51.3,>=1.42.0; (python_version >= "3.10" and sys_platform != "darwin") and extra == "all"
|
137
|
+
Requires-Dist: protobuf!=3.19.5,>=3.15.3; extra == "all"
|
138
|
+
Requires-Dist: pydantic!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,<3; extra == "all"
|
139
|
+
Requires-Dist: runpod>=1.5.1; extra == "all"
|
140
|
+
Requires-Dist: cudo-compute>=0.1.10; extra == "all"
|
141
|
+
Requires-Dist: pyvmomi==8.0.1.0.2; extra == "all"
|
132
142
|
|
133
143
|
<p align="center">
|
134
144
|
<img alt="SkyPilot" src="https://raw.githubusercontent.com/skypilot-org/skypilot/master/docs/source/images/skypilot-wide-light-1k.png" width=55%>
|