skypilot-nightly 1.0.0.dev20250427__py3-none-any.whl → 1.0.0.dev20250429__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/adaptors/nebius.py +28 -40
- sky/backends/backend_utils.py +19 -2
- sky/backends/cloud_vm_ray_backend.py +33 -8
- sky/backends/local_docker_backend.py +1 -2
- sky/cli.py +91 -38
- sky/client/cli.py +91 -38
- sky/client/sdk.py +3 -2
- sky/clouds/aws.py +12 -6
- sky/clouds/azure.py +3 -0
- sky/clouds/cloud.py +8 -2
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +3 -0
- sky/clouds/fluidstack.py +3 -0
- sky/clouds/gcp.py +7 -0
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +42 -19
- sky/clouds/lambda_cloud.py +1 -0
- sky/clouds/nebius.py +18 -10
- sky/clouds/oci.py +6 -3
- sky/clouds/paperspace.py +2 -0
- sky/clouds/runpod.py +2 -0
- sky/clouds/scp.py +2 -0
- sky/clouds/service_catalog/constants.py +1 -1
- sky/clouds/service_catalog/kubernetes_catalog.py +7 -7
- sky/clouds/vast.py +2 -0
- sky/clouds/vsphere.py +2 -0
- sky/core.py +58 -29
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/favicon.ico +0 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/exceptions.py +6 -0
- sky/execution.py +19 -4
- sky/global_user_state.py +1 -0
- sky/optimizer.py +35 -11
- sky/provision/common.py +2 -5
- sky/provision/docker_utils.py +22 -16
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +276 -93
- sky/provision/kubernetes/network.py +1 -1
- sky/provision/kubernetes/utils.py +36 -24
- sky/provision/provisioner.py +6 -0
- sky/serve/replica_managers.py +51 -5
- sky/serve/serve_state.py +41 -0
- sky/serve/service.py +108 -63
- sky/server/common.py +6 -3
- sky/server/config.py +184 -0
- sky/server/requests/executor.py +17 -156
- sky/server/server.py +4 -4
- sky/setup_files/dependencies.py +0 -1
- sky/skylet/constants.py +7 -0
- sky/skypilot_config.py +27 -6
- sky/task.py +1 -1
- sky/templates/kubernetes-ray.yml.j2 +145 -15
- sky/templates/nebius-ray.yml.j2 +63 -0
- sky/utils/command_runner.py +17 -3
- sky/utils/command_runner.pyi +2 -0
- sky/utils/controller_utils.py +24 -0
- sky/utils/kubernetes/rsync_helper.sh +20 -4
- sky/utils/schemas.py +13 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/METADATA +2 -2
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/RECORD +73 -72
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/WHEEL +1 -1
- /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/top_level.txt +0 -0
@@ -258,7 +258,7 @@ available_node_types:
  # service is required.
  labels:
  parent: skypilot
- # component will be set for the head node pod to be the same as the head node service selector above if a
+ # component will be set for the head node pod to be the same as the head node service selector above if a
  skypilot-cluster: {{cluster_name_on_cloud}}
  # Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
  skypilot-ssh-jump: {{k8s_ssh_jump_name}}
@@ -267,18 +267,18 @@ available_node_types:
  {%- for label_key, label_value in labels.items() %}
  {{ label_key }}: {{ label_value|tojson }}
  {%- endfor %}
+ {% if high_availability %}
+ app: {{cluster_name_on_cloud}}
+ {% endif %}
  spec:
  # serviceAccountName: skypilot-service-account
  serviceAccountName: {{k8s_service_account_name}}
  automountServiceAccountToken: {{k8s_automount_sa_token}}
- restartPolicy: Never
+ restartPolicy: {{ "Always" if high_availability else "Never" }}

  # Add node selector if GPU/TPUs are requested:
- {% if (
+ {% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) %}
  nodeSelector:
- {% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %}
- {{k8s_acc_label_key}}: {{k8s_acc_label_value}}
- {% endif %}
  {% if k8s_topology_label_key is not none and k8s_topology_label_value is not none %}
  {{k8s_topology_label_key}}: {{k8s_topology_label_value}}
  {% endif %}
@@ -286,6 +286,19 @@ available_node_types:
  {{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}}
  {% endif %}
  {% endif %}
+ {% if (k8s_acc_label_key is not none and k8s_acc_label_values is not none) %}
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: {{k8s_acc_label_key}}
+ operator: In
+ values:
+ {% for label_value in k8s_acc_label_values %}
+ - {{label_value}}
+ {% endfor %}
+ {% endif %}

  {% if k8s_spot_label_key is not none %}
  tolerations:
@@ -311,6 +324,11 @@ available_node_types:
  path: {{k8s_fusermount_shared_dir}}
  type: DirectoryOrCreate
  {% endif %}
+ {% if high_availability %}
+ - name: {{k8s_high_availability_deployment_volume_mount_name}}
+ persistentVolumeClaim:
+ claimName: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
+ {% endif %}
  containers:
  - name: ray-node
  imagePullPolicy: IfNotPresent
@@ -331,15 +349,15 @@ available_node_types:
  # Do not change this command - it keeps the pod alive until it is
  # explicitly killed.
  command: ["/bin/bash", "-c", "--"]
- args:
+ args:
  - |
  # For backwards compatibility, we put a marker file in the pod
- # to indicate that the pod is running with the changes introduced
+ # to indicate that the pod is running with the changes introduced
  # in project nimbus: https://github.com/skypilot-org/skypilot/pull/4393
  # TODO: Remove this marker file and it's usage in setup_commands
  # after v0.10.0 release.
  touch /tmp/skypilot_is_nimbus
-
+
  # Helper function to conditionally use sudo
  # TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
  prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
@@ -382,7 +400,7 @@ available_node_types:
  fi;
  # SSH and other packages are not necessary, so we disable set -e
  set +e
-
+
  if [ ! -z "$MISSING_PACKAGES" ]; then
  # Install missing packages individually to avoid failure installation breaks the whole install process,
  # e.g. fuse3 is not available on some distributions.
@@ -435,7 +453,7 @@ available_node_types:
  $(prefix_cmd) rm -f /bin/fusermount-wrapper
  $(prefix_cmd) cp -p {{k8s_fusermount_shared_dir}}/fusermount-wrapper /bin/fusermount-wrapper
  fi
- {% endif %}
+ {% endif %}

  $(prefix_cmd) mkdir -p /var/run/sshd;
  $(prefix_cmd) sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config;
@@ -470,6 +488,7 @@ available_node_types:
  done
  {{ conda_installation_commands }}
  {{ ray_installation_commands }}
+
  VIRTUAL_ENV=~/skypilot-runtime ~/.local/bin/uv pip install skypilot[kubernetes,remote]
  touch /tmp/ray_skypilot_installation_complete
  echo "=== Ray and skypilot installation completed ==="
@@ -565,7 +584,7 @@ available_node_types:
  # File is already being monitored
  continue
  fi
-
+
  # Monitor the new file
  monitor_file $file &
  already_monitored="${already_monitored} ${file}"
@@ -573,6 +592,28 @@ available_node_types:
  sleep 0.1
  done
  }
+
+ {% if high_availability %}
+ mkdir -p {{k8s_high_availability_deployment_run_script_dir}}
+ if [ -f {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready ]; then
+ # ! Keep this aligned with `CloudVmRayBackend._setup()`
+ # Suppose all `task.setup` are the same for skyserve controller task.
+ # So be careful for compatibility issue once you change it.
+ chmod +x {{k8s_high_availability_deployment_setup_script_path}}
+ /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && {{k8s_high_availability_deployment_setup_script_path}} > /tmp/controller_recovery_setup_commands.log 2>&1"
+ echo "=== Controller setup commands completed for recovery ==="
+
+ for file in {{k8s_high_availability_deployment_run_script_dir}}/*; do
+ # ! Keep this aligned with `CloudVmRayBackend._execute()`
+ chmod +x $file
+ /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && $file > /tmp/task_run_$(basename $file).log 2>&1"
+ echo "=== Controller task run for service (file: $file) completed for recovery ==="
+ done
+ fi
+
+ touch {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready
+ {% endif %}
+
  trap : TERM INT; log_tail || sleep infinity & wait

  ports:
@@ -593,6 +634,10 @@ available_node_types:
  # /tmp which cause slowdowns if is not a shared memory volume.
  - mountPath: /dev/shm
  name: dshm
+ {% if high_availability %}
+ - name: {{k8s_high_availability_deployment_volume_mount_name}}
+ mountPath: {{k8s_high_availability_deployment_volume_mount_path}}
+ {% endif %}
  {% if k8s_fuse_device_required %}
  - name: fusermount-shared-dir
  mountPath: {{k8s_fusermount_shared_dir}}
@@ -616,7 +661,92 @@ available_node_types:
  {{k8s_resource_key}}: {{accelerator_count}}
  {% endif %}
  {% endif %}
-
+
+ {% if high_availability %}
+ pvc_spec:
+ apiVersion: v1
+ kind: PersistentVolumeClaim
+ metadata:
+ name: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
+ namespace: {{k8s_namespace}}
+ spec:
+ accessModes:
+ - ReadWriteOnce # Our controller pod is singleton
+ {% if k8s_high_availability_storage_class_name is not none %}
+ storageClassName: {{k8s_high_availability_storage_class_name}}
+ {% endif %}
+ resources:
+ requests:
+ storage: {{disk_size}}Gi
+
+ deployment_spec:
+ apiVersion: apps/v1
+ kind: Deployment
+ metadata:
+ name: {{cluster_name_on_cloud}}-deployment
+ namespace: {{k8s_namespace}}
+ spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: {{cluster_name_on_cloud}}
+ template:
+ # The only difference between the pod spec and this section is the initContainers
+ metadata:
+ # should be replaced by pod metadata
+ spec:
+ securityContext:
+ fsGroup: 1000
+ # To prevent the home dir provided by the docker image from being overriden by pvc mounting,
+ # we use initContainers to copy it first to /mnt/home, which will later be mounted to home dir.
+ initContainers:
+ - name: init-copy-home
+ image: {{image_id}}
+ command: ["/bin/sh", "-c"]
+ args:
+ - |
+ # Define path for the marker file created by the main container upon successful startup.
+ # This file persists in the PVC across Pod restarts.
+ MARKER_FILE="/mnt/home/k8s_container_ready"
+ SOURCE_PATH="{{k8s_high_availability_deployment_volume_mount_path}}"
+ DEST_PATH="/mnt/home"
+
+ # We only need to copy the initial home directory contents from the image
+ # the *first* time a Pod uses a *new* PVC.
+ # On subsequent Pod starts (e.g., after a crash or update), the PVC
+ # already contains the necessary data (and potentially user modifications).
+ # The presence of MARKER_FILE (created by the main container in a previous
+ # successful run) indicates the PVC is already initialized. Checking for
+ # it prevents unnecessary and time-consuming rsync operations on every restart.
+ if [ ! -f "$MARKER_FILE" ]; then
+ echo "Marker '$MARKER_FILE' not found. PVC likely needs initialization."
+ echo "Copying initial home directory from image ($SOURCE_PATH/) to PVC ($DEST_PATH)..."
+
+ # Use rsync with -rl (recursive, links) instead of -a (archive).
+ # This avoids preserving times (-t) and permissions (-p) implied by -a,
+ # which caused 'Operation not permitted' errors on the PVC root directory (/mnt/home).
+ # Owner/group preservation (-o, -g) is also skipped (default for -rl), ensuring
+ # files are owned by the container's user/group.
+ rsync -rl "$SOURCE_PATH/" "$DEST_PATH"
+
+ # Check if rsync failed
+ if [ $? -ne 0 ]; then
+ echo "ERROR: rsync failed during home directory initialization." >&2
+ exit 1 # Exit initContainer with error if copy fails
+ fi
+ echo "Home directory initialization copy complete."
+ else
+ # If marker exists, skip the copy
+ echo "Marker '$MARKER_FILE' found. Skipping initial home directory copy."
+ fi
+ echo "Current contents of $DEST_PATH:"
+ ls -la "$DEST_PATH"
+ volumeMounts:
+ # Mount the persistent volume claim into the initContainer
+ - name: {{k8s_high_availability_deployment_volume_mount_name}}
+ mountPath: /mnt/home # Temporary mount point for initialization
+ # should be replaced by pod spec
+ {% endif %}
  setup_commands:
  # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
  # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
@@ -636,7 +766,7 @@ setup_commands:
  echo "=== Logs for asynchronous ray and skypilot installation ===";
  if [ -f /tmp/skypilot_is_nimbus ]; then
  echo "=== Logs for asynchronous ray and skypilot installation ===";
- [ -f /tmp/ray_skypilot_installation_complete ] && cat /tmp/${STEPS[1]}.log ||
+ [ -f /tmp/ray_skypilot_installation_complete ] && cat /tmp/${STEPS[1]}.log ||
  { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/ray_skypilot_installation_complete ]; do sleep 0.5; done; kill $TAIL_PID || true; };
  [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
  fi
@@ -666,7 +796,7 @@ setup_commands:
  # properly written.
  # TODO(Doyoung): Investigate to see why TPU workload fails to run without
  # execution permission, such as granting 766 to log file. Check if it's a
- # must and see if there's a workaround to grant minimum permission.
+ # must and see if there's a workaround to grant minimum permission.
  sudo chmod 777 /tmp/tpu_logs;
  {% endif %}

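When `high_availability` is enabled, the template above renders a `pvc_spec` and a `deployment_spec` alongside the pod spec, so the controller's home directory outlives pod restarts. A rough way to see what gets created on the cluster, sketched with placeholder names (`<cluster>` stands for `{{cluster_name_on_cloud}}`, `<volume>` for `{{k8s_high_availability_deployment_volume_mount_name}}`, and the namespace is illustrative):

# Sketch only: inspect the HA controller objects rendered from this template.
kubectl get deployment <cluster>-deployment -n <namespace>
kubectl get pvc <cluster>-<volume> -n <namespace>
# The pod carries the `app: <cluster>` label added above, matching the Deployment selector.
kubectl get pod -l app=<cluster> -n <namespace>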
sky/templates/nebius-ray.yml.j2
CHANGED
@@ -10,6 +10,27 @@ provider:
  module: sky.provision.nebius
  region: "{{region}}"

+ {%- if docker_image is not none %}
+ docker:
+ image: {{docker_image}}
+ container_name: {{docker_container_name}}
+ run_options:
+ - --ulimit nofile=1048576:1048576
+ {%- for run_option in docker_run_options %}
+ - {{run_option}}
+ {%- endfor %}
+ {%- if docker_login_config is not none %}
+ docker_login_config:
+ username: |-
+ {{docker_login_config.username}}
+ password: |-
+ {{docker_login_config.password}}
+ server: |-
+ {{docker_login_config.server}}
+ {%- endif %}
+ {%- endif %}
+
+
  auth:
  ssh_user: ubuntu
  ssh_private_key: {{ssh_private_key}}
@@ -22,6 +43,48 @@ available_node_types:
  ImageId: {{image_id}}
  DiskSize: {{disk_size}}
  UserData: |
+ {%- if docker_image is not none %}
+ runcmd:
+ - sudo sed -i 's/^#\?AllowTcpForwarding.*/AllowTcpForwarding yes/' /etc/ssh/sshd_config
+ - systemctl restart sshd
+ {%- endif %}
+
+ {# Two available OS images:
+ 1. ubuntu22.04-driverless - requires Docker installation
+ 2. ubuntu22.04-cuda12 - comes with Docker pre-installed
+ To optimize deployment speed, Docker is only installed when using ubuntu22.04-driverless #}
+ {%- if docker_image is not none and image_id == 'ubuntu22.04-driverless' %}
+ apt:
+ sources:
+ docker.list:
+ source: deb [arch=amd64] https://download.docker.com/linux/ubuntu $RELEASE stable
+ keyid: 9DC858229FC7DD38854AE2D88D81803C0EBFCD88
+
+ packages:
+ - apt-transport-https
+ - ca-certificates
+ - curl
+ - gnupg-agent
+ - software-properties-common
+ - docker-ce
+ - docker-ce-cli
+ - containerd.io
+
+ # Enable ipv4 forwarding, required on CIS hardened machines
+ write_files:
+ - path: /etc/sysctl.d/enabled_ipv4_forwarding.conf
+ content: |
+ net.ipv4.conf.all.forwarding=1
+
+ # create the docker group
+ groups:
+ - docker
+
+ # Add default auto created user to docker group
+ system_info:
+ default_user:
+ groups: [docker]
+ {%- endif %}
  users:
  - name: skypilot:ssh_user
  shell: /bin/bash
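With the `docker:` provider section and the cloud-init Docker installation above, Nebius instances can now run tasks inside a container image. A minimal sketch of exercising it from the CLI (the image and GPU type are placeholders, not values taken from this diff):

# Sketch only: launch a containerized task on Nebius using the new docker support.
sky launch --cloud nebius --gpus H100:1 --image-id docker:nvcr.io/nvidia/pytorch:24.05-py3 -- nvidia-smi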
sky/utils/command_runner.py
CHANGED
@@ -325,6 +325,7 @@ class CommandRunner:
  direction = 'up' if up else 'down'
  error_msg = (f'Failed to rsync {direction}: {source} -> {target}. '
  'Ensure that the network is stable, then retry.')
+
  subprocess_utils.handle_returncode(returncode,
  command,
  error_msg,
@@ -718,6 +719,7 @@ class KubernetesCommandRunner(CommandRunner):
  def __init__(
  self,
  node: Tuple[Tuple[str, Optional[str]], str],
+ deployment: Optional[str] = None,
  **kwargs,
  ):
  """Initialize KubernetesCommandRunner.
@@ -733,11 +735,19 @@ class KubernetesCommandRunner(CommandRunner):
  del kwargs
  super().__init__(node)
  (self.namespace, self.context), self.pod_name = node
+ self.deployment = deployment

  @property
  def node_id(self) -> str:
  return f'{self.context}-{self.namespace}-{self.pod_name}'

+ @property
+ def kube_identifier(self) -> str:
+ if self.deployment is not None:
+ return f'deployment/{self.deployment}'
+ else:
+ return f'pod/{self.pod_name}'
+
  def port_forward_command(self,
  port_forward: List[Tuple[int, int]],
  connect_timeout: int = 1) -> List[str]:
@@ -758,11 +768,12 @@ class KubernetesCommandRunner(CommandRunner):
  kubectl_args += ['--context', self.context]
  local_port, remote_port = port_forward[0]
  local_port_str = f'{local_port}' if local_port is not none else ''
+
  kubectl_cmd = [
  'kubectl',
  *kubectl_args,
  'port-forward',
- f'pod/{self.pod_name}',
+ self.kube_identifier,
  f'{local_port_str}:{remote_port}',
  ]
  return kubectl_cmd
@@ -785,7 +796,8 @@ class KubernetesCommandRunner(CommandRunner):
  source_bashrc: bool = False,
  skip_num_lines: int = 0,
  **kwargs) -> Union[int, Tuple[int, str, str]]:
- """Uses 'kubectl exec' to run 'cmd' on a pod by its
+ """Uses 'kubectl exec' to run 'cmd' on a pod or deployment by its
+ name and namespace.

  Args:
  cmd: The command to run.
@@ -828,7 +840,9 @@ class KubernetesCommandRunner(CommandRunner):
  # case, need to set KUBECONFIG to /dev/null to avoid using kubeconfig.
  if self.context is None:
  kubectl_args += ['--kubeconfig', '/dev/null']
-
+
+ kubectl_args += [self.kube_identifier]
+
  if ssh_mode == SshMode.LOGIN:
  assert isinstance(cmd, list), 'cmd must be a list for login mode.'
  base_cmd = ['kubectl', 'exec', '-it', *kubectl_args, '--']
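With `kube_identifier`, the same runner can target either a bare pod or the HA controller's Deployment; the generated `kubectl` calls differ only in the resource prefix. Roughly (names, namespace, and context are placeholders, and the exact argument ordering in the real runner may differ):

# Pod-backed cluster: kube_identifier == "pod/<pod_name>"
kubectl -n my-namespace --context my-context port-forward pod/my-cluster-head 10022:22
kubectl exec -it -n my-namespace --context my-context pod/my-cluster-head -- bash --login
# HA controller backed by a Deployment: kube_identifier == "deployment/<deployment_name>"
kubectl -n my-namespace --context my-context port-forward deployment/my-controller-deployment 10022:22
kubectl exec -it -n my-namespace --context my-context deployment/my-controller-deployment -- bash --login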
sky/utils/command_runner.pyi
CHANGED
sky/utils/controller_utils.py
CHANGED
@@ -193,6 +193,30 @@ class Controllers(enum.Enum):
  return None


+ def high_availability_specified(cluster_name: Optional[str],
+ skip_warning: bool = True) -> bool:
+ """Check if the controller high availability is specified in user config.
+ """
+ controller = Controllers.from_name(cluster_name)
+ if controller is None:
+ return False
+
+ if skypilot_config.loaded():
+ high_availability = skypilot_config.get_nested(
+ (controller.value.controller_type, 'controller',
+ 'high_availability'), False)
+ if high_availability:
+ if controller.value.controller_type != 'serve':
+ if not skip_warning:
+ print(f'{colorama.Fore.RED}High availability controller is'
+ 'only supported for SkyServe controller. It cannot'
+ f'be enabled for {controller.value.name}.'
+ f'Skipping this flag.{colorama.Style.RESET_ALL}')
+ else:
+ return True
+ return False
+
+
  # Install cli dependencies. Not using SkyPilot wheels because the wheel
  # can be cleaned up by another process.
  def _get_cloud_dependencies_installation_commands(
sky/utils/kubernetes/rsync_helper.sh
CHANGED
@@ -1,3 +1,4 @@
+ #!/bin/bash
  # We need to determine the pod, namespace and context from the args
  # For backward compatibility, we use + as the separator between namespace and context and add handling when context is not provided
  if [ "$1" = "-l" ]; then
@@ -7,7 +8,7 @@ if [ "$1" = "-l" ]; then
  pod=$1
  shift
  encoded_namespace_context=$1
- shift
+ shift # Shift past the encoded namespace+context
  echo "pod: $pod" >&2
  # Revert the encoded namespace+context to the original string.
  namespace_context=$(echo "$encoded_namespace_context" | sed 's|%40|@|g' | sed 's|%3A|:|g' | sed 's|%2B|+|g' | sed 's|%2F|/|g')
@@ -16,7 +17,7 @@ else
  # pod@namespace+context ...
  # used by openrsync
  encoded_pod_namespace_context=$1
- shift
+ shift # Shift past the pod@namespace+context
  pod_namespace_context=$(echo "$encoded_pod_namespace_context" | sed 's|%40|@|g' | sed 's|%3A|:|g' | sed 's|%2B|+|g' | sed 's|%2F|/|g')
  echo "pod_namespace_context: $pod_namespace_context" >&2
  pod=$(echo $pod_namespace_context | cut -d@ -f1)
@@ -24,16 +25,31 @@ else
  namespace_context=$(echo $pod_namespace_context | cut -d@ -f2-)
  echo "namespace_context: $namespace_context" >&2
  fi
+
  namespace=$(echo $namespace_context | cut -d+ -f1)
  echo "namespace: $namespace" >&2
  context=$(echo $namespace_context | grep '+' >/dev/null && echo $namespace_context | cut -d+ -f2- || echo "")
  echo "context: $context" >&2
  context_lower=$(echo "$context" | tr '[:upper:]' '[:lower:]')

+ # Check if the resource is a pod or a deployment (or other type)
+ if [[ "$pod" == *"/"* ]]; then
+ # Format is resource_type/resource_name
+ echo "Resource contains type: $pod" >&2
+ resource_type=$(echo $pod | cut -d/ -f1)
+ resource_name=$(echo $pod | cut -d/ -f2)
+ echo "Resource type: $resource_type, Resource name: $resource_name" >&2
+ else
+ # For backward compatibility or simple pod name, assume it's a pod
+ resource_type="pod"
+ resource_name=$pod
+ echo "Assuming resource is a pod: $resource_name" >&2
+ fi
+
  if [ -z "$context" ] || [ "$context_lower" = "none" ]; then
  # If context is none, it means we are using incluster auth. In this case,
  # use need to set KUBECONFIG to /dev/null to avoid using kubeconfig file.
- kubectl exec -i $
+ kubectl exec -i "$resource_type/$resource_name" -n "$namespace" --kubeconfig=/dev/null -- "$@"
  else
- kubectl exec -i $
+ kubectl exec -i "$resource_type/$resource_name" -n "$namespace" --context="$context" -- "$@"
  fi
sky/utils/schemas.py
CHANGED
@@ -759,6 +759,9 @@ def get_config_schema():
  'additionalProperties': False,
  'properties': {
  'resources': resources_schema,
+ 'high_availability': {
+ 'type': 'boolean',
+ },
  'autostop': autostop_schema,
  }
  },
@@ -920,6 +923,16 @@ def get_config_schema():
  for type in kubernetes_enums.KubernetesAutoscalerType
  ]
  },
+ 'high_availability': {
+ 'type': 'object',
+ 'required': [],
+ 'additionalProperties': False,
+ 'properties': {
+ 'storage_class_name': {
+ 'type': 'string',
+ }
+ }
+ },
  }
  },
  'oci': {
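Together with the `high_availability` boolean added to the controller properties in the first hunk, these schema additions map onto user config roughly as follows (a sketch only: `~/.sky/config.yaml` is the standard config location, the path under `serve:` follows the `(controller_type, 'controller', 'high_availability')` lookup in `controller_utils.py`, and the storage class value is a placeholder):

# Sketch only: enable an HA SkyServe controller and pin the PVC storage class.
cat >> ~/.sky/config.yaml <<'EOF'
serve:
  controller:
    high_availability: true
kubernetes:
  high_availability:
    storage_class_name: standard-rwo  # placeholder storage class name
EOF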
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: skypilot-nightly
|
3
|
-
Version: 1.0.0.
|
3
|
+
Version: 1.0.0.dev20250429
|
4
4
|
Summary: SkyPilot: An intercloud broker for the clouds
|
5
5
|
Author: SkyPilot Team
|
6
6
|
License: Apache 2.0
|
@@ -47,7 +47,6 @@ Requires-Dist: python-multipart
|
|
47
47
|
Requires-Dist: aiofiles
|
48
48
|
Requires-Dist: httpx
|
49
49
|
Requires-Dist: setproctitle
|
50
|
-
Requires-Dist: omegaconf<2.5,>=2.4.0dev3
|
51
50
|
Provides-Extra: aws
|
52
51
|
Requires-Dist: urllib3<2; extra == "aws"
|
53
52
|
Requires-Dist: awscli>=1.27.10; extra == "aws"
|
@@ -204,6 +203,7 @@ Dynamic: summary
|
|
204
203
|
|
205
204
|
----
|
206
205
|
:fire: *News* :fire:
|
206
|
+
- [Apr 2025] Spin up **Qwen3** on your cluster/cloud: [**example**](./llm/qwen/)
|
207
207
|
- [Mar 2025] Run and serve **Google Gemma 3** using SkyPilot [**example**](./llm/gemma3/)
|
208
208
|
- [Feb 2025] Prepare and serve **Retrieval Augmented Generation (RAG) with DeepSeek-R1**: [**blog post**](https://blog.skypilot.co/deepseek-rag), [**example**](./llm/rag/)
|
209
209
|
- [Feb 2025] Run and serve **DeepSeek-R1 671B** using SkyPilot and SGLang with high throughput: [**example**](./llm/deepseek-r1/)
|