skypilot-nightly 1.0.0.dev20250427__py3-none-any.whl → 1.0.0.dev20250429__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. sky/__init__.py +2 -2
  2. sky/adaptors/nebius.py +28 -40
  3. sky/backends/backend_utils.py +19 -2
  4. sky/backends/cloud_vm_ray_backend.py +33 -8
  5. sky/backends/local_docker_backend.py +1 -2
  6. sky/cli.py +91 -38
  7. sky/client/cli.py +91 -38
  8. sky/client/sdk.py +3 -2
  9. sky/clouds/aws.py +12 -6
  10. sky/clouds/azure.py +3 -0
  11. sky/clouds/cloud.py +8 -2
  12. sky/clouds/cudo.py +2 -0
  13. sky/clouds/do.py +3 -0
  14. sky/clouds/fluidstack.py +3 -0
  15. sky/clouds/gcp.py +7 -0
  16. sky/clouds/ibm.py +2 -0
  17. sky/clouds/kubernetes.py +42 -19
  18. sky/clouds/lambda_cloud.py +1 -0
  19. sky/clouds/nebius.py +18 -10
  20. sky/clouds/oci.py +6 -3
  21. sky/clouds/paperspace.py +2 -0
  22. sky/clouds/runpod.py +2 -0
  23. sky/clouds/scp.py +2 -0
  24. sky/clouds/service_catalog/constants.py +1 -1
  25. sky/clouds/service_catalog/kubernetes_catalog.py +7 -7
  26. sky/clouds/vast.py +2 -0
  27. sky/clouds/vsphere.py +2 -0
  28. sky/core.py +58 -29
  29. sky/dashboard/out/404.html +1 -1
  30. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  31. sky/dashboard/out/clusters/[cluster].html +1 -1
  32. sky/dashboard/out/clusters.html +1 -1
  33. sky/dashboard/out/favicon.ico +0 -0
  34. sky/dashboard/out/index.html +1 -1
  35. sky/dashboard/out/jobs/[job].html +1 -1
  36. sky/dashboard/out/jobs.html +1 -1
  37. sky/exceptions.py +6 -0
  38. sky/execution.py +19 -4
  39. sky/global_user_state.py +1 -0
  40. sky/optimizer.py +35 -11
  41. sky/provision/common.py +2 -5
  42. sky/provision/docker_utils.py +22 -16
  43. sky/provision/instance_setup.py +1 -1
  44. sky/provision/kubernetes/instance.py +276 -93
  45. sky/provision/kubernetes/network.py +1 -1
  46. sky/provision/kubernetes/utils.py +36 -24
  47. sky/provision/provisioner.py +6 -0
  48. sky/serve/replica_managers.py +51 -5
  49. sky/serve/serve_state.py +41 -0
  50. sky/serve/service.py +108 -63
  51. sky/server/common.py +6 -3
  52. sky/server/config.py +184 -0
  53. sky/server/requests/executor.py +17 -156
  54. sky/server/server.py +4 -4
  55. sky/setup_files/dependencies.py +0 -1
  56. sky/skylet/constants.py +7 -0
  57. sky/skypilot_config.py +27 -6
  58. sky/task.py +1 -1
  59. sky/templates/kubernetes-ray.yml.j2 +145 -15
  60. sky/templates/nebius-ray.yml.j2 +63 -0
  61. sky/utils/command_runner.py +17 -3
  62. sky/utils/command_runner.pyi +2 -0
  63. sky/utils/controller_utils.py +24 -0
  64. sky/utils/kubernetes/rsync_helper.sh +20 -4
  65. sky/utils/schemas.py +13 -0
  66. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/METADATA +2 -2
  67. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/RECORD +73 -72
  68. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/WHEEL +1 -1
  69. /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_buildManifest.js +0 -0
  70. /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → BMtJJ079_cyYmtW2-7nVS}/_ssgManifest.js +0 -0
  71. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/entry_points.txt +0 -0
  72. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/licenses/LICENSE +0 -0
  73. {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/top_level.txt +0 -0
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -258,7 +258,7 @@ available_node_types:
  # service is required.
  labels:
  parent: skypilot
- # component will be set for the head node pod to be the same as the head node service selector above if a
+ # component will be set for the head node pod to be the same as the head node service selector above if a
  skypilot-cluster: {{cluster_name_on_cloud}}
  # Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
  skypilot-ssh-jump: {{k8s_ssh_jump_name}}
@@ -267,18 +267,18 @@ available_node_types:
  {%- for label_key, label_value in labels.items() %}
  {{ label_key }}: {{ label_value|tojson }}
  {%- endfor %}
+ {% if high_availability %}
+ app: {{cluster_name_on_cloud}}
+ {% endif %}
  spec:
  # serviceAccountName: skypilot-service-account
  serviceAccountName: {{k8s_service_account_name}}
  automountServiceAccountToken: {{k8s_automount_sa_token}}
- restartPolicy: Never
+ restartPolicy: {{ "Always" if high_availability else "Never" }}

  # Add node selector if GPU/TPUs are requested:
- {% if (k8s_acc_label_key is not none and k8s_acc_label_value is not none) or (k8s_spot_label_key is not none) %}
+ {% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) %}
  nodeSelector:
- {% if k8s_acc_label_key is not none and k8s_acc_label_value is not none %}
- {{k8s_acc_label_key}}: {{k8s_acc_label_value}}
- {% endif %}
  {% if k8s_topology_label_key is not none and k8s_topology_label_value is not none %}
  {{k8s_topology_label_key}}: {{k8s_topology_label_value}}
  {% endif %}
@@ -286,6 +286,19 @@ available_node_types:
  {{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}}
  {% endif %}
  {% endif %}
+ {% if (k8s_acc_label_key is not none and k8s_acc_label_values is not none) %}
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: {{k8s_acc_label_key}}
+ operator: In
+ values:
+ {% for label_value in k8s_acc_label_values %}
+ - {{label_value}}
+ {% endfor %}
+ {% endif %}

  {% if k8s_spot_label_key is not none %}
  tolerations:
@@ -311,6 +324,11 @@ available_node_types:
  path: {{k8s_fusermount_shared_dir}}
  type: DirectoryOrCreate
  {% endif %}
+ {% if high_availability %}
+ - name: {{k8s_high_availability_deployment_volume_mount_name}}
+ persistentVolumeClaim:
+ claimName: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
+ {% endif %}
  containers:
  - name: ray-node
  imagePullPolicy: IfNotPresent
@@ -331,15 +349,15 @@ available_node_types:
  # Do not change this command - it keeps the pod alive until it is
  # explicitly killed.
  command: ["/bin/bash", "-c", "--"]
- args:
+ args:
  - |
  # For backwards compatibility, we put a marker file in the pod
- # to indicate that the pod is running with the changes introduced
+ # to indicate that the pod is running with the changes introduced
  # in project nimbus: https://github.com/skypilot-org/skypilot/pull/4393
  # TODO: Remove this marker file and it's usage in setup_commands
  # after v0.10.0 release.
  touch /tmp/skypilot_is_nimbus
-
+
  # Helper function to conditionally use sudo
  # TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
  prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
@@ -382,7 +400,7 @@ available_node_types:
  fi;
  # SSH and other packages are not necessary, so we disable set -e
  set +e
-
+
  if [ ! -z "$MISSING_PACKAGES" ]; then
  # Install missing packages individually to avoid failure installation breaks the whole install process,
  # e.g. fuse3 is not available on some distributions.
@@ -435,7 +453,7 @@ available_node_types:
  $(prefix_cmd) rm -f /bin/fusermount-wrapper
  $(prefix_cmd) cp -p {{k8s_fusermount_shared_dir}}/fusermount-wrapper /bin/fusermount-wrapper
  fi
- {% endif %}
+ {% endif %}

  $(prefix_cmd) mkdir -p /var/run/sshd;
  $(prefix_cmd) sed -i "s/PermitRootLogin prohibit-password/PermitRootLogin yes/" /etc/ssh/sshd_config;
@@ -470,6 +488,7 @@ available_node_types:
  done
  {{ conda_installation_commands }}
  {{ ray_installation_commands }}
+
  VIRTUAL_ENV=~/skypilot-runtime ~/.local/bin/uv pip install skypilot[kubernetes,remote]
  touch /tmp/ray_skypilot_installation_complete
  echo "=== Ray and skypilot installation completed ==="
@@ -565,7 +584,7 @@ available_node_types:
  # File is already being monitored
  continue
  fi
-
+
  # Monitor the new file
  monitor_file $file &
  already_monitored="${already_monitored} ${file}"
@@ -573,6 +592,28 @@ available_node_types:
  sleep 0.1
  done
  }
+
+ {% if high_availability %}
+ mkdir -p {{k8s_high_availability_deployment_run_script_dir}}
+ if [ -f {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready ]; then
+ # ! Keep this aligned with `CloudVmRayBackend._setup()`
+ # Suppose all `task.setup` are the same for skyserve controller task.
+ # So be careful for compatibility issue once you change it.
+ chmod +x {{k8s_high_availability_deployment_setup_script_path}}
+ /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && {{k8s_high_availability_deployment_setup_script_path}} > /tmp/controller_recovery_setup_commands.log 2>&1"
+ echo "=== Controller setup commands completed for recovery ==="
+
+ for file in {{k8s_high_availability_deployment_run_script_dir}}/*; do
+ # ! Keep this aligned with `CloudVmRayBackend._execute()`
+ chmod +x $file
+ /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && $file > /tmp/task_run_$(basename $file).log 2>&1"
+ echo "=== Controller task run for service (file: $file) completed for recovery ==="
+ done
+ fi
+
+ touch {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready
+ {% endif %}
+
  trap : TERM INT; log_tail || sleep infinity & wait

  ports:
@@ -593,6 +634,10 @@ available_node_types:
  # /tmp which cause slowdowns if is not a shared memory volume.
  - mountPath: /dev/shm
  name: dshm
+ {% if high_availability %}
+ - name: {{k8s_high_availability_deployment_volume_mount_name}}
+ mountPath: {{k8s_high_availability_deployment_volume_mount_path}}
+ {% endif %}
  {% if k8s_fuse_device_required %}
  - name: fusermount-shared-dir
  mountPath: {{k8s_fusermount_shared_dir}}
@@ -616,7 +661,92 @@ available_node_types:
  {{k8s_resource_key}}: {{accelerator_count}}
  {% endif %}
  {% endif %}
-
+
+ {% if high_availability %}
+ pvc_spec:
+ apiVersion: v1
+ kind: PersistentVolumeClaim
+ metadata:
+ name: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
+ namespace: {{k8s_namespace}}
+ spec:
+ accessModes:
+ - ReadWriteOnce # Our controller pod is singleton
+ {% if k8s_high_availability_storage_class_name is not none %}
+ storageClassName: {{k8s_high_availability_storage_class_name}}
+ {% endif %}
+ resources:
+ requests:
+ storage: {{disk_size}}Gi
+
+ deployment_spec:
+ apiVersion: apps/v1
+ kind: Deployment
+ metadata:
+ name: {{cluster_name_on_cloud}}-deployment
+ namespace: {{k8s_namespace}}
+ spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: {{cluster_name_on_cloud}}
+ template:
+ # The only difference between the pod spec and this section is the initContainers
+ metadata:
+ # should be replaced by pod metadata
+ spec:
+ securityContext:
+ fsGroup: 1000
+ # To prevent the home dir provided by the docker image from being overriden by pvc mounting,
+ # we use initContainers to copy it first to /mnt/home, which will later be mounted to home dir.
+ initContainers:
+ - name: init-copy-home
+ image: {{image_id}}
+ command: ["/bin/sh", "-c"]
+ args:
+ - |
+ # Define path for the marker file created by the main container upon successful startup.
+ # This file persists in the PVC across Pod restarts.
+ MARKER_FILE="/mnt/home/k8s_container_ready"
+ SOURCE_PATH="{{k8s_high_availability_deployment_volume_mount_path}}"
+ DEST_PATH="/mnt/home"
+
+ # We only need to copy the initial home directory contents from the image
+ # the *first* time a Pod uses a *new* PVC.
+ # On subsequent Pod starts (e.g., after a crash or update), the PVC
+ # already contains the necessary data (and potentially user modifications).
+ # The presence of MARKER_FILE (created by the main container in a previous
+ # successful run) indicates the PVC is already initialized. Checking for
+ # it prevents unnecessary and time-consuming rsync operations on every restart.
+ if [ ! -f "$MARKER_FILE" ]; then
+ echo "Marker '$MARKER_FILE' not found. PVC likely needs initialization."
+ echo "Copying initial home directory from image ($SOURCE_PATH/) to PVC ($DEST_PATH)..."
+
+ # Use rsync with -rl (recursive, links) instead of -a (archive).
+ # This avoids preserving times (-t) and permissions (-p) implied by -a,
+ # which caused 'Operation not permitted' errors on the PVC root directory (/mnt/home).
+ # Owner/group preservation (-o, -g) is also skipped (default for -rl), ensuring
+ # files are owned by the container's user/group.
+ rsync -rl "$SOURCE_PATH/" "$DEST_PATH"
+
+ # Check if rsync failed
+ if [ $? -ne 0 ]; then
+ echo "ERROR: rsync failed during home directory initialization." >&2
+ exit 1 # Exit initContainer with error if copy fails
+ fi
+ echo "Home directory initialization copy complete."
+ else
+ # If marker exists, skip the copy
+ echo "Marker '$MARKER_FILE' found. Skipping initial home directory copy."
+ fi
+ echo "Current contents of $DEST_PATH:"
+ ls -la "$DEST_PATH"
+ volumeMounts:
+ # Mount the persistent volume claim into the initContainer
+ - name: {{k8s_high_availability_deployment_volume_mount_name}}
+ mountPath: /mnt/home # Temporary mount point for initialization
+ # should be replaced by pod spec
+ {% endif %}
  setup_commands:
  # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
  # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
@@ -636,7 +766,7 @@ setup_commands:
  echo "=== Logs for asynchronous ray and skypilot installation ===";
  if [ -f /tmp/skypilot_is_nimbus ]; then
  echo "=== Logs for asynchronous ray and skypilot installation ===";
- [ -f /tmp/ray_skypilot_installation_complete ] && cat /tmp/${STEPS[1]}.log ||
+ [ -f /tmp/ray_skypilot_installation_complete ] && cat /tmp/${STEPS[1]}.log ||
  { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/ray_skypilot_installation_complete ]; do sleep 0.5; done; kill $TAIL_PID || true; };
  [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
  fi
@@ -666,7 +796,7 @@ setup_commands:
  # properly written.
  # TODO(Doyoung): Investigate to see why TPU workload fails to run without
  # execution permission, such as granting 766 to log file. Check if it's a
- # must and see if there's a workaround to grant minimum permission.
+ # must and see if there's a workaround to grant minimum permission.
  sudo chmod 777 /tmp/tpu_logs;
  {% endif %}

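Note: when high_availability is set, the template above wraps the controller pod in the deployment_spec and backs its home directory with the pvc_spec instead of creating a bare pod. For orientation only, a minimal sketch of what the rendered PVC could look like, with hypothetical values substituted for the template variables (cluster name, volume mount name, namespace, disk size are all placeholders, not taken from this diff):

    apiVersion: v1
    kind: PersistentVolumeClaim
    metadata:
      name: sky-serve-controller-2ea4-high-availability-backup   # hypothetical rendered name
      namespace: default                                         # hypothetical namespace
    spec:
      accessModes:
        - ReadWriteOnce                    # the controller pod is a singleton
      storageClassName: csi-sc-example     # only emitted if a storage class is configured; placeholder
      resources:
        requests:
          storage: 50Gi                    # hypothetical disk_size
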
sky/templates/nebius-ray.yml.j2 CHANGED
@@ -10,6 +10,27 @@ provider:
  module: sky.provision.nebius
  region: "{{region}}"

+ {%- if docker_image is not none %}
+ docker:
+ image: {{docker_image}}
+ container_name: {{docker_container_name}}
+ run_options:
+ - --ulimit nofile=1048576:1048576
+ {%- for run_option in docker_run_options %}
+ - {{run_option}}
+ {%- endfor %}
+ {%- if docker_login_config is not none %}
+ docker_login_config:
+ username: |-
+ {{docker_login_config.username}}
+ password: |-
+ {{docker_login_config.password}}
+ server: |-
+ {{docker_login_config.server}}
+ {%- endif %}
+ {%- endif %}
+
+
  auth:
  ssh_user: ubuntu
  ssh_private_key: {{ssh_private_key}}
@@ -22,6 +43,48 @@ available_node_types:
  ImageId: {{image_id}}
  DiskSize: {{disk_size}}
  UserData: |
+ {%- if docker_image is not none %}
+ runcmd:
+ - sudo sed -i 's/^#\?AllowTcpForwarding.*/AllowTcpForwarding yes/' /etc/ssh/sshd_config
+ - systemctl restart sshd
+ {%- endif %}
+
+ {# Two available OS images:
+ 1. ubuntu22.04-driverless - requires Docker installation
+ 2. ubuntu22.04-cuda12 - comes with Docker pre-installed
+ To optimize deployment speed, Docker is only installed when using ubuntu22.04-driverless #}
+ {%- if docker_image is not none and image_id == 'ubuntu22.04-driverless' %}
+ apt:
+ sources:
+ docker.list:
+ source: deb [arch=amd64] https://download.docker.com/linux/ubuntu $RELEASE stable
+ keyid: 9DC858229FC7DD38854AE2D88D81803C0EBFCD88
+
+ packages:
+ - apt-transport-https
+ - ca-certificates
+ - curl
+ - gnupg-agent
+ - software-properties-common
+ - docker-ce
+ - docker-ce-cli
+ - containerd.io
+
+ # Enable ipv4 forwarding, required on CIS hardened machines
+ write_files:
+ - path: /etc/sysctl.d/enabled_ipv4_forwarding.conf
+ content: |
+ net.ipv4.conf.all.forwarding=1
+
+ # create the docker group
+ groups:
+ - docker
+
+ # Add default auto created user to docker group
+ system_info:
+ default_user:
+ groups: [docker]
+ {%- endif %}
  users:
  - name: skypilot:ssh_user
  shell: /bin/bash
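Note: this section adds Docker container support to the Nebius template (a docker block in the provider config, plus cloud-init to install Docker on the driverless image). A hedged usage sketch, assuming SkyPilot's existing docker: prefix for image_id; the image name and accelerator below are placeholders, not taken from this diff:

    resources:
      cloud: nebius
      accelerators: H100:1                               # placeholder accelerator
      image_id: docker:nvcr.io/nvidia/pytorch:24.03-py3  # placeholder container image
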
sky/utils/command_runner.py CHANGED
@@ -325,6 +325,7 @@ class CommandRunner:
  direction = 'up' if up else 'down'
  error_msg = (f'Failed to rsync {direction}: {source} -> {target}. '
  'Ensure that the network is stable, then retry.')
+
  subprocess_utils.handle_returncode(returncode,
  command,
  error_msg,
@@ -718,6 +719,7 @@ class KubernetesCommandRunner(CommandRunner):
  def __init__(
  self,
  node: Tuple[Tuple[str, Optional[str]], str],
+ deployment: Optional[str] = None,
  **kwargs,
  ):
  """Initialize KubernetesCommandRunner.
@@ -733,11 +735,19 @@ class KubernetesCommandRunner(CommandRunner):
  del kwargs
  super().__init__(node)
  (self.namespace, self.context), self.pod_name = node
+ self.deployment = deployment

  @property
  def node_id(self) -> str:
  return f'{self.context}-{self.namespace}-{self.pod_name}'

+ @property
+ def kube_identifier(self) -> str:
+ if self.deployment is not None:
+ return f'deployment/{self.deployment}'
+ else:
+ return f'pod/{self.pod_name}'
+
  def port_forward_command(self,
  port_forward: List[Tuple[int, int]],
  connect_timeout: int = 1) -> List[str]:
@@ -758,11 +768,12 @@ class KubernetesCommandRunner(CommandRunner):
  kubectl_args += ['--context', self.context]
  local_port, remote_port = port_forward[0]
  local_port_str = f'{local_port}' if local_port is not none else ''
+
  kubectl_cmd = [
  'kubectl',
  *kubectl_args,
  'port-forward',
- f'pod/{self.pod_name}',
+ self.kube_identifier,
  f'{local_port_str}:{remote_port}',
  ]
  return kubectl_cmd
@@ -785,7 +796,8 @@ class KubernetesCommandRunner(CommandRunner):
  source_bashrc: bool = False,
  skip_num_lines: int = 0,
  **kwargs) -> Union[int, Tuple[int, str, str]]:
- """Uses 'kubectl exec' to run 'cmd' on a pod by its name and namespace.
+ """Uses 'kubectl exec' to run 'cmd' on a pod or deployment by its
+ name and namespace.

  Args:
  cmd: The command to run.
@@ -828,7 +840,9 @@ class KubernetesCommandRunner(CommandRunner):
  # case, need to set KUBECONFIG to /dev/null to avoid using kubeconfig.
  if self.context is None:
  kubectl_args += ['--kubeconfig', '/dev/null']
- kubectl_args += [self.pod_name]
+
+ kubectl_args += [self.kube_identifier]
+
  if ssh_mode == SshMode.LOGIN:
  assert isinstance(cmd, list), 'cmd must be a list for login mode.'
  base_cmd = ['kubectl', 'exec', '-it', *kubectl_args, '--']
sky/utils/command_runner.pyi CHANGED
@@ -206,6 +206,8 @@ class KubernetesCommandRunner(CommandRunner):
  def __init__(
  self,
  node: Tuple[Tuple[str, Optional[str]], str],
+ deployment: Optional[str] = ...,
+ **kwargs,
  ) -> None:
  ...

sky/utils/controller_utils.py CHANGED
@@ -193,6 +193,30 @@ class Controllers(enum.Enum):
  return None


+ def high_availability_specified(cluster_name: Optional[str],
+ skip_warning: bool = True) -> bool:
+ """Check if the controller high availability is specified in user config.
+ """
+ controller = Controllers.from_name(cluster_name)
+ if controller is None:
+ return False
+
+ if skypilot_config.loaded():
+ high_availability = skypilot_config.get_nested(
+ (controller.value.controller_type, 'controller',
+ 'high_availability'), False)
+ if high_availability:
+ if controller.value.controller_type != 'serve':
+ if not skip_warning:
+ print(f'{colorama.Fore.RED}High availability controller is'
+ 'only supported for SkyServe controller. It cannot'
+ f'be enabled for {controller.value.name}.'
+ f'Skipping this flag.{colorama.Style.RESET_ALL}')
+ else:
+ return True
+ return False
+
+
  # Install cli dependencies. Not using SkyPilot wheels because the wheel
  # can be cleaned up by another process.
  def _get_cloud_dependencies_installation_commands(
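Note: high_availability_specified() reads the flag from the loaded SkyPilot config under <controller_type>.controller.high_availability and only honors it for the SkyServe controller. A minimal sketch of the corresponding user config entry (e.g. in ~/.sky/config.yaml), based on the nested keys queried above:

    serve:
      controller:
        high_availability: true
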
sky/utils/kubernetes/rsync_helper.sh CHANGED
@@ -1,3 +1,4 @@
+ #!/bin/bash
  # We need to determine the pod, namespace and context from the args
  # For backward compatibility, we use + as the separator between namespace and context and add handling when context is not provided
  if [ "$1" = "-l" ]; then
@@ -7,7 +8,7 @@ if [ "$1" = "-l" ]; then
  pod=$1
  shift
  encoded_namespace_context=$1
- shift
+ shift # Shift past the encoded namespace+context
  echo "pod: $pod" >&2
  # Revert the encoded namespace+context to the original string.
  namespace_context=$(echo "$encoded_namespace_context" | sed 's|%40|@|g' | sed 's|%3A|:|g' | sed 's|%2B|+|g' | sed 's|%2F|/|g')
@@ -16,7 +17,7 @@ else
  # pod@namespace+context ...
  # used by openrsync
  encoded_pod_namespace_context=$1
- shift
+ shift # Shift past the pod@namespace+context
  pod_namespace_context=$(echo "$encoded_pod_namespace_context" | sed 's|%40|@|g' | sed 's|%3A|:|g' | sed 's|%2B|+|g' | sed 's|%2F|/|g')
  echo "pod_namespace_context: $pod_namespace_context" >&2
  pod=$(echo $pod_namespace_context | cut -d@ -f1)
@@ -24,16 +25,31 @@ else
  namespace_context=$(echo $pod_namespace_context | cut -d@ -f2-)
  echo "namespace_context: $namespace_context" >&2
  fi
+
  namespace=$(echo $namespace_context | cut -d+ -f1)
  echo "namespace: $namespace" >&2
  context=$(echo $namespace_context | grep '+' >/dev/null && echo $namespace_context | cut -d+ -f2- || echo "")
  echo "context: $context" >&2
  context_lower=$(echo "$context" | tr '[:upper:]' '[:lower:]')

+ # Check if the resource is a pod or a deployment (or other type)
+ if [[ "$pod" == *"/"* ]]; then
+ # Format is resource_type/resource_name
+ echo "Resource contains type: $pod" >&2
+ resource_type=$(echo $pod | cut -d/ -f1)
+ resource_name=$(echo $pod | cut -d/ -f2)
+ echo "Resource type: $resource_type, Resource name: $resource_name" >&2
+ else
+ # For backward compatibility or simple pod name, assume it's a pod
+ resource_type="pod"
+ resource_name=$pod
+ echo "Assuming resource is a pod: $resource_name" >&2
+ fi
+
  if [ -z "$context" ] || [ "$context_lower" = "none" ]; then
  # If context is none, it means we are using incluster auth. In this case,
  # use need to set KUBECONFIG to /dev/null to avoid using kubeconfig file.
- kubectl exec -i $pod -n $namespace --kubeconfig=/dev/null -- "$@"
+ kubectl exec -i "$resource_type/$resource_name" -n "$namespace" --kubeconfig=/dev/null -- "$@"
  else
- kubectl exec -i $pod -n $namespace --context=$context -- "$@"
+ kubectl exec -i "$resource_type/$resource_name" -n "$namespace" --context="$context" -- "$@"
  fi
sky/utils/schemas.py CHANGED
@@ -759,6 +759,9 @@ def get_config_schema():
  'additionalProperties': False,
  'properties': {
  'resources': resources_schema,
+ 'high_availability': {
+ 'type': 'boolean',
+ },
  'autostop': autostop_schema,
  }
  },
@@ -920,6 +923,16 @@ def get_config_schema():
  for type in kubernetes_enums.KubernetesAutoscalerType
  ]
  },
+ 'high_availability': {
+ 'type': 'object',
+ 'required': [],
+ 'additionalProperties': False,
+ 'properties': {
+ 'storage_class_name': {
+ 'type': 'string',
+ }
+ }
+ },
  }
  },
  'oci': {
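Note: the second schema hunk allows an optional high_availability section under the kubernetes config, whose storage_class_name lines up with the k8s_high_availability_storage_class_name variable used in the Kubernetes template above. A hedged config sketch (the storage class name is a placeholder and would need to exist in the target cluster):

    kubernetes:
      high_availability:
        storage_class_name: csi-sc-example   # placeholder storage class
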
{skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250429.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: skypilot-nightly
- Version: 1.0.0.dev20250427
+ Version: 1.0.0.dev20250429
  Summary: SkyPilot: An intercloud broker for the clouds
  Author: SkyPilot Team
  License: Apache 2.0
@@ -47,7 +47,6 @@ Requires-Dist: python-multipart
  Requires-Dist: aiofiles
  Requires-Dist: httpx
  Requires-Dist: setproctitle
- Requires-Dist: omegaconf<2.5,>=2.4.0dev3
  Provides-Extra: aws
  Requires-Dist: urllib3<2; extra == "aws"
  Requires-Dist: awscli>=1.27.10; extra == "aws"
@@ -204,6 +203,7 @@ Dynamic: summary

  ----
  :fire: *News* :fire:
+ - [Apr 2025] Spin up **Qwen3** on your cluster/cloud: [**example**](./llm/qwen/)
  - [Mar 2025] Run and serve **Google Gemma 3** using SkyPilot [**example**](./llm/gemma3/)
  - [Feb 2025] Prepare and serve **Retrieval Augmented Generation (RAG) with DeepSeek-R1**: [**blog post**](https://blog.skypilot.co/deepseek-rag), [**example**](./llm/rag/)
  - [Feb 2025] Run and serve **DeepSeek-R1 671B** using SkyPilot and SGLang with high throughput: [**example**](./llm/deepseek-r1/)