skypilot-nightly 1.0.0.dev20250426__py3-none-any.whl → 1.0.0.dev20250428__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +19 -2
  3. sky/backends/cloud_vm_ray_backend.py +33 -8
  4. sky/backends/local_docker_backend.py +1 -2
  5. sky/cli.py +1 -1
  6. sky/client/cli.py +1 -1
  7. sky/clouds/aws.py +12 -6
  8. sky/clouds/azure.py +3 -0
  9. sky/clouds/cloud.py +3 -0
  10. sky/clouds/cudo.py +2 -0
  11. sky/clouds/do.py +3 -0
  12. sky/clouds/fluidstack.py +3 -0
  13. sky/clouds/gcp.py +7 -0
  14. sky/clouds/ibm.py +2 -0
  15. sky/clouds/kubernetes.py +38 -15
  16. sky/clouds/lambda_cloud.py +1 -0
  17. sky/clouds/nebius.py +2 -0
  18. sky/clouds/oci.py +6 -3
  19. sky/clouds/paperspace.py +2 -0
  20. sky/clouds/runpod.py +2 -0
  21. sky/clouds/scp.py +2 -0
  22. sky/clouds/vast.py +2 -0
  23. sky/clouds/vsphere.py +2 -0
  24. sky/dashboard/out/404.html +1 -1
  25. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  26. sky/dashboard/out/clusters/[cluster].html +1 -1
  27. sky/dashboard/out/clusters.html +1 -1
  28. sky/dashboard/out/index.html +1 -1
  29. sky/dashboard/out/jobs/[job].html +1 -1
  30. sky/dashboard/out/jobs.html +1 -1
  31. sky/exceptions.py +6 -0
  32. sky/execution.py +19 -4
  33. sky/global_user_state.py +1 -0
  34. sky/provision/common.py +2 -5
  35. sky/provision/instance_setup.py +1 -1
  36. sky/provision/kubernetes/instance.py +280 -94
  37. sky/provision/kubernetes/network.py +1 -1
  38. sky/provision/kubernetes/utils.py +10 -0
  39. sky/provision/provisioner.py +6 -0
  40. sky/serve/replica_managers.py +51 -5
  41. sky/serve/serve_state.py +41 -0
  42. sky/serve/service.py +108 -63
  43. sky/server/requests/executor.py +4 -4
  44. sky/skylet/constants.py +7 -0
  45. sky/task.py +1 -1
  46. sky/templates/kubernetes-ray.yml.j2 +122 -2
  47. sky/utils/command_runner.py +17 -3
  48. sky/utils/command_runner.pyi +2 -0
  49. sky/utils/controller_utils.py +24 -0
  50. sky/utils/kubernetes/rsync_helper.sh +20 -4
  51. sky/utils/schemas.py +13 -0
  52. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/METADATA +1 -1
  53. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/RECORD +59 -59
  54. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/WHEEL +1 -1
  55. /sky/dashboard/out/_next/static/{WO8lTFPfj-lO3_gDGEiN8 → 2f-jlOWR_G5mOwCF4RcZz}/_buildManifest.js +0 -0
  56. /sky/dashboard/out/_next/static/{WO8lTFPfj-lO3_gDGEiN8 → 2f-jlOWR_G5mOwCF4RcZz}/_ssgManifest.js +0 -0
  57. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/entry_points.txt +0 -0
  58. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/licenses/LICENSE +0 -0
  59. {skypilot_nightly-1.0.0.dev20250426.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/top_level.txt +0 -0
sky/serve/serve_state.py CHANGED
@@ -479,6 +479,14 @@ def total_number_provisioning_replicas() -> int:
     return provisioning_count


+def get_replicas_at_status(
+    service_name: str,
+    status: ReplicaStatus,
+) -> List['replica_managers.ReplicaInfo']:
+    replicas = get_replica_infos(service_name)
+    return [replica for replica in replicas if replica.status == status]
+
+
 # === Version functions ===
 def add_version(service_name: str) -> int:
     """Adds a version to the database."""
@@ -549,3 +557,36 @@ def delete_all_versions(service_name: str) -> None:
     """\
     DELETE FROM version_specs
     WHERE service_name=(?)""", (service_name,))
+
+
+def get_latest_version(service_name: str) -> Optional[int]:
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        rows = cursor.execute(
+            """\
+            SELECT MAX(version) FROM version_specs
+            WHERE service_name=(?)""", (service_name,)).fetchall()
+        if not rows or rows[0][0] is None:
+            return None
+        return rows[0][0]
+
+
+def get_service_controller_port(service_name: str) -> int:
+    """Gets the controller port of a service."""
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        cursor.execute('SELECT controller_port FROM services WHERE name = ?',
+                       (service_name,))
+        row = cursor.fetchone()
+        if row is None:
+            raise ValueError(f'Service {service_name} does not exist.')
+        return row[0]
+
+
+def get_service_load_balancer_port(service_name: str) -> int:
+    """Gets the load balancer port of a service."""
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        cursor.execute('SELECT load_balancer_port FROM services WHERE name = ?',
+                       (service_name,))
+        row = cursor.fetchone()
+        if row is None:
+            raise ValueError(f'Service {service_name} does not exist.')
+        return row[0]
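
Note: taken together, these helpers persist enough state (latest version plus both ports) for a restarted controller to re-attach to an existing service. A minimal sketch of how they could be combined, with a hypothetical helper name (the real call sites are in sky/serve/service.py below):

from typing import Optional, Tuple

from sky.serve import serve_state


def load_persisted_service_state(service_name: str) -> Tuple[int, int, int]:
    # Hypothetical helper: read back the version and ports saved by a
    # previous run. Both port getters raise ValueError if the service row
    # does not exist.
    version: Optional[int] = serve_state.get_latest_version(service_name)
    if version is None:
        raise ValueError(f'No version found for service {service_name}')
    controller_port = serve_state.get_service_controller_port(service_name)
    load_balancer_port = serve_state.get_service_load_balancer_port(
        service_name)
    return version, controller_port, load_balancer_port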
sky/serve/service.py CHANGED
@@ -25,6 +25,7 @@ from sky.serve import load_balancer
 from sky.serve import replica_managers
 from sky.serve import serve_state
 from sky.serve import serve_utils
+from sky.skylet import constants as skylet_constants
 from sky.utils import common_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
@@ -136,8 +137,25 @@ def _cleanup(service_name: str) -> bool:
     return failed


+def _cleanup_task_run_script(job_id: int) -> None:
+    """Clean up task run script.
+    Please see `kubernetes-ray.yml.j2` for more details.
+    """
+    task_run_dir = pathlib.Path(
+        skylet_constants.PERSISTENT_RUN_SCRIPT_DIR).expanduser()
+    if task_run_dir.exists():
+        this_task_run_script = task_run_dir / f'sky_job_{job_id}'
+        if this_task_run_script.exists():
+            this_task_run_script.unlink()
+            logger.info(f'Task run script {this_task_run_script} removed')
+        else:
+            logger.warning(f'Task run script {this_task_run_script} not found')
+
+
 def _start(service_name: str, tmp_task_yaml: str, job_id: int):
-    """Starts the service."""
+    """Starts the service.
+    This including the controller and load balancer.
+    """
     # Generate ssh key pair to avoid race condition when multiple sky.launch
     # are executed at the same time.
     authentication.get_or_generate_keys()
@@ -147,62 +165,79 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
     # Already checked before submit to controller.
     assert task.service is not None, task
     service_spec = task.service
-    if (len(serve_state.get_services()) >=
-            serve_utils.get_num_service_threshold()):
-        cleanup_storage(tmp_task_yaml)
-        with ux_utils.print_exception_no_traceback():
-            raise RuntimeError('Max number of services reached.')
-    success = serve_state.add_service(
-        service_name,
-        controller_job_id=job_id,
-        policy=service_spec.autoscaling_policy_str(),
-        requested_resources_str=backend_utils.get_task_resources_str(task),
-        load_balancing_policy=service_spec.load_balancing_policy,
-        status=serve_state.ServiceStatus.CONTROLLER_INIT,
-        tls_encrypted=service_spec.tls_credential is not None)
-    # Directly throw an error here. See sky/serve/api.py::up
-    # for more details.
-    if not success:
-        cleanup_storage(tmp_task_yaml)
-        with ux_utils.print_exception_no_traceback():
-            raise ValueError(f'Service {service_name} already exists.')
-
-    # Add initial version information to the service state.
-    serve_state.add_or_update_version(service_name, constants.INITIAL_VERSION,
-                                      service_spec)
-
-    # Create the service working directory.
+
+    def is_recovery_mode(service_name: str) -> bool:
+        """Check if service exists in database to determine recovery mode.
+        """
+        service = serve_state.get_service_from_name(service_name)
+        return service is not None
+
+    is_recovery = is_recovery_mode(service_name)
+    logger.info(f'It is a {"first" if not is_recovery else "recovery"} run')
+
+    if is_recovery:
+        version = serve_state.get_latest_version(service_name)
+        if version is None:
+            raise ValueError(f'No version found for service {service_name}')
+    else:
+        version = constants.INITIAL_VERSION
+        # Add initial version information to the service state.
+        serve_state.add_or_update_version(service_name, version, service_spec)
+
     service_dir = os.path.expanduser(
         serve_utils.generate_remote_service_dir_name(service_name))
-    os.makedirs(service_dir, exist_ok=True)
-
-    # Copy the tmp task yaml file to the final task yaml file.
-    # This is for the service name conflict case. The _execute will
-    # sync file mounts first and then realized a name conflict. We
-    # don't want the new file mounts to overwrite the old one, so we
-    # sync to a tmp file first and then copy it to the final name
-    # if there is no name conflict.
-    task_yaml = serve_utils.generate_task_yaml_file_name(
-        service_name, constants.INITIAL_VERSION)
-    shutil.copy(tmp_task_yaml, task_yaml)
-
-    # Generate load balancer log file name.
-    load_balancer_log_file = os.path.expanduser(
-        serve_utils.generate_remote_load_balancer_log_file_name(service_name))
+    task_yaml = serve_utils.generate_task_yaml_file_name(service_name, version)
+
+    if not is_recovery:
+        if (len(serve_state.get_services()) >=
+                serve_utils.get_num_service_threshold()):
+            cleanup_storage(tmp_task_yaml)
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError('Max number of services reached.')
+        success = serve_state.add_service(
+            service_name,
+            controller_job_id=job_id,
+            policy=service_spec.autoscaling_policy_str(),
+            requested_resources_str=backend_utils.get_task_resources_str(task),
+            load_balancing_policy=service_spec.load_balancing_policy,
+            status=serve_state.ServiceStatus.CONTROLLER_INIT,
+            tls_encrypted=service_spec.tls_credential is not None)
+        # Directly throw an error here. See sky/serve/api.py::up
+        # for more details.
+        if not success:
+            cleanup_storage(tmp_task_yaml)
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(f'Service {service_name} already exists.')
+
+        # Create the service working directory.
+        os.makedirs(service_dir, exist_ok=True)
+
+        # Copy the tmp task yaml file to the final task yaml file.
+        # This is for the service name conflict case. The _execute will
+        # sync file mounts first and then realized a name conflict. We
+        # don't want the new file mounts to overwrite the old one, so we
+        # sync to a tmp file first and then copy it to the final name
+        # if there is no name conflict.
+        shutil.copy(tmp_task_yaml, task_yaml)

     controller_process = None
     load_balancer_process = None
     try:
         with filelock.FileLock(
                 os.path.expanduser(constants.PORT_SELECTION_FILE_LOCK_PATH)):
-            controller_port = common_utils.find_free_port(
-                constants.CONTROLLER_PORT_START)
-
-            # We expose the controller to the public network when running
-            # inside a kubernetes cluster to allow external load balancers
-            # (example, for high availability load balancers) to communicate
-            # with the controller.
-            def _get_host():
+            # Start the controller.
+            controller_port = (
+                common_utils.find_free_port(constants.CONTROLLER_PORT_START)
+                if not is_recovery else
+                serve_state.get_service_controller_port(service_name))
+
+            def _get_controller_host():
+                """Get the controller host address.
+                We expose the controller to the public network when running
+                inside a kubernetes cluster to allow external load balancers
+                (example, for high availability load balancers) to communicate
+                with the controller.
+                """
                 if 'KUBERNETES_SERVICE_HOST' in os.environ:
                     return '0.0.0.0'
                 # Not using localhost to avoid using ipv6 address and causing
@@ -211,26 +246,28 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
                 # ('::1', 20001, 0, 0): cannot assign requested address
                 return '127.0.0.1'

-            controller_host = _get_host()
-
-            # Start the controller.
+            controller_host = _get_controller_host()
             controller_process = multiprocessing.Process(
                 target=controller.run_controller,
                 args=(service_name, service_spec, task_yaml, controller_host,
                       controller_port))
             controller_process.start()
-            serve_state.set_service_controller_port(service_name,
-                                                    controller_port)

-            controller_addr = f'http://{controller_host}:{controller_port}'
+            if not is_recovery:
+                serve_state.set_service_controller_port(service_name,
+                                                        controller_port)

-            load_balancer_port = common_utils.find_free_port(
-                constants.LOAD_BALANCER_PORT_START)
-
-            # Extract the load balancing policy from the service spec
-            policy_name = service_spec.load_balancing_policy
+            controller_addr = f'http://{controller_host}:{controller_port}'

             # Start the load balancer.
+            load_balancer_port = (
+                common_utils.find_free_port(constants.LOAD_BALANCER_PORT_START)
+                if not is_recovery else
+                serve_state.get_service_load_balancer_port(service_name))
+            load_balancer_log_file = os.path.expanduser(
+                serve_utils.generate_remote_load_balancer_log_file_name(
+                    service_name))
+
             # TODO(tian): Probably we could enable multiple ports specified in
             # service spec and we could start multiple load balancers.
             # After that, we will have a mapping from replica port to endpoint.
@@ -238,11 +275,14 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
                 target=ux_utils.RedirectOutputForProcess(
                     load_balancer.run_load_balancer,
                     load_balancer_log_file).run,
-                args=(controller_addr, load_balancer_port, policy_name,
+                args=(controller_addr, load_balancer_port,
+                      service_spec.load_balancing_policy,
                       service_spec.tls_credential))
             load_balancer_process.start()
-            serve_state.set_service_load_balancer_port(service_name,
-                                                       load_balancer_port)
+
+            if not is_recovery:
+                serve_state.set_service_load_balancer_port(
+                    service_name, load_balancer_port)

         while True:
             _handle_signal(service_name)
@@ -262,6 +302,7 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
                                                 force=True)
         for process in process_to_kill:
             process.join()
+
         failed = _cleanup(service_name)
         if failed:
             serve_state.set_service_status_and_active_versions(
@@ -273,8 +314,12 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
             serve_state.delete_all_versions(service_name)
             logger.info(f'Service {service_name} terminated successfully.')

+        _cleanup_task_run_script(job_id)
+

 if __name__ == '__main__':
+    logger.info('Starting service...')
+
     parser = argparse.ArgumentParser(description='Sky Serve Service')
     parser.add_argument('--service-name',
                         type=str,
sky/server/requests/executor.py CHANGED
@@ -493,15 +493,15 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
     # Determine the job capacity of the workers based on the system resources.
     cpu_count = common_utils.get_cpu_count()
     mem_size_gb = common_utils.get_mem_size_gb()
-    mem_size_gb = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
+    mem_for_workers = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
     # Runs in low resource mode if the available memory is less than
     # server_constants.MIN_AVAIL_MEM_GB.
     max_parallel_for_long = _max_long_worker_parallism(cpu_count,
-                                                       mem_size_gb,
+                                                       mem_for_workers,
                                                        local=not deploy)
     max_parallel_for_short = _max_short_worker_parallism(
-        mem_size_gb, max_parallel_for_long)
-    if mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
+        mem_for_workers, max_parallel_for_long)
+    if not deploy and mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
         # Permanent worker process may have significant memory consumption
         # (~350MB per worker) after running commands like `sky check`, so we
         # don't start any permanent workers in low resource local mode. This
sky/skylet/constants.py CHANGED
@@ -368,6 +368,13 @@ ROLE_ASSIGNMENT_FAILURE_ERROR_MSG = (
     'Failed to assign Storage Blob Data Owner role to the '
     'storage account {storage_account_name}.')

+# Constants for path in K8S pod to store persistent setup and run scripts
+# so that we can run them again after the pod restarts.
+# Path within user home. For HA controller, assumes home directory is
+# persistent through PVC. See kubernetes-ray.yml.j2.
+PERSISTENT_SETUP_SCRIPT_PATH = '~/.sky/.controller_recovery_setup_commands.sh'
+PERSISTENT_RUN_SCRIPT_DIR = '~/.sky/.controller_recovery_task_run'
+
 # The placeholder for the local skypilot config path in file mounts for
 # controllers.
 LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
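
Note: _cleanup_task_run_script in sky/serve/service.py above resolves the per-job script path under PERSISTENT_RUN_SCRIPT_DIR. A small sketch of that path layout, using the same naming as the diff (the helper name itself is hypothetical):

import pathlib

from sky.skylet import constants as skylet_constants


def task_run_script_path(job_id: int) -> pathlib.Path:
    # e.g. ~/.sky/.controller_recovery_task_run/sky_job_<job_id>
    run_dir = pathlib.Path(
        skylet_constants.PERSISTENT_RUN_SCRIPT_DIR).expanduser()
    return run_dir / f'sky_job_{job_id}'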
sky/task.py CHANGED
@@ -306,7 +306,7 @@ class Task:
         self.service_name: Optional[str] = None

         # Filled in by the optimizer. If None, this Task is not planned.
-        self.best_resources = None
+        self.best_resources: Optional[sky.Resources] = None

         # For internal use only.
         self.file_mounts_mapping = file_mounts_mapping
sky/templates/kubernetes-ray.yml.j2 CHANGED
@@ -267,11 +267,14 @@ available_node_types:
           {%- for label_key, label_value in labels.items() %}
           {{ label_key }}: {{ label_value|tojson }}
           {%- endfor %}
+          {% if high_availability %}
+          app: {{cluster_name_on_cloud}}
+          {% endif %}
       spec:
         # serviceAccountName: skypilot-service-account
         serviceAccountName: {{k8s_service_account_name}}
         automountServiceAccountToken: {{k8s_automount_sa_token}}
-        restartPolicy: Never
+        restartPolicy: {{ "Always" if high_availability else "Never" }}

         # Add node selector if GPU/TPUs are requested:
         {% if (k8s_acc_label_key is not none and k8s_acc_label_value is not none) or (k8s_spot_label_key is not none) %}
@@ -311,6 +314,11 @@ available_node_types:
              path: {{k8s_fusermount_shared_dir}}
              type: DirectoryOrCreate
          {% endif %}
+          {% if high_availability %}
+          - name: {{k8s_high_availability_deployment_volume_mount_name}}
+            persistentVolumeClaim:
+              claimName: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
+          {% endif %}
        containers:
          - name: ray-node
            imagePullPolicy: IfNotPresent
@@ -470,6 +478,7 @@ available_node_types:
              done
              {{ conda_installation_commands }}
              {{ ray_installation_commands }}
+
              VIRTUAL_ENV=~/skypilot-runtime ~/.local/bin/uv pip install skypilot[kubernetes,remote]
              touch /tmp/ray_skypilot_installation_complete
              echo "=== Ray and skypilot installation completed ==="
@@ -573,6 +582,28 @@ available_node_types:
                sleep 0.1
              done
            }
+
+            {% if high_availability %}
+            mkdir -p {{k8s_high_availability_deployment_run_script_dir}}
+            if [ -f {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready ]; then
+              # ! Keep this aligned with `CloudVmRayBackend._setup()`
+              # Suppose all `task.setup` are the same for skyserve controller task.
+              # So be careful for compatibility issue once you change it.
+              chmod +x {{k8s_high_availability_deployment_setup_script_path}}
+              /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && {{k8s_high_availability_deployment_setup_script_path}} > /tmp/controller_recovery_setup_commands.log 2>&1"
+              echo "=== Controller setup commands completed for recovery ==="
+
+              for file in {{k8s_high_availability_deployment_run_script_dir}}/*; do
+                # ! Keep this aligned with `CloudVmRayBackend._execute()`
+                chmod +x $file
+                /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && $file > /tmp/task_run_$(basename $file).log 2>&1"
+                echo "=== Controller task run for service (file: $file) completed for recovery ==="
+              done
+            fi
+
+            touch {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready
+            {% endif %}
+
            trap : TERM INT; log_tail || sleep infinity & wait

          ports:
@@ -593,6 +624,10 @@ available_node_types:
            # /tmp which cause slowdowns if is not a shared memory volume.
            - mountPath: /dev/shm
              name: dshm
+            {% if high_availability %}
+            - name: {{k8s_high_availability_deployment_volume_mount_name}}
+              mountPath: {{k8s_high_availability_deployment_volume_mount_path}}
+            {% endif %}
            {% if k8s_fuse_device_required %}
            - name: fusermount-shared-dir
              mountPath: {{k8s_fusermount_shared_dir}}
@@ -616,7 +651,92 @@ available_node_types:
                {{k8s_resource_key}}: {{accelerator_count}}
              {% endif %}
            {% endif %}
-
+
+      {% if high_availability %}
+      pvc_spec:
+        apiVersion: v1
+        kind: PersistentVolumeClaim
+        metadata:
+          name: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
+          namespace: {{k8s_namespace}}
+        spec:
+          accessModes:
+            - ReadWriteOnce # Our controller pod is singleton
+          {% if k8s_high_availability_storage_class_name is not none %}
+          storageClassName: {{k8s_high_availability_storage_class_name}}
+          {% endif %}
+          resources:
+            requests:
+              storage: {{disk_size}}Gi
+
+      deployment_spec:
+        apiVersion: apps/v1
+        kind: Deployment
+        metadata:
+          name: {{cluster_name_on_cloud}}-deployment
+          namespace: {{k8s_namespace}}
+        spec:
+          replicas: 1
+          selector:
+            matchLabels:
+              app: {{cluster_name_on_cloud}}
+          template:
+            # The only difference between the pod spec and this section is the initContainers
+            metadata:
+              # should be replaced by pod metadata
+            spec:
+              securityContext:
+                fsGroup: 1000
+              # To prevent the home dir provided by the docker image from being overriden by pvc mounting,
+              # we use initContainers to copy it first to /mnt/home, which will later be mounted to home dir.
+              initContainers:
+              - name: init-copy-home
+                image: {{image_id}}
+                command: ["/bin/sh", "-c"]
+                args:
+                  - |
+                    # Define path for the marker file created by the main container upon successful startup.
+                    # This file persists in the PVC across Pod restarts.
+                    MARKER_FILE="/mnt/home/k8s_container_ready"
+                    SOURCE_PATH="{{k8s_high_availability_deployment_volume_mount_path}}"
+                    DEST_PATH="/mnt/home"
+
+                    # We only need to copy the initial home directory contents from the image
+                    # the *first* time a Pod uses a *new* PVC.
+                    # On subsequent Pod starts (e.g., after a crash or update), the PVC
+                    # already contains the necessary data (and potentially user modifications).
+                    # The presence of MARKER_FILE (created by the main container in a previous
+                    # successful run) indicates the PVC is already initialized. Checking for
+                    # it prevents unnecessary and time-consuming rsync operations on every restart.
+                    if [ ! -f "$MARKER_FILE" ]; then
+                      echo "Marker '$MARKER_FILE' not found. PVC likely needs initialization."
+                      echo "Copying initial home directory from image ($SOURCE_PATH/) to PVC ($DEST_PATH)..."
+
+                      # Use rsync with -rl (recursive, links) instead of -a (archive).
+                      # This avoids preserving times (-t) and permissions (-p) implied by -a,
+                      # which caused 'Operation not permitted' errors on the PVC root directory (/mnt/home).
+                      # Owner/group preservation (-o, -g) is also skipped (default for -rl), ensuring
+                      # files are owned by the container's user/group.
+                      rsync -rl "$SOURCE_PATH/" "$DEST_PATH"
+
+                      # Check if rsync failed
+                      if [ $? -ne 0 ]; then
+                        echo "ERROR: rsync failed during home directory initialization." >&2
+                        exit 1 # Exit initContainer with error if copy fails
+                      fi
+                      echo "Home directory initialization copy complete."
+                    else
+                      # If marker exists, skip the copy
+                      echo "Marker '$MARKER_FILE' found. Skipping initial home directory copy."
+                    fi
+                    echo "Current contents of $DEST_PATH:"
+                    ls -la "$DEST_PATH"
+                volumeMounts:
+                  # Mount the persistent volume claim into the initContainer
+                  - name: {{k8s_high_availability_deployment_volume_mount_name}}
+                    mountPath: /mnt/home # Temporary mount point for initialization
+              # should be replaced by pod spec
+      {% endif %}
     setup_commands:
       # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
       # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
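
Note: the high_availability template variable that gates the blocks above is driven by user config; the controller_utils.py hunk below reads it via skypilot_config.get_nested. A minimal sketch of that lookup for the serve controller, assuming a loaded config with serve.controller.high_availability set:

from sky import skypilot_config

# Assumed config shape (~/.sky/config.yaml):
#   serve:
#     controller:
#       high_availability: true
ha_enabled = False
if skypilot_config.loaded():
    ha_enabled = skypilot_config.get_nested(
        ('serve', 'controller', 'high_availability'), False)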
sky/utils/command_runner.py CHANGED
@@ -325,6 +325,7 @@ class CommandRunner:
         direction = 'up' if up else 'down'
         error_msg = (f'Failed to rsync {direction}: {source} -> {target}. '
                      'Ensure that the network is stable, then retry.')
+
         subprocess_utils.handle_returncode(returncode,
                                            command,
                                            error_msg,
@@ -718,6 +719,7 @@ class KubernetesCommandRunner(CommandRunner):
     def __init__(
         self,
         node: Tuple[Tuple[str, Optional[str]], str],
+        deployment: Optional[str] = None,
         **kwargs,
     ):
         """Initialize KubernetesCommandRunner.
@@ -733,11 +735,19 @@ class KubernetesCommandRunner(CommandRunner):
         del kwargs
         super().__init__(node)
         (self.namespace, self.context), self.pod_name = node
+        self.deployment = deployment

     @property
     def node_id(self) -> str:
         return f'{self.context}-{self.namespace}-{self.pod_name}'

+    @property
+    def kube_identifier(self) -> str:
+        if self.deployment is not None:
+            return f'deployment/{self.deployment}'
+        else:
+            return f'pod/{self.pod_name}'
+
     def port_forward_command(self,
                              port_forward: List[Tuple[int, int]],
                              connect_timeout: int = 1) -> List[str]:
@@ -758,11 +768,12 @@ class KubernetesCommandRunner(CommandRunner):
             kubectl_args += ['--context', self.context]
         local_port, remote_port = port_forward[0]
         local_port_str = f'{local_port}' if local_port is not None else ''
+
         kubectl_cmd = [
             'kubectl',
             *kubectl_args,
             'port-forward',
-            f'pod/{self.pod_name}',
+            self.kube_identifier,
             f'{local_port_str}:{remote_port}',
         ]
         return kubectl_cmd
@@ -785,7 +796,8 @@ class KubernetesCommandRunner(CommandRunner):
             source_bashrc: bool = False,
             skip_num_lines: int = 0,
             **kwargs) -> Union[int, Tuple[int, str, str]]:
-        """Uses 'kubectl exec' to run 'cmd' on a pod by its name and namespace.
+        """Uses 'kubectl exec' to run 'cmd' on a pod or deployment by its
+        name and namespace.

         Args:
             cmd: The command to run.
@@ -828,7 +840,9 @@ class KubernetesCommandRunner(CommandRunner):
         # case, need to set KUBECONFIG to /dev/null to avoid using kubeconfig.
         if self.context is None:
             kubectl_args += ['--kubeconfig', '/dev/null']
-        kubectl_args += [self.pod_name]
+
+        kubectl_args += [self.kube_identifier]
+
         if ssh_mode == SshMode.LOGIN:
             assert isinstance(cmd, list), 'cmd must be a list for login mode.'
             base_cmd = ['kubectl', 'exec', '-it', *kubectl_args, '--']
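
Note: with the deployment parameter above, kube_identifier switches kubectl targets from pod/<pod_name> to deployment/<deployment>. A rough usage sketch with hypothetical namespace, context, pod, and deployment values:

from sky.utils import command_runner

runner = command_runner.KubernetesCommandRunner(
    (('default', 'my-context'), 'my-pod'),
    deployment='my-cluster-deployment')
# Commands and port-forwards now target deployment/my-cluster-deployment.
returncode = runner.run('echo hello', require_outputs=False)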
sky/utils/command_runner.pyi CHANGED
@@ -206,6 +206,8 @@ class KubernetesCommandRunner(CommandRunner):
     def __init__(
         self,
         node: Tuple[Tuple[str, Optional[str]], str],
+        deployment: Optional[str] = ...,
+        **kwargs,
     ) -> None:
         ...

sky/utils/controller_utils.py CHANGED
@@ -193,6 +193,30 @@ class Controllers(enum.Enum):
         return None


+def high_availability_specified(cluster_name: Optional[str],
+                                skip_warning: bool = True) -> bool:
+    """Check if the controller high availability is specified in user config.
+    """
+    controller = Controllers.from_name(cluster_name)
+    if controller is None:
+        return False
+
+    if skypilot_config.loaded():
+        high_availability = skypilot_config.get_nested(
+            (controller.value.controller_type, 'controller',
+             'high_availability'), False)
+        if high_availability:
+            if controller.value.controller_type != 'serve':
+                if not skip_warning:
+                    print(f'{colorama.Fore.RED}High availability controller is'
+                          'only supported for SkyServe controller. It cannot'
+                          f'be enabled for {controller.value.name}.'
+                          f'Skipping this flag.{colorama.Style.RESET_ALL}')
+            else:
+                return True
+    return False
+
+
 # Install cli dependencies. Not using SkyPilot wheels because the wheel
 # can be cleaned up by another process.
 def _get_cloud_dependencies_installation_commands(
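
Note: a sketch of how the new helper might be called by provisioning code (the call site and wrapper name here are hypothetical; cluster_name would be a controller cluster name):

from sky.utils import controller_utils


def wants_ha_controller(cluster_name: str) -> bool:
    # True only for the SkyServe controller when the user config sets
    # serve.controller.high_availability; other controllers return False.
    return controller_utils.high_availability_specified(cluster_name,
                                                        skip_warning=True)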