skypilot-nightly 1.0.0.dev20250427__py3-none-any.whl → 1.0.0.dev20250428__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +19 -2
- sky/backends/cloud_vm_ray_backend.py +33 -8
- sky/backends/local_docker_backend.py +1 -2
- sky/cli.py +1 -1
- sky/client/cli.py +1 -1
- sky/clouds/aws.py +12 -6
- sky/clouds/azure.py +3 -0
- sky/clouds/cloud.py +3 -0
- sky/clouds/cudo.py +2 -0
- sky/clouds/do.py +3 -0
- sky/clouds/fluidstack.py +3 -0
- sky/clouds/gcp.py +7 -0
- sky/clouds/ibm.py +2 -0
- sky/clouds/kubernetes.py +38 -15
- sky/clouds/lambda_cloud.py +1 -0
- sky/clouds/nebius.py +2 -0
- sky/clouds/oci.py +6 -3
- sky/clouds/paperspace.py +2 -0
- sky/clouds/runpod.py +2 -0
- sky/clouds/scp.py +2 -0
- sky/clouds/vast.py +2 -0
- sky/clouds/vsphere.py +2 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/exceptions.py +6 -0
- sky/execution.py +19 -4
- sky/global_user_state.py +1 -0
- sky/provision/common.py +2 -5
- sky/provision/instance_setup.py +1 -1
- sky/provision/kubernetes/instance.py +276 -93
- sky/provision/kubernetes/network.py +1 -1
- sky/provision/kubernetes/utils.py +10 -0
- sky/provision/provisioner.py +6 -0
- sky/serve/replica_managers.py +51 -5
- sky/serve/serve_state.py +41 -0
- sky/serve/service.py +108 -63
- sky/server/requests/executor.py +4 -4
- sky/skylet/constants.py +7 -0
- sky/task.py +1 -1
- sky/templates/kubernetes-ray.yml.j2 +122 -2
- sky/utils/command_runner.py +17 -3
- sky/utils/command_runner.pyi +2 -0
- sky/utils/controller_utils.py +24 -0
- sky/utils/kubernetes/rsync_helper.sh +20 -4
- sky/utils/schemas.py +13 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/RECORD +59 -59
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/WHEEL +1 -1
- /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → 2f-jlOWR_G5mOwCF4RcZz}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{kTfCjujxwqIQ4b7YvP7Uq → 2f-jlOWR_G5mOwCF4RcZz}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250427.dist-info → skypilot_nightly-1.0.0.dev20250428.dist-info}/top_level.txt +0 -0
sky/serve/serve_state.py
CHANGED
@@ -479,6 +479,14 @@ def total_number_provisioning_replicas() -> int:
     return provisioning_count
 
 
+def get_replicas_at_status(
+    service_name: str,
+    status: ReplicaStatus,
+) -> List['replica_managers.ReplicaInfo']:
+    replicas = get_replica_infos(service_name)
+    return [replica for replica in replicas if replica.status == status]
+
+
 # === Version functions ===
 def add_version(service_name: str) -> int:
     """Adds a version to the database."""
@@ -549,3 +557,36 @@ def delete_all_versions(service_name: str) -> None:
         """\
         DELETE FROM version_specs
        WHERE service_name=(?)""", (service_name,))
+
+
+def get_latest_version(service_name: str) -> Optional[int]:
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        rows = cursor.execute(
+            """\
+            SELECT MAX(version) FROM version_specs
+            WHERE service_name=(?)""", (service_name,)).fetchall()
+        if not rows or rows[0][0] is None:
+            return None
+        return rows[0][0]
+
+
+def get_service_controller_port(service_name: str) -> int:
+    """Gets the controller port of a service."""
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        cursor.execute('SELECT controller_port FROM services WHERE name = ?',
+                       (service_name,))
+        row = cursor.fetchone()
+        if row is None:
+            raise ValueError(f'Service {service_name} does not exist.')
+        return row[0]
+
+
+def get_service_load_balancer_port(service_name: str) -> int:
+    """Gets the load balancer port of a service."""
+    with db_utils.safe_cursor(_DB_PATH) as cursor:
+        cursor.execute('SELECT load_balancer_port FROM services WHERE name = ?',
+                       (service_name,))
+        row = cursor.fetchone()
+        if row is None:
+            raise ValueError(f'Service {service_name} does not exist.')
+        return row[0]
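
The accessors added above expose state that previously lived only in the controller process: the latest recorded version and the ports persisted for a service. A minimal usage sketch follows; the service name is a placeholder and the surrounding flow is illustrative, not code from the package.

from sky.serve import serve_state

service_name = 'my-service'  # hypothetical service name

# Replicas of this service currently in a given status.
ready_replicas = serve_state.get_replicas_at_status(
    service_name, serve_state.ReplicaStatus.READY)
print(f'{len(ready_replicas)} ready replicas')

# Latest recorded version, or None if the service was never added.
print(serve_state.get_latest_version(service_name))

# Ports persisted when the service was first started; these raise a
# ValueError if the service does not exist.
print(serve_state.get_service_controller_port(service_name))
print(serve_state.get_service_load_balancer_port(service_name))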
sky/serve/service.py
CHANGED
@@ -25,6 +25,7 @@ from sky.serve import load_balancer
 from sky.serve import replica_managers
 from sky.serve import serve_state
 from sky.serve import serve_utils
+from sky.skylet import constants as skylet_constants
 from sky.utils import common_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
@@ -136,8 +137,25 @@ def _cleanup(service_name: str) -> bool:
     return failed
 
 
+def _cleanup_task_run_script(job_id: int) -> None:
+    """Clean up task run script.
+    Please see `kubernetes-ray.yml.j2` for more details.
+    """
+    task_run_dir = pathlib.Path(
+        skylet_constants.PERSISTENT_RUN_SCRIPT_DIR).expanduser()
+    if task_run_dir.exists():
+        this_task_run_script = task_run_dir / f'sky_job_{job_id}'
+        if this_task_run_script.exists():
+            this_task_run_script.unlink()
+            logger.info(f'Task run script {this_task_run_script} removed')
+        else:
+            logger.warning(f'Task run script {this_task_run_script} not found')
+
+
 def _start(service_name: str, tmp_task_yaml: str, job_id: int):
-    """Starts the service.
+    """Starts the service.
+    This including the controller and load balancer.
+    """
     # Generate ssh key pair to avoid race condition when multiple sky.launch
     # are executed at the same time.
     authentication.get_or_generate_keys()
@@ -147,62 +165,79 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
     # Already checked before submit to controller.
     assert task.service is not None, task
     service_spec = task.service
-    [removed block: original lines 150-169 are not rendered in this diff view]
-    # Add initial version information to the service state.
-    serve_state.add_or_update_version(service_name, constants.INITIAL_VERSION,
-                                      service_spec)
-
-    # Create the service working directory.
+
+    def is_recovery_mode(service_name: str) -> bool:
+        """Check if service exists in database to determine recovery mode.
+        """
+        service = serve_state.get_service_from_name(service_name)
+        return service is not None
+
+    is_recovery = is_recovery_mode(service_name)
+    logger.info(f'It is a {"first" if not is_recovery else "recovery"} run')
+
+    if is_recovery:
+        version = serve_state.get_latest_version(service_name)
+        if version is None:
+            raise ValueError(f'No version found for service {service_name}')
+    else:
+        version = constants.INITIAL_VERSION
+        # Add initial version information to the service state.
+        serve_state.add_or_update_version(service_name, version, service_spec)
+
     service_dir = os.path.expanduser(
         serve_utils.generate_remote_service_dir_name(service_name))
-    [removed block: original lines 177-191 are not rendered in this diff view]
+    task_yaml = serve_utils.generate_task_yaml_file_name(service_name, version)
+
+    if not is_recovery:
+        if (len(serve_state.get_services()) >=
+                serve_utils.get_num_service_threshold()):
+            cleanup_storage(tmp_task_yaml)
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError('Max number of services reached.')
+        success = serve_state.add_service(
+            service_name,
+            controller_job_id=job_id,
+            policy=service_spec.autoscaling_policy_str(),
+            requested_resources_str=backend_utils.get_task_resources_str(task),
+            load_balancing_policy=service_spec.load_balancing_policy,
+            status=serve_state.ServiceStatus.CONTROLLER_INIT,
+            tls_encrypted=service_spec.tls_credential is not None)
+        # Directly throw an error here. See sky/serve/api.py::up
+        # for more details.
+        if not success:
+            cleanup_storage(tmp_task_yaml)
+            with ux_utils.print_exception_no_traceback():
+                raise ValueError(f'Service {service_name} already exists.')
+
+        # Create the service working directory.
+        os.makedirs(service_dir, exist_ok=True)
+
+        # Copy the tmp task yaml file to the final task yaml file.
+        # This is for the service name conflict case. The _execute will
+        # sync file mounts first and then realized a name conflict. We
+        # don't want the new file mounts to overwrite the old one, so we
+        # sync to a tmp file first and then copy it to the final name
+        # if there is no name conflict.
+        shutil.copy(tmp_task_yaml, task_yaml)
 
     controller_process = None
     load_balancer_process = None
     try:
         with filelock.FileLock(
                 os.path.expanduser(constants.PORT_SELECTION_FILE_LOCK_PATH)):
-            [removed block: original lines 198-205 are not rendered in this diff view]
+            # Start the controller.
+            controller_port = (
+                common_utils.find_free_port(constants.CONTROLLER_PORT_START)
+                if not is_recovery else
+                serve_state.get_service_controller_port(service_name))
+
+            def _get_controller_host():
+                """Get the controller host address.
+                We expose the controller to the public network when running
+                inside a kubernetes cluster to allow external load balancers
+                (example, for high availability load balancers) to communicate
+                with the controller.
+                """
                 if 'KUBERNETES_SERVICE_HOST' in os.environ:
                     return '0.0.0.0'
                 # Not using localhost to avoid using ipv6 address and causing
@@ -211,26 +246,28 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
                 # ('::1', 20001, 0, 0): cannot assign requested address
                 return '127.0.0.1'
 
-            controller_host =
-
-            # Start the controller.
+            controller_host = _get_controller_host()
             controller_process = multiprocessing.Process(
                 target=controller.run_controller,
                 args=(service_name, service_spec, task_yaml, controller_host,
                       controller_port))
             controller_process.start()
-            serve_state.set_service_controller_port(service_name,
-                                                    controller_port)
 
-
+            if not is_recovery:
+                serve_state.set_service_controller_port(service_name,
+                                                        controller_port)
 
-
-                constants.LOAD_BALANCER_PORT_START)
-
-            # Extract the load balancing policy from the service spec
-            policy_name = service_spec.load_balancing_policy
+            controller_addr = f'http://{controller_host}:{controller_port}'
 
             # Start the load balancer.
+            load_balancer_port = (
+                common_utils.find_free_port(constants.LOAD_BALANCER_PORT_START)
+                if not is_recovery else
+                serve_state.get_service_load_balancer_port(service_name))
+            load_balancer_log_file = os.path.expanduser(
+                serve_utils.generate_remote_load_balancer_log_file_name(
+                    service_name))
+
             # TODO(tian): Probably we could enable multiple ports specified in
             # service spec and we could start multiple load balancers.
             # After that, we will have a mapping from replica port to endpoint.
@@ -238,11 +275,14 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
                 target=ux_utils.RedirectOutputForProcess(
                     load_balancer.run_load_balancer,
                     load_balancer_log_file).run,
-                args=(controller_addr, load_balancer_port,
+                args=(controller_addr, load_balancer_port,
+                      service_spec.load_balancing_policy,
                       service_spec.tls_credential))
             load_balancer_process.start()
-
-
+
+            if not is_recovery:
+                serve_state.set_service_load_balancer_port(
+                    service_name, load_balancer_port)
 
             while True:
                 _handle_signal(service_name)
@@ -262,6 +302,7 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
                 force=True)
             for process in process_to_kill:
                 process.join()
+
     failed = _cleanup(service_name)
     if failed:
         serve_state.set_service_status_and_active_versions(
@@ -273,8 +314,12 @@ def _start(service_name: str, tmp_task_yaml: str, job_id: int):
         serve_state.delete_all_versions(service_name)
         logger.info(f'Service {service_name} terminated successfully.')
 
+    _cleanup_task_run_script(job_id)
+
 
 if __name__ == '__main__':
+    logger.info('Starting service...')
+
     parser = argparse.ArgumentParser(description='Sky Serve Service')
     parser.add_argument('--service-name',
                         type=str,
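
Taken together, the changes to `_start` make controller startup idempotent across pod restarts: a first run allocates ports and records them, while a recovery run reads them back from the service state database. The sketch below condenses that branching into one helper for readability; it is a simplified illustration of the diff above, not code from the package.

from sky.serve import constants
from sky.serve import serve_state
from sky.utils import common_utils


def _choose_ports(service_name: str, is_recovery: bool):
    """Sketch: allocate fresh ports on a first run, reuse recorded ports on
    a recovery run (simplified from sky/serve/service.py::_start)."""
    if is_recovery:
        controller_port = serve_state.get_service_controller_port(service_name)
        load_balancer_port = serve_state.get_service_load_balancer_port(
            service_name)
    else:
        controller_port = common_utils.find_free_port(
            constants.CONTROLLER_PORT_START)
        load_balancer_port = common_utils.find_free_port(
            constants.LOAD_BALANCER_PORT_START)
        # In the actual code these are persisted after the processes start.
        serve_state.set_service_controller_port(service_name, controller_port)
        serve_state.set_service_load_balancer_port(service_name,
                                                   load_balancer_port)
    return controller_port, load_balancer_port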
sky/server/requests/executor.py
CHANGED
@@ -493,15 +493,15 @@ def start(deploy: bool) -> List[multiprocessing.Process]:
     # Determine the job capacity of the workers based on the system resources.
     cpu_count = common_utils.get_cpu_count()
     mem_size_gb = common_utils.get_mem_size_gb()
-
+    mem_for_workers = max(0, mem_size_gb - server_constants.MIN_AVAIL_MEM_GB)
     # Runs in low resource mode if the available memory is less than
     # server_constants.MIN_AVAIL_MEM_GB.
     max_parallel_for_long = _max_long_worker_parallism(cpu_count,
-
+                                                       mem_for_workers,
                                                        local=not deploy)
     max_parallel_for_short = _max_short_worker_parallism(
-
-    if mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
+        mem_for_workers, max_parallel_for_long)
+    if not deploy and mem_size_gb < server_constants.MIN_AVAIL_MEM_GB:
         # Permanent worker process may have significant memory consumption
         # (~350MB per worker) after running commands like `sky check`, so we
         # don't start any permanent workers in low resource local mode. This
sky/skylet/constants.py
CHANGED
@@ -368,6 +368,13 @@ ROLE_ASSIGNMENT_FAILURE_ERROR_MSG = (
     'Failed to assign Storage Blob Data Owner role to the '
     'storage account {storage_account_name}.')
 
+# Constants for path in K8S pod to store persistent setup and run scripts
+# so that we can run them again after the pod restarts.
+# Path within user home. For HA controller, assumes home directory is
+# persistent through PVC. See kubernetes-ray.yml.j2.
+PERSISTENT_SETUP_SCRIPT_PATH = '~/.sky/.controller_recovery_setup_commands.sh'
+PERSISTENT_RUN_SCRIPT_DIR = '~/.sky/.controller_recovery_task_run'
+
 # The placeholder for the local skypilot config path in file mounts for
 # controllers.
 LOCAL_SKYPILOT_CONFIG_PATH_PLACEHOLDER = 'skypilot:local_skypilot_config_path'
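
These paths are where the HA controller persists its setup and per-task run scripts inside the pod's home directory (which survives restarts via the PVC added in `kubernetes-ray.yml.j2`), and `_cleanup_task_run_script` in `sky/serve/service.py` removes the per-job run script when a service terminates. A small sketch of how a per-job script path is derived, using a placeholder job id:

import pathlib

from sky.skylet import constants as skylet_constants

job_id = 42  # hypothetical controller job id, for illustration only

run_script_dir = pathlib.Path(
    skylet_constants.PERSISTENT_RUN_SCRIPT_DIR).expanduser()
# Same per-job naming as service._cleanup_task_run_script in this release.
run_script = run_script_dir / f'sky_job_{job_id}'
print(run_script)  # expands to .../.sky/.controller_recovery_task_run/sky_job_42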
sky/task.py
CHANGED
@@ -306,7 +306,7 @@ class Task:
         self.service_name: Optional[str] = None
 
         # Filled in by the optimizer. If None, this Task is not planned.
-        self.best_resources = None
+        self.best_resources: Optional[sky.Resources] = None
 
         # For internal use only.
         self.file_mounts_mapping = file_mounts_mapping
sky/templates/kubernetes-ray.yml.j2
CHANGED
@@ -267,11 +267,14 @@ available_node_types:
         {%- for label_key, label_value in labels.items() %}
         {{ label_key }}: {{ label_value|tojson }}
         {%- endfor %}
+        {% if high_availability %}
+        app: {{cluster_name_on_cloud}}
+        {% endif %}
       spec:
         # serviceAccountName: skypilot-service-account
         serviceAccountName: {{k8s_service_account_name}}
         automountServiceAccountToken: {{k8s_automount_sa_token}}
-        restartPolicy: Never
+        restartPolicy: {{ "Always" if high_availability else "Never" }}
 
         # Add node selector if GPU/TPUs are requested:
         {% if (k8s_acc_label_key is not none and k8s_acc_label_value is not none) or (k8s_spot_label_key is not none) %}
@@ -311,6 +314,11 @@ available_node_types:
             path: {{k8s_fusermount_shared_dir}}
             type: DirectoryOrCreate
         {% endif %}
+        {% if high_availability %}
+        - name: {{k8s_high_availability_deployment_volume_mount_name}}
+          persistentVolumeClaim:
+            claimName: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
+        {% endif %}
         containers:
         - name: ray-node
           imagePullPolicy: IfNotPresent
@@ -470,6 +478,7 @@ available_node_types:
             done
             {{ conda_installation_commands }}
             {{ ray_installation_commands }}
+
             VIRTUAL_ENV=~/skypilot-runtime ~/.local/bin/uv pip install skypilot[kubernetes,remote]
             touch /tmp/ray_skypilot_installation_complete
             echo "=== Ray and skypilot installation completed ==="
@@ -573,6 +582,28 @@ available_node_types:
                 sleep 0.1
               done
             }
+
+            {% if high_availability %}
+            mkdir -p {{k8s_high_availability_deployment_run_script_dir}}
+            if [ -f {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready ]; then
+              # ! Keep this aligned with `CloudVmRayBackend._setup()`
+              # Suppose all `task.setup` are the same for skyserve controller task.
+              # So be careful for compatibility issue once you change it.
+              chmod +x {{k8s_high_availability_deployment_setup_script_path}}
+              /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && {{k8s_high_availability_deployment_setup_script_path}} > /tmp/controller_recovery_setup_commands.log 2>&1"
+              echo "=== Controller setup commands completed for recovery ==="
+
+              for file in {{k8s_high_availability_deployment_run_script_dir}}/*; do
+                # ! Keep this aligned with `CloudVmRayBackend._execute()`
+                chmod +x $file
+                /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && $file > /tmp/task_run_$(basename $file).log 2>&1"
+                echo "=== Controller task run for service (file: $file) completed for recovery ==="
+              done
+            fi
+
+            touch {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready
+            {% endif %}
+
             trap : TERM INT; log_tail || sleep infinity & wait
 
           ports:
@@ -593,6 +624,10 @@ available_node_types:
           # /tmp which cause slowdowns if is not a shared memory volume.
           - mountPath: /dev/shm
             name: dshm
+          {% if high_availability %}
+          - name: {{k8s_high_availability_deployment_volume_mount_name}}
+            mountPath: {{k8s_high_availability_deployment_volume_mount_path}}
+          {% endif %}
           {% if k8s_fuse_device_required %}
           - name: fusermount-shared-dir
             mountPath: {{k8s_fusermount_shared_dir}}
@@ -616,7 +651,92 @@ available_node_types:
               {{k8s_resource_key}}: {{accelerator_count}}
             {% endif %}
           {% endif %}
-
+
+  {% if high_availability %}
+  pvc_spec:
+    apiVersion: v1
+    kind: PersistentVolumeClaim
+    metadata:
+      name: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
+      namespace: {{k8s_namespace}}
+    spec:
+      accessModes:
+        - ReadWriteOnce  # Our controller pod is singleton
+      {% if k8s_high_availability_storage_class_name is not none %}
+      storageClassName: {{k8s_high_availability_storage_class_name}}
+      {% endif %}
+      resources:
+        requests:
+          storage: {{disk_size}}Gi
+
+  deployment_spec:
+    apiVersion: apps/v1
+    kind: Deployment
+    metadata:
+      name: {{cluster_name_on_cloud}}-deployment
+      namespace: {{k8s_namespace}}
+    spec:
+      replicas: 1
+      selector:
+        matchLabels:
+          app: {{cluster_name_on_cloud}}
+      template:
+        # The only difference between the pod spec and this section is the initContainers
+        metadata:
+          # should be replaced by pod metadata
+        spec:
+          securityContext:
+            fsGroup: 1000
+          # To prevent the home dir provided by the docker image from being overriden by pvc mounting,
+          # we use initContainers to copy it first to /mnt/home, which will later be mounted to home dir.
+          initContainers:
+          - name: init-copy-home
+            image: {{image_id}}
+            command: ["/bin/sh", "-c"]
+            args:
+              - |
+                # Define path for the marker file created by the main container upon successful startup.
+                # This file persists in the PVC across Pod restarts.
+                MARKER_FILE="/mnt/home/k8s_container_ready"
+                SOURCE_PATH="{{k8s_high_availability_deployment_volume_mount_path}}"
+                DEST_PATH="/mnt/home"
+
+                # We only need to copy the initial home directory contents from the image
+                # the *first* time a Pod uses a *new* PVC.
+                # On subsequent Pod starts (e.g., after a crash or update), the PVC
+                # already contains the necessary data (and potentially user modifications).
+                # The presence of MARKER_FILE (created by the main container in a previous
+                # successful run) indicates the PVC is already initialized. Checking for
+                # it prevents unnecessary and time-consuming rsync operations on every restart.
+                if [ ! -f "$MARKER_FILE" ]; then
+                  echo "Marker '$MARKER_FILE' not found. PVC likely needs initialization."
+                  echo "Copying initial home directory from image ($SOURCE_PATH/) to PVC ($DEST_PATH)..."
+
+                  # Use rsync with -rl (recursive, links) instead of -a (archive).
+                  # This avoids preserving times (-t) and permissions (-p) implied by -a,
+                  # which caused 'Operation not permitted' errors on the PVC root directory (/mnt/home).
+                  # Owner/group preservation (-o, -g) is also skipped (default for -rl), ensuring
+                  # files are owned by the container's user/group.
+                  rsync -rl "$SOURCE_PATH/" "$DEST_PATH"
+
+                  # Check if rsync failed
+                  if [ $? -ne 0 ]; then
+                    echo "ERROR: rsync failed during home directory initialization." >&2
+                    exit 1  # Exit initContainer with error if copy fails
+                  fi
+                  echo "Home directory initialization copy complete."
+                else
+                  # If marker exists, skip the copy
+                  echo "Marker '$MARKER_FILE' found. Skipping initial home directory copy."
+                fi
+                echo "Current contents of $DEST_PATH:"
+                ls -la "$DEST_PATH"
+            volumeMounts:
+              # Mount the persistent volume claim into the initContainer
+              - name: {{k8s_high_availability_deployment_volume_mount_name}}
+                mountPath: /mnt/home  # Temporary mount point for initialization
+          # should be replaced by pod spec
+  {% endif %}
 setup_commands:
   # Disable `unattended-upgrades` to prevent apt-get from hanging. It should be called at the beginning before the process started to avoid being blocked. (This is a temporary fix.)
   # Add ~/.ssh/sky-cluster-key to SSH config to allow nodes within a cluster to connect to each other
sky/utils/command_runner.py
CHANGED
@@ -325,6 +325,7 @@ class CommandRunner:
         direction = 'up' if up else 'down'
         error_msg = (f'Failed to rsync {direction}: {source} -> {target}. '
                      'Ensure that the network is stable, then retry.')
+
         subprocess_utils.handle_returncode(returncode,
                                            command,
                                            error_msg,
@@ -718,6 +719,7 @@ class KubernetesCommandRunner(CommandRunner):
     def __init__(
         self,
         node: Tuple[Tuple[str, Optional[str]], str],
+        deployment: Optional[str] = None,
         **kwargs,
     ):
         """Initialize KubernetesCommandRunner.
@@ -733,11 +735,19 @@ class KubernetesCommandRunner(CommandRunner):
         del kwargs
         super().__init__(node)
         (self.namespace, self.context), self.pod_name = node
+        self.deployment = deployment
 
     @property
     def node_id(self) -> str:
         return f'{self.context}-{self.namespace}-{self.pod_name}'
 
+    @property
+    def kube_identifier(self) -> str:
+        if self.deployment is not None:
+            return f'deployment/{self.deployment}'
+        else:
+            return f'pod/{self.pod_name}'
+
     def port_forward_command(self,
                              port_forward: List[Tuple[int, int]],
                              connect_timeout: int = 1) -> List[str]:
@@ -758,11 +768,12 @@ class KubernetesCommandRunner(CommandRunner):
             kubectl_args += ['--context', self.context]
         local_port, remote_port = port_forward[0]
         local_port_str = f'{local_port}' if local_port is not None else ''
+
         kubectl_cmd = [
             'kubectl',
             *kubectl_args,
             'port-forward',
-
+            self.kube_identifier,
             f'{local_port_str}:{remote_port}',
         ]
         return kubectl_cmd
@@ -785,7 +796,8 @@ class KubernetesCommandRunner(CommandRunner):
                 source_bashrc: bool = False,
                 skip_num_lines: int = 0,
                 **kwargs) -> Union[int, Tuple[int, str, str]]:
-        """Uses 'kubectl exec' to run 'cmd' on a pod by its
+        """Uses 'kubectl exec' to run 'cmd' on a pod or deployment by its
+        name and namespace.
 
         Args:
             cmd: The command to run.
@@ -828,7 +840,9 @@ class KubernetesCommandRunner(CommandRunner):
         # case, need to set KUBECONFIG to /dev/null to avoid using kubeconfig.
         if self.context is None:
             kubectl_args += ['--kubeconfig', '/dev/null']
-
+
+        kubectl_args += [self.kube_identifier]
+
         if ssh_mode == SshMode.LOGIN:
             assert isinstance(cmd, list), 'cmd must be a list for login mode.'
             base_cmd = ['kubectl', 'exec', '-it', *kubectl_args, '--']
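
With the new `deployment` argument, the runner targets `deployment/<name>` instead of `pod/<name>` in `kubectl exec` and `kubectl port-forward`, which is what lets the HA controller be reached through its Deployment rather than a specific pod. A minimal sketch of the selection logic, using placeholder namespace, context, pod, and deployment names:

from sky.utils import command_runner

# ((namespace, context), pod_name) -- all values here are placeholders.
node = (('default', 'my-context'), 'my-pod')

pod_runner = command_runner.KubernetesCommandRunner(node)
deployment_runner = command_runner.KubernetesCommandRunner(
    node, deployment='my-deployment')

print(pod_runner.kube_identifier)         # pod/my-pod
print(deployment_runner.kube_identifier)  # deployment/my-deployment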
sky/utils/command_runner.pyi
CHANGED
sky/utils/controller_utils.py
CHANGED
@@ -193,6 +193,30 @@ class Controllers(enum.Enum):
         return None
 
 
+def high_availability_specified(cluster_name: Optional[str],
+                                skip_warning: bool = True) -> bool:
+    """Check if the controller high availability is specified in user config.
+    """
+    controller = Controllers.from_name(cluster_name)
+    if controller is None:
+        return False
+
+    if skypilot_config.loaded():
+        high_availability = skypilot_config.get_nested(
+            (controller.value.controller_type, 'controller',
+             'high_availability'), False)
+        if high_availability:
+            if controller.value.controller_type != 'serve':
+                if not skip_warning:
+                    print(f'{colorama.Fore.RED}High availability controller is'
+                          'only supported for SkyServe controller. It cannot'
+                          f'be enabled for {controller.value.name}.'
+                          f'Skipping this flag.{colorama.Style.RESET_ALL}')
+            else:
+                return True
+    return False
+
+
 # Install cli dependencies. Not using SkyPilot wheels because the wheel
 # can be cleaned up by another process.
 def _get_cloud_dependencies_installation_commands(
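
`high_availability_specified` reads the nested key `(<controller_type>, 'controller', 'high_availability')` from the loaded SkyPilot config, so for the SkyServe controller the relevant user setting is `serve.controller.high_availability`; any other controller type only triggers a warning. A usage sketch with a placeholder controller cluster name:

from sky.utils import controller_utils

# Placeholder name; in practice this is the SkyServe controller cluster name
# recognized by Controllers.from_name().
cluster_name = 'sky-serve-controller-abcd1234'

if controller_utils.high_availability_specified(cluster_name,
                                                skip_warning=False):
    print('High availability requested for the SkyServe controller.')
else:
    print('High availability not enabled (or not a serve controller).')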
|