skypilot-nightly 1.0.0.dev20250114__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/backends/cloud_vm_ray_backend.py +50 -67
- sky/check.py +31 -1
- sky/cli.py +11 -34
- sky/clouds/kubernetes.py +3 -3
- sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
- sky/core.py +8 -5
- sky/data/storage.py +66 -14
- sky/global_user_state.py +1 -1
- sky/jobs/constants.py +8 -7
- sky/jobs/controller.py +19 -22
- sky/jobs/core.py +0 -2
- sky/jobs/recovery_strategy.py +114 -143
- sky/jobs/scheduler.py +283 -0
- sky/jobs/state.py +263 -21
- sky/jobs/utils.py +338 -96
- sky/provision/aws/config.py +48 -26
- sky/provision/gcp/instance_utils.py +15 -9
- sky/provision/kubernetes/instance.py +1 -1
- sky/provision/kubernetes/utils.py +76 -18
- sky/resources.py +1 -1
- sky/serve/autoscalers.py +359 -301
- sky/serve/controller.py +10 -8
- sky/serve/core.py +84 -7
- sky/serve/load_balancer.py +27 -10
- sky/serve/replica_managers.py +1 -3
- sky/serve/serve_state.py +10 -5
- sky/serve/serve_utils.py +28 -1
- sky/serve/service.py +4 -3
- sky/serve/service_spec.py +31 -0
- sky/skylet/constants.py +1 -1
- sky/skylet/events.py +7 -3
- sky/skylet/job_lib.py +10 -30
- sky/skylet/log_lib.py +8 -8
- sky/skylet/log_lib.pyi +3 -0
- sky/skylet/skylet.py +1 -1
- sky/templates/jobs-controller.yaml.j2 +7 -3
- sky/templates/sky-serve-controller.yaml.j2 +4 -0
- sky/utils/db_utils.py +18 -4
- sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
- sky/utils/resources_utils.py +25 -21
- sky/utils/schemas.py +13 -0
- sky/utils/subprocess_utils.py +48 -9
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +4 -1
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +49 -48
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/jobs/controller.py
CHANGED
@@ -16,6 +16,7 @@ from sky import status_lib
 from sky.backends import backend_utils
 from sky.backends import cloud_vm_ray_backend
 from sky.jobs import recovery_strategy
+from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
 from sky.skylet import constants
@@ -46,12 +47,10 @@ def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
 class JobsController:
     """Each jobs controller manages the life cycle of one managed job."""
 
-    def __init__(self, job_id: int, dag_yaml: str,
-                 retry_until_up: bool) -> None:
+    def __init__(self, job_id: int, dag_yaml: str) -> None:
         self._job_id = job_id
         self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
         logger.info(self._dag)
-        self._retry_until_up = retry_until_up
         # TODO(zhwu): this assumes the specific backend.
         self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
 
@@ -174,7 +173,7 @@ class JobsController:
         cluster_name = managed_job_utils.generate_managed_job_cluster_name(
             task.name, self._job_id)
         self._strategy_executor = recovery_strategy.StrategyExecutor.make(
-            cluster_name, self._backend, task, self._retry_until_up)
+            cluster_name, self._backend, task, self._job_id)
         managed_job_state.set_submitted(
             self._job_id,
             task_id,
@@ -202,6 +201,7 @@ class JobsController:
             task_id=task_id,
             start_time=remote_job_submitted_at,
             callback_func=callback_func)
+
         while True:
             time.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS)
 
@@ -243,7 +243,7 @@ class JobsController:
                 self._download_log_and_stream(task_id, handle)
                 # Only clean up the cluster, not the storages, because tasks may
                 # share storages.
-                recovery_strategy.terminate_cluster(cluster_name=cluster_name)
+                managed_job_utils.terminate_cluster(cluster_name=cluster_name)
                 return True
 
             # For single-node jobs, non-terminated job_status indicates a
@@ -256,9 +256,7 @@ class JobsController:
                     task.num_nodes == 1):
                 continue
 
-            if job_status in [
-                    job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
-            ]:
+            if job_status in job_lib.JobStatus.user_code_failure_states():
                 # Add a grace period before the check of preemption to avoid
                 # false alarm for job failure.
                 time.sleep(5)
@@ -288,9 +286,7 @@ class JobsController:
                 if job_status is not None and not job_status.is_terminal():
                     # The multi-node job is still running, continue monitoring.
                     continue
-                elif job_status in [
-                        job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
-                ]:
+                elif job_status in job_lib.JobStatus.user_code_failure_states():
                     # The user code has probably crashed, fail immediately.
                     end_time = managed_job_utils.get_job_timestamp(
                         self._backend, cluster_name, get_end_time=True)
@@ -346,7 +342,7 @@ class JobsController:
            # those clusters again may fail.
            logger.info('Cleaning up the preempted or failed cluster'
                        '...')
-           recovery_strategy.terminate_cluster(cluster_name)
+           managed_job_utils.terminate_cluster(cluster_name)
 
            # Try to recover the managed jobs, when the cluster is preempted or
            # failed or the job status is failed to be fetched.
@@ -428,11 +424,11 @@ class JobsController:
                 task=self._dag.tasks[task_id]))
 
 
-def _run_controller(job_id: int, dag_yaml: str, retry_until_up: bool):
+def _run_controller(job_id: int, dag_yaml: str):
    """Runs the controller in a remote process for interruption."""
    # The controller needs to be instantiated in the remote process, since
    # the controller is not serializable.
-   jobs_controller = JobsController(job_id, dag_yaml, retry_until_up)
+   jobs_controller = JobsController(job_id, dag_yaml)
    jobs_controller.run()
 
 
@@ -482,17 +478,18 @@ def _cleanup(job_id: int, dag_yaml: str):
        assert task.name is not None, task
        cluster_name = managed_job_utils.generate_managed_job_cluster_name(
            task.name, job_id)
-       recovery_strategy.terminate_cluster(cluster_name)
+       managed_job_utils.terminate_cluster(cluster_name)
        # Clean up Storages with persistent=False.
        # TODO(zhwu): this assumes the specific backend.
        backend = cloud_vm_ray_backend.CloudVmRayBackend()
        backend.teardown_ephemeral_storage(task)
 
 
-def start(job_id, dag_yaml, retry_until_up):
+def start(job_id, dag_yaml):
    """Start the controller."""
    controller_process = None
    cancelling = False
+   task_id = None
    try:
        _handle_signal(job_id)
        # TODO(suquark): In theory, we should make controller process a
@@ -502,8 +499,7 @@ def start(job_id, dag_yaml, retry_until_up):
        # So we can only enable daemon after we no longer need to
        # start daemon processes like Ray.
        controller_process = multiprocessing.Process(target=_run_controller,
-                                                    args=(job_id, dag_yaml,
-                                                          retry_until_up))
+                                                    args=(job_id, dag_yaml))
        controller_process.start()
        while controller_process.is_alive():
            _handle_signal(job_id)
@@ -511,6 +507,7 @@ def start(job_id, dag_yaml, retry_until_up):
    except exceptions.ManagedJobUserCancelledError:
        dag, _ = _get_dag_and_name(dag_yaml)
        task_id, _ = managed_job_state.get_latest_task_id_status(job_id)
+       assert task_id is not None, job_id
        logger.info(
            f'Cancelling managed job, job_id: {job_id}, task_id: {task_id}')
        managed_job_state.set_cancelling(
@@ -542,6 +539,7 @@ def start(job_id, dag_yaml, retry_until_up):
        logger.info(f'Cluster of managed job {job_id} has been cleaned up.')
 
        if cancelling:
+           assert task_id is not None, job_id  # Since it's set with cancelling
            managed_job_state.set_cancelled(
                job_id=job_id,
                callback_func=managed_job_utils.event_callback_func(
@@ -563,6 +561,8 @@ def start(job_id, dag_yaml, retry_until_up):
                failure_reason=('Unexpected error occurred. For details, '
                                f'run: sky jobs logs --controller {job_id}'))
 
+   scheduler.job_done(job_id)
+
 
 if __name__ == '__main__':
    parser = argparse.ArgumentParser()
@@ -570,9 +570,6 @@ if __name__ == '__main__':
                        required=True,
                        type=int,
                        help='Job id for the controller job.')
-   parser.add_argument('--retry-until-up',
-                       action='store_true',
-                       help='Retry until the cluster is up.')
    parser.add_argument('dag_yaml',
                        type=str,
                        help='The path to the user job yaml file.')
@@ -580,4 +577,4 @@
    # We start process with 'spawn', because 'fork' could result in weird
    # behaviors; 'spawn' is also cross-platform.
    multiprocessing.set_start_method('spawn', force=True)
-   start(args.job_id, args.dag_yaml, args.retry_until_up)
+   start(args.job_id, args.dag_yaml)
sky/jobs/core.py
CHANGED
@@ -41,7 +41,6 @@ def launch(
     name: Optional[str] = None,
     stream_logs: bool = True,
     detach_run: bool = False,
-    retry_until_up: bool = False,
     # TODO(cooperc): remove fast arg before 0.8.0
     fast: bool = True,  # pylint: disable=unused-argument for compatibility
 ) -> None:
@@ -115,7 +114,6 @@ def launch(
             'jobs_controller': controller_name,
             # Note: actual cluster name will be <task.name>-<managed job ID>
             'dag_name': dag.name,
-            'retry_until_up': retry_until_up,
             'remote_user_config_path': remote_user_config_path,
             'modified_catalogs':
                 service_catalog_common.get_modified_catalog_file_mounts(),
sky/jobs/recovery_strategy.py
CHANGED
@@ -17,6 +17,7 @@ from sky import global_user_state
 from sky import sky_logging
 from sky import status_lib
 from sky.backends import backend_utils
+from sky.jobs import scheduler
 from sky.jobs import utils as managed_job_utils
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -42,45 +43,20 @@ MAX_JOB_CHECKING_RETRY = 10
 _AUTODOWN_MINUTES = 5
 
 
-def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None:
-    """Terminate the cluster."""
-    retry_cnt = 0
-    while True:
-        try:
-            usage_lib.messages.usage.set_internal()
-            sky.down(cluster_name)
-            return
-        except exceptions.ClusterDoesNotExist:
-            # The cluster is already down.
-            logger.debug(f'The cluster {cluster_name} is already down.')
-            return
-        except Exception as e:  # pylint: disable=broad-except
-            retry_cnt += 1
-            if retry_cnt >= max_retry:
-                raise RuntimeError(
-                    f'Failed to terminate the cluster {cluster_name}.') from e
-            logger.error(
-                f'Failed to terminate the cluster {cluster_name}. Retrying.'
-                f'Details: {common_utils.format_exception(e)}')
-            with ux_utils.enable_traceback():
-                logger.error(f'  Traceback: {traceback.format_exc()}')
-
-
 class StrategyExecutor:
     """Handle the launching, recovery and termination of managed job clusters"""
 
     RETRY_INIT_GAP_SECONDS = 60
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
-                 task: 'task_lib.Task', retry_until_up: bool,
-                 max_restarts_on_errors: int) -> None:
+                 task: 'task_lib.Task', max_restarts_on_errors: int,
+                 job_id: int) -> None:
         """Initialize the strategy executor.
 
         Args:
             cluster_name: The name of the cluster.
             backend: The backend to use. Only CloudVMRayBackend is supported.
             task: The task to execute.
-            retry_until_up: Whether to retry until the cluster is up.
         """
         assert isinstance(backend, backends.CloudVmRayBackend), (
             'Only CloudVMRayBackend is supported.')
@@ -88,8 +64,8 @@ class StrategyExecutor:
         self.dag.add(task)
         self.cluster_name = cluster_name
         self.backend = backend
-        self.retry_until_up = retry_until_up
         self.max_restarts_on_errors = max_restarts_on_errors
+        self.job_id = job_id
         self.restart_cnt_on_failure = 0
 
     def __init_subclass__(cls, name: str, default: bool = False):
@@ -102,7 +78,7 @@ class StrategyExecutor:
 
     @classmethod
     def make(cls, cluster_name: str, backend: 'backends.Backend',
-             task: 'task_lib.Task', retry_until_up: bool) -> 'StrategyExecutor':
+             task: 'task_lib.Task', job_id: int) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
         resource_list = list(task.resources)
@@ -127,8 +103,9 @@ class StrategyExecutor:
             job_recovery_name = job_recovery
             max_restarts_on_errors = 0
         return RECOVERY_STRATEGIES[job_recovery_name](cluster_name, backend,
-                                                      task, retry_until_up,
-                                                      max_restarts_on_errors)
+                                                      task,
+                                                      max_restarts_on_errors,
+                                                      job_id)
 
     def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -142,10 +119,7 @@ class StrategyExecutor:
         Raises: Please refer to the docstring of self._launch().
         """
 
-        if self.retry_until_up:
-            job_submit_at = self._launch(max_retry=None)
-        else:
-            job_submit_at = self._launch()
+        job_submit_at = self._launch(max_retry=None)
         assert job_submit_at is not None
         return job_submit_at
 
@@ -195,7 +169,7 @@ class StrategyExecutor:
                 f'{common_utils.format_exception(e)}\n'
                 'Terminating the cluster explicitly to ensure no '
                 'remaining job process interferes with recovery.')
-            terminate_cluster(self.cluster_name)
+            managed_job_utils.terminate_cluster(self.cluster_name)
 
     def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
         """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
@@ -304,89 +278,96 @@ class StrategyExecutor:
         backoff = common_utils.Backoff(self.RETRY_INIT_GAP_SECONDS)
         while True:
             retry_cnt += 1
-            try:
-                usage_lib.messages.usage.set_internal()
-                # Detach setup, so that the setup failure can be detected
-                # by the controller process (job_status -> FAILED_SETUP).
-                sky.launch(
-                    self.dag,
-                    cluster_name=self.cluster_name,
-                    # We expect to tear down the cluster as soon as the job
-                    # is finished. However, in case the controller dies, set
-                    # autodown to try and avoid a resource leak.
-                    idle_minutes_to_autostop=_AUTODOWN_MINUTES,
-                    down=True,
-                    detach_setup=True,
-                    detach_run=True,
-                    _is_launched_by_jobs_controller=True)
-                logger.info('Managed job cluster launched.')
-            except (exceptions.InvalidClusterNameError,
-                    exceptions.NoCloudAccessError,
-                    exceptions.ResourcesMismatchError) as e:
-                logger.error('Failure happened before provisioning. '
-                             f'{common_utils.format_exception(e)}')
-                if raise_on_failure:
-                    raise exceptions.ProvisionPrechecksError(reasons=[e])
-                return None
-            except exceptions.ResourcesUnavailableError as e:
-                # This is raised when the launch fails due to prechecks or
-                # after failing over through all the candidates.
-                # Please refer to the docstring of `sky.launch` for more
-                # details of how the exception will be structured.
-                if not any(
-                        isinstance(err, exceptions.ResourcesUnavailableError)
-                        for err in e.failover_history):
-                    # _launch() (this function) should fail/exit directly, if
-                    # none of the failover reasons were because of resource
-                    # unavailability or no failover was attempted (the optimizer
-                    # cannot find feasible resources for requested resources),
-                    # i.e., e.failover_history is empty.
-                    # Failing directly avoids the infinite loop of retrying
-                    # the launch when, e.g., an invalid cluster name is used
-                    # and --retry-until-up is specified.
-                    reasons = (e.failover_history
-                               if e.failover_history else [e])
-                    reasons_str = '; '.join(
-                        common_utils.format_exception(err) for err in reasons)
-                    logger.error(
-                        'Failure happened before provisioning. Failover '
-                        f'reasons: {reasons_str}')
+            with scheduler.scheduled_launch(self.job_id):
+                try:
+                    usage_lib.messages.usage.set_internal()
+                    # Detach setup, so that the setup failure can be detected
+                    # by the controller process (job_status -> FAILED_SETUP).
+                    sky.launch(
+                        self.dag,
+                        cluster_name=self.cluster_name,
+                        # We expect to tear down the cluster as soon as the job
+                        # is finished. However, in case the controller dies, set
+                        # autodown to try and avoid a resource leak.
+                        idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                        down=True,
+                        detach_setup=True,
+                        detach_run=True,
+                        _is_launched_by_jobs_controller=True)
+                    logger.info('Managed job cluster launched.')
+                except (exceptions.InvalidClusterNameError,
+                        exceptions.NoCloudAccessError,
+                        exceptions.ResourcesMismatchError) as e:
+                    logger.error('Failure happened before provisioning. '
                                 f'{common_utils.format_exception(e)}')
                    if raise_on_failure:
-                        raise exceptions.ProvisionPrechecksError(reasons)
-                    return None
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-            except Exception as e:  # pylint: disable=broad-except
-                # If the launch fails, it will be recovered by the following
-                # code.
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-                with ux_utils.enable_traceback():
-                    logger.info(f'  Traceback: {traceback.format_exc()}')
-            else:  # No exception, the launch succeeds.
-                # At this point, a sky.launch() has succeeded. Cluster may be
-                # UP (no preemption since) or DOWN (newly preempted).
-                job_submitted_at = self._wait_until_job_starts_on_cluster()
-                if job_submitted_at is not None:
-                    return job_submitted_at
-                # The job fails to start on the cluster, retry the launch.
-                # TODO(zhwu): log the unexpected error to usage collection
-                # for future debugging.
-                logger.info(
-                    'Failed to successfully submit the job to the '
-                    'launched cluster, due to unexpected submission errors or '
-                    'the cluster being preempted during job submission.')
-
-            terminate_cluster(self.cluster_name)
-            if max_retry is not None and retry_cnt >= max_retry:
-                # Retry forever if max_retry is None.
-                if raise_on_failure:
-                    with ux_utils.print_exception_no_traceback():
-                        raise exceptions.ManagedJobReachedMaxRetriesError(
-                            'Resources unavailable: failed to launch clusters '
-                            f'after {max_retry} retries.')
-                else:
+                        raise exceptions.ProvisionPrechecksError(reasons=[e])
                    return None
+                except exceptions.ResourcesUnavailableError as e:
+                    # This is raised when the launch fails due to prechecks or
+                    # after failing over through all the candidates.
+                    # Please refer to the docstring of `sky.launch` for more
+                    # details of how the exception will be structured.
+                    if not any(
+                            isinstance(err,
+                                       exceptions.ResourcesUnavailableError)
+                            for err in e.failover_history):
+                        # _launch() (this function) should fail/exit directly,
+                        # if none of the failover reasons were because of
+                        # resource unavailability or no failover was attempted
+                        # (the optimizer cannot find feasible resources for
+                        # requested resources), i.e., e.failover_history is
+                        # empty. Failing directly avoids the infinite loop of
+                        # retrying the launch when, e.g., an invalid cluster
+                        # name is used and --retry-until-up is specified.
+                        reasons = (e.failover_history
+                                   if e.failover_history else [e])
+                        reasons_str = '; '.join(
+                            common_utils.format_exception(err)
+                            for err in reasons)
+                        logger.error(
+                            'Failure happened before provisioning. Failover '
+                            f'reasons: {reasons_str}')
+                        if raise_on_failure:
+                            raise exceptions.ProvisionPrechecksError(reasons)
+                        return None
+                    logger.info('Failed to launch a cluster with error: '
+                                f'{common_utils.format_exception(e)})')
+                except Exception as e:  # pylint: disable=broad-except
+                    # If the launch fails, it will be recovered by the following
+                    # code.
+                    logger.info('Failed to launch a cluster with error: '
+                                f'{common_utils.format_exception(e)})')
+                    with ux_utils.enable_traceback():
+                        logger.info(f'  Traceback: {traceback.format_exc()}')
+                else:  # No exception, the launch succeeds.
+                    # At this point, a sky.launch() has succeeded. Cluster may
+                    # be UP (no preemption since) or DOWN (newly preempted).
+                    job_submitted_at = self._wait_until_job_starts_on_cluster()
+                    if job_submitted_at is not None:
+                        return job_submitted_at
+                    # The job fails to start on the cluster, retry the launch.
+                    # TODO(zhwu): log the unexpected error to usage collection
+                    # for future debugging.
+                    logger.info(
+                        'Failed to successfully submit the job to the '
+                        'launched cluster, due to unexpected submission errors '
+                        'or the cluster being preempted during job submission.')
+
+                # If we get here, the launch did not succeed. Tear down the
+                # cluster and retry.
+                managed_job_utils.terminate_cluster(self.cluster_name)
+                if max_retry is not None and retry_cnt >= max_retry:
+                    # Retry forever if max_retry is None.
+                    if raise_on_failure:
+                        with ux_utils.print_exception_no_traceback():
+                            raise exceptions.ManagedJobReachedMaxRetriesError(
+                                'Resources unavailable: failed to launch '
+                                f'clusters after {max_retry} retries.')
+                    else:
+                        return None
+            # Exit the scheduled_launch context so that the scheulde state is
+            # ALIVE during the backoff. This allows other jobs to launch.
             gap_seconds = backoff.current_backoff()
             logger.info('Retrying to launch the cluster in '
                         f'{gap_seconds:.1f} seconds.')
@@ -411,10 +392,10 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
     _MAX_RETRY_CNT = 240  # Retry for 4 hours.
 
     def __init__(self, cluster_name: str, backend: 'backends.Backend',
-                 task: 'task_lib.Task', retry_until_up: bool,
-                 max_restarts_on_errors: int) -> None:
-        super().__init__(cluster_name, backend, task, retry_until_up,
-                         max_restarts_on_errors)
+                 task: 'task_lib.Task', max_restarts_on_errors: int,
+                 job_id: int) -> None:
+        super().__init__(cluster_name, backend, task, max_restarts_on_errors,
+                         job_id)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
@@ -468,7 +449,7 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
            # Step 2
            logger.debug('Terminating unhealthy cluster and reset cloud '
                         'region.')
-           terminate_cluster(self.cluster_name)
+           managed_job_utils.terminate_cluster(self.cluster_name)
 
            # Step 3
            logger.debug('Relaunch the cluster without constraining to prior '
@@ -478,16 +459,11 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
                                            raise_on_failure=False)
            if job_submitted_at is None:
                # Failed to launch the cluster.
-               if self.retry_until_up:
-                   gap_seconds = self.RETRY_INIT_GAP_SECONDS
-                   logger.info('Retrying to recover the cluster in '
-                               f'{gap_seconds:.1f} seconds.')
-                   time.sleep(gap_seconds)
-                   continue
-               with ux_utils.print_exception_no_traceback():
-                   raise exceptions.ResourcesUnavailableError(
-                       f'Failed to recover the cluster after retrying '
-                       f'{self._MAX_RETRY_CNT} times.')
+               gap_seconds = self.RETRY_INIT_GAP_SECONDS
+               logger.info('Retrying to recover the cluster in '
+                           f'{gap_seconds:.1f} seconds.')
+               time.sleep(gap_seconds)
+               continue
 
        return job_submitted_at
 
@@ -531,7 +507,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,
 
        # Step 1
        logger.debug('Terminating unhealthy cluster and reset cloud region.')
-       terminate_cluster(self.cluster_name)
+       managed_job_utils.terminate_cluster(self.cluster_name)
 
        # Step 2
        logger.debug('Relaunch the cluster skipping the previously launched '
@@ -566,15 +542,10 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,
                                            raise_on_failure=False)
            if job_submitted_at is None:
                # Failed to launch the cluster.
-               if self.retry_until_up:
-                   gap_seconds = self.RETRY_INIT_GAP_SECONDS
-                   logger.info('Retrying to recover the cluster in '
-                               f'{gap_seconds:.1f} seconds.')
-                   time.sleep(gap_seconds)
-                   continue
-               with ux_utils.print_exception_no_traceback():
-                   raise exceptions.ResourcesUnavailableError(
-                       f'Failed to recover the cluster after retrying '
-                       f'{self._MAX_RETRY_CNT} times.')
+               gap_seconds = self.RETRY_INIT_GAP_SECONDS
+               logger.info('Retrying to recover the cluster in '
+                           f'{gap_seconds:.1f} seconds.')
+               time.sleep(gap_seconds)
+               continue
 
        return job_submitted_at