skypilot-nightly 1.0.0.dev20250114__py3-none-any.whl → 1.0.0.dev20250124__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. sky/__init__.py +2 -2
  2. sky/backends/cloud_vm_ray_backend.py +50 -67
  3. sky/check.py +31 -1
  4. sky/cli.py +11 -34
  5. sky/clouds/kubernetes.py +3 -3
  6. sky/clouds/service_catalog/kubernetes_catalog.py +14 -0
  7. sky/core.py +8 -5
  8. sky/data/storage.py +66 -14
  9. sky/global_user_state.py +1 -1
  10. sky/jobs/constants.py +8 -7
  11. sky/jobs/controller.py +19 -22
  12. sky/jobs/core.py +0 -2
  13. sky/jobs/recovery_strategy.py +114 -143
  14. sky/jobs/scheduler.py +283 -0
  15. sky/jobs/state.py +263 -21
  16. sky/jobs/utils.py +338 -96
  17. sky/provision/aws/config.py +48 -26
  18. sky/provision/gcp/instance_utils.py +15 -9
  19. sky/provision/kubernetes/instance.py +1 -1
  20. sky/provision/kubernetes/utils.py +76 -18
  21. sky/resources.py +1 -1
  22. sky/serve/autoscalers.py +359 -301
  23. sky/serve/controller.py +10 -8
  24. sky/serve/core.py +84 -7
  25. sky/serve/load_balancer.py +27 -10
  26. sky/serve/replica_managers.py +1 -3
  27. sky/serve/serve_state.py +10 -5
  28. sky/serve/serve_utils.py +28 -1
  29. sky/serve/service.py +4 -3
  30. sky/serve/service_spec.py +31 -0
  31. sky/skylet/constants.py +1 -1
  32. sky/skylet/events.py +7 -3
  33. sky/skylet/job_lib.py +10 -30
  34. sky/skylet/log_lib.py +8 -8
  35. sky/skylet/log_lib.pyi +3 -0
  36. sky/skylet/skylet.py +1 -1
  37. sky/templates/jobs-controller.yaml.j2 +7 -3
  38. sky/templates/sky-serve-controller.yaml.j2 +4 -0
  39. sky/utils/db_utils.py +18 -4
  40. sky/utils/kubernetes/deploy_remote_cluster.sh +5 -5
  41. sky/utils/resources_utils.py +25 -21
  42. sky/utils/schemas.py +13 -0
  43. sky/utils/subprocess_utils.py +48 -9
  44. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/METADATA +4 -1
  45. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/RECORD +49 -48
  46. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/LICENSE +0 -0
  47. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/WHEEL +0 -0
  48. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/entry_points.txt +0 -0
  49. {skypilot_nightly-1.0.0.dev20250114.dist-info → skypilot_nightly-1.0.0.dev20250124.dist-info}/top_level.txt +0 -0
sky/jobs/controller.py CHANGED
@@ -16,6 +16,7 @@ from sky import status_lib
  from sky.backends import backend_utils
  from sky.backends import cloud_vm_ray_backend
  from sky.jobs import recovery_strategy
+ from sky.jobs import scheduler
  from sky.jobs import state as managed_job_state
  from sky.jobs import utils as managed_job_utils
  from sky.skylet import constants
@@ -46,12 +47,10 @@ def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
  class JobsController:
  """Each jobs controller manages the life cycle of one managed job."""

- def __init__(self, job_id: int, dag_yaml: str,
- retry_until_up: bool) -> None:
+ def __init__(self, job_id: int, dag_yaml: str) -> None:
  self._job_id = job_id
  self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
  logger.info(self._dag)
- self._retry_until_up = retry_until_up
  # TODO(zhwu): this assumes the specific backend.
  self._backend = cloud_vm_ray_backend.CloudVmRayBackend()

@@ -174,7 +173,7 @@ class JobsController:
  cluster_name = managed_job_utils.generate_managed_job_cluster_name(
  task.name, self._job_id)
  self._strategy_executor = recovery_strategy.StrategyExecutor.make(
- cluster_name, self._backend, task, self._retry_until_up)
+ cluster_name, self._backend, task, self._job_id)
  managed_job_state.set_submitted(
  self._job_id,
  task_id,
@@ -202,6 +201,7 @@ class JobsController:
  task_id=task_id,
  start_time=remote_job_submitted_at,
  callback_func=callback_func)
+
  while True:
  time.sleep(managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS)

@@ -243,7 +243,7 @@ class JobsController:
  self._download_log_and_stream(task_id, handle)
  # Only clean up the cluster, not the storages, because tasks may
  # share storages.
- recovery_strategy.terminate_cluster(cluster_name=cluster_name)
+ managed_job_utils.terminate_cluster(cluster_name=cluster_name)
  return True

  # For single-node jobs, non-terminated job_status indicates a
@@ -256,9 +256,7 @@ class JobsController:
  task.num_nodes == 1):
  continue

- if job_status in [
- job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
- ]:
+ if job_status in job_lib.JobStatus.user_code_failure_states():
  # Add a grace period before the check of preemption to avoid
  # false alarm for job failure.
  time.sleep(5)
@@ -288,9 +286,7 @@ class JobsController:
  if job_status is not None and not job_status.is_terminal():
  # The multi-node job is still running, continue monitoring.
  continue
- elif job_status in [
- job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP
- ]:
+ elif job_status in job_lib.JobStatus.user_code_failure_states():
  # The user code has probably crashed, fail immediately.
  end_time = managed_job_utils.get_job_timestamp(
  self._backend, cluster_name, get_end_time=True)
@@ -346,7 +342,7 @@ class JobsController:
  # those clusters again may fail.
  logger.info('Cleaning up the preempted or failed cluster'
  '...')
- recovery_strategy.terminate_cluster(cluster_name)
+ managed_job_utils.terminate_cluster(cluster_name)

  # Try to recover the managed jobs, when the cluster is preempted or
  # failed or the job status is failed to be fetched.
@@ -428,11 +424,11 @@ class JobsController:
  task=self._dag.tasks[task_id]))


- def _run_controller(job_id: int, dag_yaml: str, retry_until_up: bool):
+ def _run_controller(job_id: int, dag_yaml: str):
  """Runs the controller in a remote process for interruption."""
  # The controller needs to be instantiated in the remote process, since
  # the controller is not serializable.
- jobs_controller = JobsController(job_id, dag_yaml, retry_until_up)
+ jobs_controller = JobsController(job_id, dag_yaml)
  jobs_controller.run()


@@ -482,17 +478,18 @@ def _cleanup(job_id: int, dag_yaml: str):
  assert task.name is not None, task
  cluster_name = managed_job_utils.generate_managed_job_cluster_name(
  task.name, job_id)
- recovery_strategy.terminate_cluster(cluster_name)
+ managed_job_utils.terminate_cluster(cluster_name)
  # Clean up Storages with persistent=False.
  # TODO(zhwu): this assumes the specific backend.
  backend = cloud_vm_ray_backend.CloudVmRayBackend()
  backend.teardown_ephemeral_storage(task)


- def start(job_id, dag_yaml, retry_until_up):
+ def start(job_id, dag_yaml):
  """Start the controller."""
  controller_process = None
  cancelling = False
+ task_id = None
  try:
  _handle_signal(job_id)
  # TODO(suquark): In theory, we should make controller process a
@@ -502,8 +499,7 @@ def start(job_id, dag_yaml, retry_until_up):
  # So we can only enable daemon after we no longer need to
  # start daemon processes like Ray.
  controller_process = multiprocessing.Process(target=_run_controller,
- args=(job_id, dag_yaml,
- retry_until_up))
+ args=(job_id, dag_yaml))
  controller_process.start()
  while controller_process.is_alive():
  _handle_signal(job_id)
@@ -511,6 +507,7 @@ def start(job_id, dag_yaml, retry_until_up):
  except exceptions.ManagedJobUserCancelledError:
  dag, _ = _get_dag_and_name(dag_yaml)
  task_id, _ = managed_job_state.get_latest_task_id_status(job_id)
+ assert task_id is not None, job_id
  logger.info(
  f'Cancelling managed job, job_id: {job_id}, task_id: {task_id}')
  managed_job_state.set_cancelling(
@@ -542,6 +539,7 @@ def start(job_id, dag_yaml, retry_until_up):
  logger.info(f'Cluster of managed job {job_id} has been cleaned up.')

  if cancelling:
+ assert task_id is not None, job_id # Since it's set with cancelling
  managed_job_state.set_cancelled(
  job_id=job_id,
  callback_func=managed_job_utils.event_callback_func(
@@ -563,6 +561,8 @@ def start(job_id, dag_yaml, retry_until_up):
  failure_reason=('Unexpected error occurred. For details, '
  f'run: sky jobs logs --controller {job_id}'))

+ scheduler.job_done(job_id)
+

  if __name__ == '__main__':
  parser = argparse.ArgumentParser()
@@ -570,9 +570,6 @@ if __name__ == '__main__':
  required=True,
  type=int,
  help='Job id for the controller job.')
- parser.add_argument('--retry-until-up',
- action='store_true',
- help='Retry until the cluster is up.')
  parser.add_argument('dag_yaml',
  type=str,
  help='The path to the user job yaml file.')
@@ -580,4 +577,4 @@ if __name__ == '__main__':
  # We start process with 'spawn', because 'fork' could result in weird
  # behaviors; 'spawn' is also cross-platform.
  multiprocessing.set_start_method('spawn', force=True)
- start(args.job_id, args.dag_yaml, args.retry_until_up)
+ start(args.job_id, args.dag_yaml)
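
Note on the refactor above: the explicit membership checks against [job_lib.JobStatus.FAILED, job_lib.JobStatus.FAILED_SETUP] are replaced by a job_lib.JobStatus.user_code_failure_states() helper. The standalone sketch below illustrates the pattern only; the enum members other than FAILED and FAILED_SETUP, and the tuple return value, are assumptions rather than the actual sky.skylet.job_lib code.

import enum


class JobStatus(enum.Enum):
    # Illustrative subset; the real sky.skylet.job_lib.JobStatus has more states.
    RUNNING = 'RUNNING'
    SUCCEEDED = 'SUCCEEDED'
    FAILED = 'FAILED'
    FAILED_SETUP = 'FAILED_SETUP'

    @classmethod
    def user_code_failure_states(cls):
        """States meaning the user's code (or its setup) failed."""
        return (cls.FAILED, cls.FAILED_SETUP)


job_status = JobStatus.FAILED_SETUP
# Before: job_status in [JobStatus.FAILED, JobStatus.FAILED_SETUP]
# After, as in the controller hunks above:
assert job_status in JobStatus.user_code_failure_states()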
sky/jobs/core.py CHANGED
@@ -41,7 +41,6 @@ def launch(
  name: Optional[str] = None,
  stream_logs: bool = True,
  detach_run: bool = False,
- retry_until_up: bool = False,
  # TODO(cooperc): remove fast arg before 0.8.0
  fast: bool = True, # pylint: disable=unused-argument for compatibility
  ) -> None:
@@ -115,7 +114,6 @@ def launch(
  'jobs_controller': controller_name,
  # Note: actual cluster name will be <task.name>-<managed job ID>
  'dag_name': dag.name,
- 'retry_until_up': retry_until_up,
  'remote_user_config_path': remote_user_config_path,
  'modified_catalogs':
  service_catalog_common.get_modified_catalog_file_mounts(),
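
A minimal usage sketch of the API change above: jobs.launch() drops the retry_until_up parameter, so retrying until the cluster is up is always on and callers simply stop passing the flag. The entrypoint (sky.jobs.launch) and the Task construction below are assumptions based on this hunk, not verified against the rest of the release.

import sky
from sky import jobs

# Hypothetical example task, not part of this diff.
task = sky.Task(name='hello', run='echo hello')

# Before this release: jobs.launch(task, name='hello', retry_until_up=True)
# After this release the keyword is gone; passing it would raise a TypeError.
jobs.launch(task, name='hello')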
sky/jobs/recovery_strategy.py CHANGED
@@ -17,6 +17,7 @@ from sky import global_user_state
  from sky import sky_logging
  from sky import status_lib
  from sky.backends import backend_utils
+ from sky.jobs import scheduler
  from sky.jobs import utils as managed_job_utils
  from sky.skylet import job_lib
  from sky.usage import usage_lib
@@ -42,45 +43,20 @@ MAX_JOB_CHECKING_RETRY = 10
  _AUTODOWN_MINUTES = 5


- def terminate_cluster(cluster_name: str, max_retry: int = 3) -> None:
- """Terminate the cluster."""
- retry_cnt = 0
- while True:
- try:
- usage_lib.messages.usage.set_internal()
- sky.down(cluster_name)
- return
- except exceptions.ClusterDoesNotExist:
- # The cluster is already down.
- logger.debug(f'The cluster {cluster_name} is already down.')
- return
- except Exception as e: # pylint: disable=broad-except
- retry_cnt += 1
- if retry_cnt >= max_retry:
- raise RuntimeError(
- f'Failed to terminate the cluster {cluster_name}.') from e
- logger.error(
- f'Failed to terminate the cluster {cluster_name}. Retrying.'
- f'Details: {common_utils.format_exception(e)}')
- with ux_utils.enable_traceback():
- logger.error(f' Traceback: {traceback.format_exc()}')
-
-
  class StrategyExecutor:
  """Handle the launching, recovery and termination of managed job clusters"""

  RETRY_INIT_GAP_SECONDS = 60

  def __init__(self, cluster_name: str, backend: 'backends.Backend',
- task: 'task_lib.Task', retry_until_up: bool,
- max_restarts_on_errors: int) -> None:
+ task: 'task_lib.Task', max_restarts_on_errors: int,
+ job_id: int) -> None:
  """Initialize the strategy executor.

  Args:
  cluster_name: The name of the cluster.
  backend: The backend to use. Only CloudVMRayBackend is supported.
  task: The task to execute.
- retry_until_up: Whether to retry until the cluster is up.
  """
  assert isinstance(backend, backends.CloudVmRayBackend), (
  'Only CloudVMRayBackend is supported.')
@@ -88,8 +64,8 @@ class StrategyExecutor:
  self.dag.add(task)
  self.cluster_name = cluster_name
  self.backend = backend
- self.retry_until_up = retry_until_up
  self.max_restarts_on_errors = max_restarts_on_errors
+ self.job_id = job_id
  self.restart_cnt_on_failure = 0

  def __init_subclass__(cls, name: str, default: bool = False):
@@ -102,7 +78,7 @@ class StrategyExecutor:

  @classmethod
  def make(cls, cluster_name: str, backend: 'backends.Backend',
- task: 'task_lib.Task', retry_until_up: bool) -> 'StrategyExecutor':
+ task: 'task_lib.Task', job_id: int) -> 'StrategyExecutor':
  """Create a strategy from a task."""

  resource_list = list(task.resources)
@@ -127,8 +103,9 @@ class StrategyExecutor:
  job_recovery_name = job_recovery
  max_restarts_on_errors = 0
  return RECOVERY_STRATEGIES[job_recovery_name](cluster_name, backend,
- task, retry_until_up,
- max_restarts_on_errors)
+ task,
+ max_restarts_on_errors,
+ job_id)

  def launch(self) -> float:
  """Launch the cluster for the first time.
@@ -142,10 +119,7 @@ class StrategyExecutor:
  Raises: Please refer to the docstring of self._launch().
  """

- if self.retry_until_up:
- job_submit_at = self._launch(max_retry=None)
- else:
- job_submit_at = self._launch()
+ job_submit_at = self._launch(max_retry=None)
  assert job_submit_at is not None
  return job_submit_at

@@ -195,7 +169,7 @@ class StrategyExecutor:
  f'{common_utils.format_exception(e)}\n'
  'Terminating the cluster explicitly to ensure no '
  'remaining job process interferes with recovery.')
- terminate_cluster(self.cluster_name)
+ managed_job_utils.terminate_cluster(self.cluster_name)

  def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
  """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
@@ -304,89 +278,96 @@ class StrategyExecutor:
  backoff = common_utils.Backoff(self.RETRY_INIT_GAP_SECONDS)
  while True:
  retry_cnt += 1
- try:
- usage_lib.messages.usage.set_internal()
- # Detach setup, so that the setup failure can be detected
- # by the controller process (job_status -> FAILED_SETUP).
- sky.launch(
- self.dag,
- cluster_name=self.cluster_name,
- # We expect to tear down the cluster as soon as the job is
- # finished. However, in case the controller dies, set
- # autodown to try and avoid a resource leak.
- idle_minutes_to_autostop=_AUTODOWN_MINUTES,
- down=True,
- detach_setup=True,
- detach_run=True,
- _is_launched_by_jobs_controller=True)
- logger.info('Managed job cluster launched.')
- except (exceptions.InvalidClusterNameError,
- exceptions.NoCloudAccessError,
- exceptions.ResourcesMismatchError) as e:
- logger.error('Failure happened before provisioning. '
- f'{common_utils.format_exception(e)}')
- if raise_on_failure:
- raise exceptions.ProvisionPrechecksError(reasons=[e])
- return None
- except exceptions.ResourcesUnavailableError as e:
- # This is raised when the launch fails due to prechecks or
- # after failing over through all the candidates.
- # Please refer to the docstring of `sky.launch` for more
- # details of how the exception will be structured.
- if not any(
- isinstance(err, exceptions.ResourcesUnavailableError)
- for err in e.failover_history):
- # _launch() (this function) should fail/exit directly, if
- # none of the failover reasons were because of resource
- # unavailability or no failover was attempted (the optimizer
- # cannot find feasible resources for requested resources),
- # i.e., e.failover_history is empty.
- # Failing directly avoids the infinite loop of retrying
- # the launch when, e.g., an invalid cluster name is used
- # and --retry-until-up is specified.
- reasons = (e.failover_history
- if e.failover_history else [e])
- reasons_str = '; '.join(
- common_utils.format_exception(err) for err in reasons)
- logger.error(
- 'Failure happened before provisioning. Failover '
- f'reasons: {reasons_str}')
+ with scheduler.scheduled_launch(self.job_id):
+ try:
+ usage_lib.messages.usage.set_internal()
+ # Detach setup, so that the setup failure can be detected
+ # by the controller process (job_status -> FAILED_SETUP).
+ sky.launch(
+ self.dag,
+ cluster_name=self.cluster_name,
+ # We expect to tear down the cluster as soon as the job
+ # is finished. However, in case the controller dies, set
+ # autodown to try and avoid a resource leak.
+ idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+ down=True,
+ detach_setup=True,
+ detach_run=True,
+ _is_launched_by_jobs_controller=True)
+ logger.info('Managed job cluster launched.')
+ except (exceptions.InvalidClusterNameError,
+ exceptions.NoCloudAccessError,
+ exceptions.ResourcesMismatchError) as e:
+ logger.error('Failure happened before provisioning. '
+ f'{common_utils.format_exception(e)}')
  if raise_on_failure:
- raise exceptions.ProvisionPrechecksError(reasons)
- return None
- logger.info('Failed to launch a cluster with error: '
- f'{common_utils.format_exception(e)})')
- except Exception as e: # pylint: disable=broad-except
- # If the launch fails, it will be recovered by the following
- # code.
- logger.info('Failed to launch a cluster with error: '
- f'{common_utils.format_exception(e)})')
- with ux_utils.enable_traceback():
- logger.info(f' Traceback: {traceback.format_exc()}')
- else: # No exception, the launch succeeds.
- # At this point, a sky.launch() has succeeded. Cluster may be
- # UP (no preemption since) or DOWN (newly preempted).
- job_submitted_at = self._wait_until_job_starts_on_cluster()
- if job_submitted_at is not None:
- return job_submitted_at
- # The job fails to start on the cluster, retry the launch.
- # TODO(zhwu): log the unexpected error to usage collection
- # for future debugging.
- logger.info(
- 'Failed to successfully submit the job to the '
- 'launched cluster, due to unexpected submission errors or '
- 'the cluster being preempted during job submission.')
-
- terminate_cluster(self.cluster_name)
- if max_retry is not None and retry_cnt >= max_retry:
- # Retry forever if max_retry is None.
- if raise_on_failure:
- with ux_utils.print_exception_no_traceback():
- raise exceptions.ManagedJobReachedMaxRetriesError(
- 'Resources unavailable: failed to launch clusters '
- f'after {max_retry} retries.')
- else:
+ raise exceptions.ProvisionPrechecksError(reasons=[e])
  return None
+ except exceptions.ResourcesUnavailableError as e:
+ # This is raised when the launch fails due to prechecks or
+ # after failing over through all the candidates.
+ # Please refer to the docstring of `sky.launch` for more
+ # details of how the exception will be structured.
+ if not any(
+ isinstance(err,
+ exceptions.ResourcesUnavailableError)
+ for err in e.failover_history):
+ # _launch() (this function) should fail/exit directly,
+ # if none of the failover reasons were because of
+ # resource unavailability or no failover was attempted
+ # (the optimizer cannot find feasible resources for
+ # requested resources), i.e., e.failover_history is
+ # empty. Failing directly avoids the infinite loop of
+ # retrying the launch when, e.g., an invalid cluster
+ # name is used and --retry-until-up is specified.
+ reasons = (e.failover_history
+ if e.failover_history else [e])
+ reasons_str = '; '.join(
+ common_utils.format_exception(err)
+ for err in reasons)
+ logger.error(
+ 'Failure happened before provisioning. Failover '
+ f'reasons: {reasons_str}')
+ if raise_on_failure:
+ raise exceptions.ProvisionPrechecksError(reasons)
+ return None
+ logger.info('Failed to launch a cluster with error: '
+ f'{common_utils.format_exception(e)})')
+ except Exception as e: # pylint: disable=broad-except
+ # If the launch fails, it will be recovered by the following
+ # code.
+ logger.info('Failed to launch a cluster with error: '
+ f'{common_utils.format_exception(e)})')
+ with ux_utils.enable_traceback():
+ logger.info(f' Traceback: {traceback.format_exc()}')
+ else: # No exception, the launch succeeds.
+ # At this point, a sky.launch() has succeeded. Cluster may
+ # be UP (no preemption since) or DOWN (newly preempted).
+ job_submitted_at = self._wait_until_job_starts_on_cluster()
+ if job_submitted_at is not None:
+ return job_submitted_at
+ # The job fails to start on the cluster, retry the launch.
+ # TODO(zhwu): log the unexpected error to usage collection
+ # for future debugging.
+ logger.info(
+ 'Failed to successfully submit the job to the '
+ 'launched cluster, due to unexpected submission errors '
+ 'or the cluster being preempted during job submission.')
+
+ # If we get here, the launch did not succeed. Tear down the
+ # cluster and retry.
+ managed_job_utils.terminate_cluster(self.cluster_name)
+ if max_retry is not None and retry_cnt >= max_retry:
+ # Retry forever if max_retry is None.
+ if raise_on_failure:
+ with ux_utils.print_exception_no_traceback():
+ raise exceptions.ManagedJobReachedMaxRetriesError(
+ 'Resources unavailable: failed to launch '
+ f'clusters after {max_retry} retries.')
+ else:
+ return None
+ # Exit the scheduled_launch context so that the schedule state is
+ # ALIVE during the backoff. This allows other jobs to launch.
  gap_seconds = backoff.current_backoff()
  logger.info('Retrying to launch the cluster in '
  f'{gap_seconds:.1f} seconds.')
@@ -411,10 +392,10 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
  _MAX_RETRY_CNT = 240 # Retry for 4 hours.

  def __init__(self, cluster_name: str, backend: 'backends.Backend',
- task: 'task_lib.Task', retry_until_up: bool,
- max_restarts_on_errors: int) -> None:
- super().__init__(cluster_name, backend, task, retry_until_up,
- max_restarts_on_errors)
+ task: 'task_lib.Task', max_restarts_on_errors: int,
+ job_id: int) -> None:
+ super().__init__(cluster_name, backend, task, max_restarts_on_errors,
+ job_id)
  # Note down the cloud/region of the launched cluster, so that we can
  # first retry in the same cloud/region. (Inside recover() we may not
  # rely on cluster handle, as it can be None if the cluster is
@@ -468,7 +449,7 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
  # Step 2
  logger.debug('Terminating unhealthy cluster and reset cloud '
  'region.')
- terminate_cluster(self.cluster_name)
+ managed_job_utils.terminate_cluster(self.cluster_name)

  # Step 3
  logger.debug('Relaunch the cluster without constraining to prior '
@@ -478,16 +459,11 @@ class FailoverStrategyExecutor(StrategyExecutor, name='FAILOVER',
  raise_on_failure=False)
  if job_submitted_at is None:
  # Failed to launch the cluster.
- if self.retry_until_up:
- gap_seconds = self.RETRY_INIT_GAP_SECONDS
- logger.info('Retrying to recover the cluster in '
- f'{gap_seconds:.1f} seconds.')
- time.sleep(gap_seconds)
- continue
- with ux_utils.print_exception_no_traceback():
- raise exceptions.ResourcesUnavailableError(
- f'Failed to recover the cluster after retrying '
- f'{self._MAX_RETRY_CNT} times.')
+ gap_seconds = self.RETRY_INIT_GAP_SECONDS
+ logger.info('Retrying to recover the cluster in '
+ f'{gap_seconds:.1f} seconds.')
+ time.sleep(gap_seconds)
+ continue

  return job_submitted_at

@@ -531,7 +507,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,

  # Step 1
  logger.debug('Terminating unhealthy cluster and reset cloud region.')
- terminate_cluster(self.cluster_name)
+ managed_job_utils.terminate_cluster(self.cluster_name)

  # Step 2
  logger.debug('Relaunch the cluster skipping the previously launched '
@@ -566,15 +542,10 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor,
  raise_on_failure=False)
  if job_submitted_at is None:
  # Failed to launch the cluster.
- if self.retry_until_up:
- gap_seconds = self.RETRY_INIT_GAP_SECONDS
- logger.info('Retrying to recover the cluster in '
- f'{gap_seconds:.1f} seconds.')
- time.sleep(gap_seconds)
- continue
- with ux_utils.print_exception_no_traceback():
- raise exceptions.ResourcesUnavailableError(
- f'Failed to recover the cluster after retrying '
- f'{self._MAX_RETRY_CNT} times.')
+ gap_seconds = self.RETRY_INIT_GAP_SECONDS
+ logger.info('Retrying to recover the cluster in '
+ f'{gap_seconds:.1f} seconds.')
+ time.sleep(gap_seconds)
+ continue

  return job_submitted_at
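
The key behavioral change in this file is that every sky.launch() attempt is now wrapped in scheduler.scheduled_launch(self.job_id), and the retry backoff sleeps after that context exits so other jobs can launch in the meantime. The sketch below illustrates the context-manager pattern only, assuming a LAUNCHING/ALIVE schedule state; the state names and the in-memory dict are placeholders for whatever sky/jobs/scheduler.py actually stores, which is not shown in this diff.

import contextlib
import enum


class ScheduleState(enum.Enum):
    ALIVE = 'ALIVE'          # job is running but not holding a launch slot
    LAUNCHING = 'LAUNCHING'  # job currently holds a launch slot


# Stand-in for the scheduler's persistent job table.
_schedule_state = {}


@contextlib.contextmanager
def scheduled_launch(job_id):
    """Hold a launch slot for the duration of one sky.launch() attempt."""
    _schedule_state[job_id] = ScheduleState.LAUNCHING
    try:
        yield
    finally:
        # Release the slot even on failure, so a long retry backoff does not
        # block other managed jobs from launching.
        _schedule_state[job_id] = ScheduleState.ALIVE


def job_done(job_id):
    """Placeholder analogue of scheduler.job_done(): drop the job's slot."""
    _schedule_state.pop(job_id, None)


with scheduled_launch(job_id=1):
    pass  # sky.launch(...) would run here
assert _schedule_state[1] is ScheduleState.ALIVE
job_done(1)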