skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250801__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries, and is provided for informational purposes only.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (81)
  1. sky/__init__.py +2 -2
  2. sky/backends/backend_utils.py +4 -1
  3. sky/backends/cloud_vm_ray_backend.py +4 -3
  4. sky/catalog/__init__.py +3 -3
  5. sky/catalog/aws_catalog.py +12 -0
  6. sky/catalog/common.py +2 -2
  7. sky/catalog/data_fetchers/fetch_aws.py +13 -1
  8. sky/client/cli/command.py +452 -53
  9. sky/dashboard/out/404.html +1 -1
  10. sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-42cd1b19a6b01078.js} +1 -1
  11. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  12. sky/dashboard/out/clusters/[cluster].html +1 -1
  13. sky/dashboard/out/clusters.html +1 -1
  14. sky/dashboard/out/config.html +1 -1
  15. sky/dashboard/out/index.html +1 -1
  16. sky/dashboard/out/infra/[context].html +1 -1
  17. sky/dashboard/out/infra.html +1 -1
  18. sky/dashboard/out/jobs/[job].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/dashboard/out/users.html +1 -1
  21. sky/dashboard/out/volumes.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/data/data_utils.py +21 -1
  26. sky/data/storage.py +12 -0
  27. sky/jobs/__init__.py +3 -0
  28. sky/jobs/client/sdk.py +80 -3
  29. sky/jobs/controller.py +76 -25
  30. sky/jobs/recovery_strategy.py +80 -34
  31. sky/jobs/scheduler.py +68 -20
  32. sky/jobs/server/core.py +228 -136
  33. sky/jobs/server/server.py +40 -0
  34. sky/jobs/state.py +129 -24
  35. sky/jobs/utils.py +109 -51
  36. sky/provision/nebius/constants.py +3 -0
  37. sky/provision/runpod/utils.py +27 -12
  38. sky/py.typed +0 -0
  39. sky/resources.py +16 -12
  40. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  41. sky/serve/autoscalers.py +8 -0
  42. sky/serve/client/impl.py +188 -0
  43. sky/serve/client/sdk.py +12 -82
  44. sky/serve/constants.py +5 -1
  45. sky/serve/controller.py +5 -0
  46. sky/serve/replica_managers.py +112 -37
  47. sky/serve/serve_state.py +16 -6
  48. sky/serve/serve_utils.py +274 -77
  49. sky/serve/server/core.py +8 -525
  50. sky/serve/server/impl.py +709 -0
  51. sky/serve/service.py +13 -9
  52. sky/serve/service_spec.py +74 -4
  53. sky/server/constants.py +1 -1
  54. sky/server/daemons.py +164 -0
  55. sky/server/requests/payloads.py +33 -0
  56. sky/server/requests/requests.py +2 -107
  57. sky/server/requests/serializers/decoders.py +12 -3
  58. sky/server/requests/serializers/encoders.py +13 -2
  59. sky/server/server.py +2 -1
  60. sky/server/uvicorn.py +2 -1
  61. sky/sky_logging.py +30 -0
  62. sky/skylet/constants.py +2 -1
  63. sky/skylet/events.py +9 -0
  64. sky/skypilot_config.py +24 -21
  65. sky/task.py +41 -11
  66. sky/templates/jobs-controller.yaml.j2 +3 -0
  67. sky/templates/sky-serve-controller.yaml.j2 +18 -2
  68. sky/users/server.py +1 -1
  69. sky/utils/command_runner.py +4 -2
  70. sky/utils/controller_utils.py +14 -10
  71. sky/utils/dag_utils.py +4 -2
  72. sky/utils/db/migration_utils.py +2 -4
  73. sky/utils/schemas.py +47 -19
  74. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/METADATA +1 -1
  75. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/RECORD +81 -76
  76. /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_buildManifest.js +0 -0
  77. /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_ssgManifest.js +0 -0
  78. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/WHEEL +0 -0
  79. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/entry_points.txt +0 -0
  80. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/licenses/LICENSE +0 -0
  81. {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/top_level.txt +0 -0
sky/jobs/controller.py CHANGED
@@ -30,6 +30,7 @@ from sky.jobs import recovery_strategy
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
+from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -60,12 +61,13 @@ def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
 class JobsController:
     """Each jobs controller manages the life cycle of one managed job."""

-    def __init__(self, job_id: int, dag_yaml: str) -> None:
+    def __init__(self, job_id: int, dag_yaml: str, pool: Optional[str]) -> None:
         self._job_id = job_id
         self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
         logger.info(self._dag)
         # TODO(zhwu): this assumes the specific backend.
         self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
+        self._pool = pool

         # pylint: disable=line-too-long
         # Add a unique identifier to the task environment variables, so that
@@ -99,8 +101,10 @@ class JobsController:
         task.update_envs(task_envs)

     def _download_log_and_stream(
-        self, task_id: Optional[int],
-        handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle]
+        self,
+        task_id: Optional[int],
+        handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle],
+        job_id_on_pool_cluster: Optional[int],
     ) -> None:
         """Downloads and streams the logs of the current job with given task ID.

@@ -113,9 +117,14 @@ class JobsController:
                         'Skipping downloading and streaming the logs.')
             return
         managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
-                                            'managed_jobs')
-        log_file = controller_utils.download_and_stream_latest_job_log(
-            self._backend, handle, managed_job_logs_dir)
+                                            'managed_jobs',
+                                            f'job-id-{self._job_id}')
+        log_file = controller_utils.download_and_stream_job_log(
+            self._backend,
+            handle,
+            managed_job_logs_dir,
+            job_ids=[str(job_id_on_pool_cluster)]
+            if job_id_on_pool_cluster is not None else None)
         if log_file is not None:
             # Set the path of the log file for the current task, so it can be
             # accessed even after the job is finished
@@ -123,6 +132,12 @@ class JobsController:
                                                 log_file)
         logger.info(f'\n== End of logs (ID: {self._job_id}) ==')

+    def _cleanup_cluster(self, cluster_name: Optional[str]) -> None:
+        if cluster_name is None:
+            return
+        if self._pool is None:
+            managed_job_utils.terminate_cluster(cluster_name)
+
     def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
         """Busy loop monitoring cluster status and handling recovery.

@@ -193,10 +208,14 @@ class JobsController:
         usage_lib.messages.usage.update_task_id(task_id)
         task_id_env_var = task.envs[constants.TASK_ID_ENV_VAR]
         assert task.name is not None, task
+        # Set the cluster name to None if the job is submitted
+        # to a pool. This will be updated when we later calls the `launch`
+        # or `recover` function from the strategy executor.
         cluster_name = managed_job_utils.generate_managed_job_cluster_name(
-            task.name, self._job_id)
+            task.name, self._job_id) if self._pool is None else None
         self._strategy_executor = recovery_strategy.StrategyExecutor.make(
-            cluster_name, self._backend, task, self._job_id, task_id)
+            cluster_name, self._backend, task, self._job_id, task_id,
+            self._pool)
         if not is_resume:
             submitted_at = time.time()
             if task_id == 0:
@@ -226,6 +245,13 @@ class JobsController:
         if not is_resume:
             remote_job_submitted_at = self._strategy_executor.launch()
             assert remote_job_submitted_at is not None, remote_job_submitted_at
+        if self._pool is None:
+            job_id_on_pool_cluster = None
+        else:
+            # Update the cluster name when using cluster pool.
+            cluster_name, job_id_on_pool_cluster = (
+                managed_job_state.get_pool_submit_info(self._job_id))
+            assert cluster_name is not None, (cluster_name, job_id_on_pool_cluster)

         if not is_resume:
             managed_job_state.set_started(job_id=self._job_id,
@@ -279,7 +305,9 @@ class JobsController:
             if not force_transit_to_recovering:
                 try:
                     job_status = managed_job_utils.get_job_status(
-                        self._backend, cluster_name)
+                        self._backend,
+                        cluster_name,
+                        job_id=job_id_on_pool_cluster)
                 except exceptions.FetchClusterInfoError as fetch_e:
                     logger.info(
                         'Failed to fetch the job status. Start recovery.\n'
@@ -288,7 +316,7 @@

             if job_status == job_lib.JobStatus.SUCCEEDED:
                 success_end_time = managed_job_utils.try_to_get_job_end_time(
-                    self._backend, cluster_name)
+                    self._backend, cluster_name, job_id_on_pool_cluster)
                 # The job is done. Set the job to SUCCEEDED first before start
                 # downloading and streaming the logs to make it more responsive.
                 managed_job_state.set_succeeded(self._job_id,
@@ -299,6 +327,8 @@
                     f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
                     f'Cleaning up the cluster {cluster_name}.')
                 try:
+                    logger.info(f'Downloading logs on cluster {cluster_name} '
+                                f'and job id {job_id_on_pool_cluster}.')
                     clusters = backend_utils.get_clusters(
                         cluster_names=[cluster_name],
                         refresh=common.StatusRefreshMode.NONE,
@@ -307,7 +337,8 @@
                     assert len(clusters) == 1, (clusters, cluster_name)
                     handle = clusters[0].get('handle')
                     # Best effort to download and stream the logs.
-                    self._download_log_and_stream(task_id, handle)
+                    self._download_log_and_stream(task_id, handle,
+                                                  job_id_on_pool_cluster)
                 except Exception as e:  # pylint: disable=broad-except
                     # We don't want to crash here, so just log and continue.
                     logger.warning(
@@ -316,7 +347,7 @@
                         exc_info=True)
                 # Only clean up the cluster, not the storages, because tasks may
                 # share storages.
-                managed_job_utils.terminate_cluster(cluster_name=cluster_name)
+                self._cleanup_cluster(cluster_name)
                 return True

             # For single-node jobs, non-terminated job_status indicates a
@@ -364,13 +395,14 @@
                     job_status == job_lib.JobStatus.FAILED_DRIVER):
                 # The user code has probably crashed, fail immediately.
                 end_time = managed_job_utils.try_to_get_job_end_time(
-                    self._backend, cluster_name)
+                    self._backend, cluster_name, job_id_on_pool_cluster)
                 logger.info(
                     f'The user job failed ({job_status}). Please check the '
                     'logs below.\n'
                     f'== Logs of the user job (ID: {self._job_id}) ==\n')

-                self._download_log_and_stream(task_id, handle)
+                self._download_log_and_stream(task_id, handle,
+                                              job_id_on_pool_cluster)

                 failure_reason = (
                     'To see the details, run: '
@@ -457,7 +489,7 @@
             # those clusters again may fail.
             logger.info('Cleaning up the preempted or failed cluster'
                         '...')
-            managed_job_utils.terminate_cluster(cluster_name)
+            self._cleanup_cluster(cluster_name)

             # Try to recover the managed jobs, when the cluster is preempted or
             # failed or the job status is failed to be fetched.
@@ -467,6 +499,10 @@
                 force_transit_to_recovering=force_transit_to_recovering,
                 callback_func=callback_func)
             recovered_time = self._strategy_executor.recover()
+            if self._pool is not None:
+                cluster_name, job_id_on_pool_cluster = (
+                    managed_job_state.get_pool_submit_info(self._job_id))
+                assert cluster_name is not None
             managed_job_state.set_recovered(self._job_id,
                                             task_id,
                                             recovered_time=recovered_time,
@@ -541,11 +577,11 @@
                     task=self._dag.tasks[task_id]))


-def _run_controller(job_id: int, dag_yaml: str):
+def _run_controller(job_id: int, dag_yaml: str, pool: Optional[str]):
     """Runs the controller in a remote process for interruption."""
     # The controller needs to be instantiated in the remote process, since
     # the controller is not serializable.
-    jobs_controller = JobsController(job_id, dag_yaml)
+    jobs_controller = JobsController(job_id, dag_yaml, pool)
     jobs_controller.run()


@@ -577,7 +613,7 @@ def _handle_signal(job_id):
                 f'User sent {user_signal.value} signal.')


-def _cleanup(job_id: int, dag_yaml: str):
+def _cleanup(job_id: int, dag_yaml: str, pool: Optional[str]):
     """Clean up the cluster(s) and storages.

     (1) Clean up the succeeded task(s)' ephemeral storage. The storage has
@@ -595,9 +631,18 @@ def _cleanup(job_id: int, dag_yaml: str):
     dag, _ = _get_dag_and_name(dag_yaml)
     for task in dag.tasks:
         assert task.name is not None, task
-        cluster_name = managed_job_utils.generate_managed_job_cluster_name(
-            task.name, job_id)
-        managed_job_utils.terminate_cluster(cluster_name)
+        if pool is None:
+            cluster_name = managed_job_utils.generate_managed_job_cluster_name(
+                task.name, job_id)
+            managed_job_utils.terminate_cluster(cluster_name)
+        else:
+            cluster_name, job_id_on_pool_cluster = (
+                managed_job_state.get_pool_submit_info(job_id))
+            if cluster_name is not None:
+                if job_id_on_pool_cluster is not None:
+                    core.cancel(cluster_name=cluster_name,
+                                job_ids=[job_id_on_pool_cluster],
+                                _try_cancel_if_cluster_is_init=True)

     # Clean up Storages with persistent=False.
     # TODO(zhwu): this assumes the specific backend.
@@ -629,7 +674,7 @@ def _cleanup(job_id: int, dag_yaml: str):
                 f'Failed to clean up file mount {file_mount}: {e}')


-def start(job_id, dag_yaml):
+def start(job_id, dag_yaml, pool):
     """Start the controller."""
     controller_process = None
     cancelling = False
@@ -643,7 +688,8 @@ def start(job_id, dag_yaml):
         # So we can only enable daemon after we no longer need to
         # start daemon processes like Ray.
         controller_process = multiprocessing.Process(target=_run_controller,
-                                                     args=(job_id, dag_yaml))
+                                                     args=(job_id, dag_yaml,
+                                                           pool))
         controller_process.start()
         while controller_process.is_alive():
             _handle_signal(job_id)
@@ -679,7 +725,7 @@ def start(job_id, dag_yaml):
         # https://unix.stackexchange.com/questions/356408/strange-problem-with-trap-and-sigint
         # But anyway, a clean solution is killing the controller process
         # directly, and then cleanup the cluster job_state.
-        _cleanup(job_id, dag_yaml=dag_yaml)
+        _cleanup(job_id, dag_yaml=dag_yaml, pool=pool)
         logger.info(f'Cluster of managed job {job_id} has been cleaned up.')

         if cancelling:
@@ -717,8 +763,13 @@ if __name__ == '__main__':
     parser.add_argument('dag_yaml',
                         type=str,
                         help='The path to the user job yaml file.')
+    parser.add_argument('--pool',
+                        required=False,
+                        default=None,
+                        type=str,
+                        help='The pool to use for the controller job.')
     args = parser.parse_args()
     # We start process with 'spawn', because 'fork' could result in weird
     # behaviors; 'spawn' is also cross-platform.
     multiprocessing.set_start_method('spawn', force=True)
-    start(args.job_id, args.dag_yaml)
+    start(args.job_id, args.dag_yaml, args.pool)
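
Note on the controller changes above: they all follow one rule, namely that the controller tears a cluster down only when it launched that cluster itself, while pool workers are left running for reuse. A minimal, self-contained sketch of that decision follows; `terminate_cluster` is passed in as a plain callable standing in for SkyPilot's `managed_job_utils.terminate_cluster`, and the function name `cleanup_cluster` is illustrative, not part of the package.

from typing import Callable, Optional


def cleanup_cluster(cluster_name: Optional[str], pool: Optional[str],
                    terminate_cluster: Callable[[str], None]) -> None:
    """Terminate the job's cluster unless it belongs to a pool (sketch)."""
    if cluster_name is None:
        # Pool jobs start with no cluster assigned; nothing to clean up yet.
        return
    if pool is None:
        # Dedicated cluster: the controller owns it, so tear it down.
        terminate_cluster(cluster_name)
    # Otherwise the cluster is a pool worker and stays up for other jobs.


if __name__ == '__main__':
    removed = []
    cleanup_cluster('job-cluster-1', pool=None, terminate_cluster=removed.append)
    cleanup_cluster('pool-worker-0', pool='my-pool', terminate_cluster=removed.append)
    print(removed)  # ['job-cluster-1'] -- only the dedicated cluster is torn down
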
sky/jobs/recovery_strategy.py CHANGED
@@ -20,6 +20,7 @@ from sky.backends import backend_utils
 from sky.jobs import scheduler
 from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
+from sky.serve import serve_utils
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
@@ -48,9 +49,9 @@ class StrategyExecutor:

     RETRY_INIT_GAP_SECONDS = 60

-    def __init__(self, cluster_name: str, backend: 'backends.Backend',
+    def __init__(self, cluster_name: Optional[str], backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int, task_id: int) -> None:
+                 job_id: int, task_id: int, pool: Optional[str]) -> None:
         """Initialize the strategy executor.

         Args:
@@ -62,17 +63,23 @@ class StrategyExecutor:
                 'Only CloudVMRayBackend is supported.')
         self.dag = sky.Dag()
         self.dag.add(task)
+        # For jobs submitted to a pool, the cluster name might change after each
+        # recovery. Initially this is set to an empty string to indicate that no
+        # cluster is assigned yet, and in `_launch`, it will be set to one of
+        # the cluster names in the pool.
         self.cluster_name = cluster_name
         self.backend = backend
         self.max_restarts_on_errors = max_restarts_on_errors
         self.job_id = job_id
         self.task_id = task_id
+        self.pool = pool
         self.restart_cnt_on_failure = 0
+        self.job_id_on_pool_cluster: Optional[int] = None

     @classmethod
-    def make(cls, cluster_name: str, backend: 'backends.Backend',
-             task: 'task_lib.Task', job_id: int,
-             task_id: int) -> 'StrategyExecutor':
+    def make(cls, cluster_name: Optional[str], backend: 'backends.Backend',
+             task: 'task_lib.Task', job_id: int, task_id: int,
+             pool: Optional[str]) -> 'StrategyExecutor':
         """Create a strategy from a task."""

         resource_list = list(task.resources)
@@ -103,7 +110,8 @@ class StrategyExecutor:
                                  from_str(job_recovery_name))
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
-                                     max_restarts_on_errors, job_id, task_id)
+                                     max_restarts_on_errors, job_id, task_id,
+                                     pool)

     def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -131,12 +139,14 @@ class StrategyExecutor:
         """
         raise NotImplementedError

-    def _try_cancel_all_jobs(self):
+    def _try_cancel_jobs(self):
         from sky import core  # pylint: disable=import-outside-toplevel

+        if self.cluster_name is None:
+            return
         handle = global_user_state.get_handle_from_cluster_name(
             self.cluster_name)
-        if handle is None:
+        if handle is None or self.pool is not None:
             return
         try:
             usage_lib.messages.usage.set_internal()
@@ -159,8 +169,13 @@ class StrategyExecutor:
             # should be functional with the `_try_cancel_if_cluster_is_init`
             # flag, i.e. it sends the cancel signal to the head node, which will
             # then kill the user process on remaining worker nodes.
+            # Only cancel the corresponding job for worker pool.
+            if self.pool is None:
+                kwargs = dict(all=True)
+            else:
+                kwargs = dict(job_ids=[self.job_id_on_pool_cluster])
             core.cancel(cluster_name=self.cluster_name,
-                        all=True,
+                        **kwargs,
                         _try_cancel_if_cluster_is_init=True)
         except Exception as e:  # pylint: disable=broad-except
             logger.info('Failed to cancel the job on the cluster. The cluster '
@@ -169,7 +184,7 @@ class StrategyExecutor:
                         f'{common_utils.format_exception(e)}\n'
                         'Terminating the cluster explicitly to ensure no '
                         'remaining job process interferes with recovery.')
-            managed_job_utils.terminate_cluster(self.cluster_name)
+            self._cleanup_cluster()

     def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
         """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
@@ -178,6 +193,7 @@ class StrategyExecutor:
             The timestamp of when the job is submitted, or None if failed to
             submit.
         """
+        assert self.cluster_name is not None
         status = None
         job_checking_retry_cnt = 0
         while job_checking_retry_cnt < MAX_JOB_CHECKING_RETRY:
@@ -208,7 +224,9 @@

             try:
                 status = managed_job_utils.get_job_status(
-                    self.backend, self.cluster_name)
+                    self.backend,
+                    self.cluster_name,
+                    job_id=self.job_id_on_pool_cluster)
             except Exception as e:  # pylint: disable=broad-except
                 # If any unexpected error happens, retry the job checking
                 # loop.
@@ -224,7 +242,10 @@
             if status is not None and status > job_lib.JobStatus.INIT:
                 try:
                     job_submitted_at = managed_job_utils.get_job_timestamp(
-                        self.backend, self.cluster_name, get_end_time=False)
+                        self.backend,
+                        self.cluster_name,
+                        self.job_id_on_pool_cluster,
+                        get_end_time=False)
                     return job_submitted_at
                 except Exception as e:  # pylint: disable=broad-except
                     # If we failed to get the job timestamp, we will retry
@@ -236,6 +257,12 @@
             time.sleep(managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
         return None

+    def _cleanup_cluster(self) -> None:
+        if self.cluster_name is None:
+            return
+        if self.pool is None:
+            managed_job_utils.terminate_cluster(self.cluster_name)
+
     def _launch(self,
                 max_retry: Optional[int] = 3,
                 raise_on_failure: bool = True,
@@ -290,19 +317,35 @@
                                                  recovery)
             try:
                 usage_lib.messages.usage.set_internal()
-                # Detach setup, so that the setup failure can be
-                # detected by the controller process (job_status ->
-                # FAILED_SETUP).
-                execution.launch(
-                    self.dag,
-                    cluster_name=self.cluster_name,
-                    # We expect to tear down the cluster as soon as the
-                    # job is finished. However, in case the controller
-                    # dies, set autodown to try and avoid a resource
-                    # leak.
-                    idle_minutes_to_autostop=_AUTODOWN_MINUTES,
-                    down=True,
-                    _is_launched_by_jobs_controller=True)
+                if self.pool is None:
+                    assert self.cluster_name is not None
+                    # Detach setup, so that the setup failure can be
+                    # detected by the controller process (job_status ->
+                    # FAILED_SETUP).
+                    execution.launch(
+                        self.dag,
+                        cluster_name=self.cluster_name,
+                        # We expect to tear down the cluster as soon as
+                        # the job is finished. However, in case the
+                        # controller dies, set autodown to try and avoid
+                        # a resource leak.
+                        idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                        down=True,
+                        _is_launched_by_jobs_controller=True)
+                else:
+                    self.cluster_name = (
+                        serve_utils.get_next_cluster_name(
+                            self.pool, self.job_id))
+                    if self.cluster_name is None:
+                        raise exceptions.NoClusterLaunchedError(
+                            'No cluster name found in the pool.')
+                    job_id_on_pool_cluster, _ = execution.exec(
+                        self.dag, cluster_name=self.cluster_name)
+                    assert job_id_on_pool_cluster is not None, (
+                        self.cluster_name, self.job_id)
+                    self.job_id_on_pool_cluster = job_id_on_pool_cluster
+                    state.set_job_id_on_pool_cluster(
+                        self.job_id, job_id_on_pool_cluster)
                 logger.info('Managed job cluster launched.')
             except (exceptions.InvalidClusterNameError,
                     exceptions.NoCloudAccessError,
@@ -373,7 +416,7 @@

             # If we get here, the launch did not succeed. Tear down the
             # cluster and retry.
-            managed_job_utils.terminate_cluster(self.cluster_name)
+            self._cleanup_cluster()
             if max_retry is not None and retry_cnt >= max_retry:
                 # Retry forever if max_retry is None.
                 if raise_on_failure:
@@ -398,7 +441,10 @@
             # Update the status to PENDING during backoff.
             state.set_backoff_pending(self.job_id, self.task_id)
             # Calculate the backoff time and sleep.
-            gap_seconds = backoff.current_backoff()
+            # We retry immediately for worker pool, since no sky.launch()
+            # is called and the overhead is minimal.
+            gap_seconds = (backoff.current_backoff()
+                           if self.pool is None else 0)
             logger.info('Retrying to launch the cluster in '
                         f'{gap_seconds:.1f} seconds.')
             time.sleep(gap_seconds)
@@ -427,11 +473,11 @@ class FailoverStrategyExecutor(StrategyExecutor):

     _MAX_RETRY_CNT = 240  # Retry for 4 hours.

-    def __init__(self, cluster_name: str, backend: 'backends.Backend',
+    def __init__(self, cluster_name: Optional[str], backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int, task_id: int) -> None:
+                 job_id: int, task_id: int, pool: Optional[str]) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id, task_id)
+                         job_id, task_id, pool)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
@@ -444,7 +490,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
                 recovery: bool = False) -> Optional[float]:
         job_submitted_at = super()._launch(max_retry, raise_on_failure,
                                            recovery)
-        if job_submitted_at is not None:
+        if job_submitted_at is not None and self.cluster_name is not None:
             # Only record the cloud/region if the launch is successful.
             handle = global_user_state.get_handle_from_cluster_name(
                 self.cluster_name)
@@ -464,7 +510,7 @@
         # original user specification.

         # Step 1
-        self._try_cancel_all_jobs()
+        self._try_cancel_jobs()

         while True:
             # Add region constraint to the task, to retry on the same region
@@ -488,7 +534,7 @@
             # Step 2
             logger.debug('Terminating unhealthy cluster and reset cloud '
                          'region.')
-            managed_job_utils.terminate_cluster(self.cluster_name)
+            self._cleanup_cluster()

             # Step 3
             logger.debug('Relaunch the cluster without constraining to prior '
@@ -547,7 +593,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):

         # Step 1
         logger.debug('Terminating unhealthy cluster and reset cloud region.')
-        managed_job_utils.terminate_cluster(self.cluster_name)
+        self._cleanup_cluster()

         # Step 2
         logger.debug('Relaunch the cluster skipping the previously launched '
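
Note on the `_launch` branch above: without a pool, a dedicated cluster is launched; with a pool, an existing worker is picked and the per-cluster job id is recorded so later status, log, and cancel calls can target it. Below is a condensed, hedged sketch of that control flow only. The callables are stand-ins for `execution.launch`, `execution.exec`, and `serve_utils.get_next_cluster_name`; the helper names and bodies here are hypothetical, not the package's implementations.

from typing import Callable, Optional, Tuple


class NoClusterLaunchedError(Exception):
    """Stand-in for the NoClusterLaunchedError raised in the diff."""


def submit_task(
    pool: Optional[str],
    cluster_name: Optional[str],
    launch: Callable[[str], None],  # stand-in for execution.launch
    exec_on: Callable[[str], int],  # stand-in for execution.exec
    next_pool_cluster: Callable[[str], Optional[str]],  # stand-in for get_next_cluster_name
) -> Tuple[str, Optional[int]]:
    """Return (cluster_name, job_id_on_pool_cluster) after submission."""
    if pool is None:
        # Dedicated-cluster path: provision a fresh cluster for this job.
        assert cluster_name is not None
        launch(cluster_name)
        return cluster_name, None
    # Pool path: reuse an existing worker and submit the task onto it,
    # recording the job id assigned on that cluster.
    cluster_name = next_pool_cluster(pool)
    if cluster_name is None:
        raise NoClusterLaunchedError('No cluster name found in the pool.')
    job_id_on_cluster = exec_on(cluster_name)
    return cluster_name, job_id_on_cluster


if __name__ == '__main__':
    # Dry run with stub callables.
    name, job_id = submit_task(pool='my-pool',
                               cluster_name=None,
                               launch=lambda c: None,
                               exec_on=lambda c: 42,
                               next_pool_cluster=lambda p: f'{p}-worker-0')
    print(name, job_id)  # my-pool-worker-0 42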