skypilot-nightly 1.0.0.dev20250730__py3-none-any.whl → 1.0.0.dev20250801__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly has been flagged as potentially problematic.
- sky/__init__.py +2 -2
- sky/backends/backend_utils.py +4 -1
- sky/backends/cloud_vm_ray_backend.py +4 -3
- sky/catalog/__init__.py +3 -3
- sky/catalog/aws_catalog.py +12 -0
- sky/catalog/common.py +2 -2
- sky/catalog/data_fetchers/fetch_aws.py +13 -1
- sky/client/cli/command.py +452 -53
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-5adfc4d4b3db6f71.js → webpack-42cd1b19a6b01078.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +21 -1
- sky/data/storage.py +12 -0
- sky/jobs/__init__.py +3 -0
- sky/jobs/client/sdk.py +80 -3
- sky/jobs/controller.py +76 -25
- sky/jobs/recovery_strategy.py +80 -34
- sky/jobs/scheduler.py +68 -20
- sky/jobs/server/core.py +228 -136
- sky/jobs/server/server.py +40 -0
- sky/jobs/state.py +129 -24
- sky/jobs/utils.py +109 -51
- sky/provision/nebius/constants.py +3 -0
- sky/provision/runpod/utils.py +27 -12
- sky/py.typed +0 -0
- sky/resources.py +16 -12
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/serve/autoscalers.py +8 -0
- sky/serve/client/impl.py +188 -0
- sky/serve/client/sdk.py +12 -82
- sky/serve/constants.py +5 -1
- sky/serve/controller.py +5 -0
- sky/serve/replica_managers.py +112 -37
- sky/serve/serve_state.py +16 -6
- sky/serve/serve_utils.py +274 -77
- sky/serve/server/core.py +8 -525
- sky/serve/server/impl.py +709 -0
- sky/serve/service.py +13 -9
- sky/serve/service_spec.py +74 -4
- sky/server/constants.py +1 -1
- sky/server/daemons.py +164 -0
- sky/server/requests/payloads.py +33 -0
- sky/server/requests/requests.py +2 -107
- sky/server/requests/serializers/decoders.py +12 -3
- sky/server/requests/serializers/encoders.py +13 -2
- sky/server/server.py +2 -1
- sky/server/uvicorn.py +2 -1
- sky/sky_logging.py +30 -0
- sky/skylet/constants.py +2 -1
- sky/skylet/events.py +9 -0
- sky/skypilot_config.py +24 -21
- sky/task.py +41 -11
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/sky-serve-controller.yaml.j2 +18 -2
- sky/users/server.py +1 -1
- sky/utils/command_runner.py +4 -2
- sky/utils/controller_utils.py +14 -10
- sky/utils/dag_utils.py +4 -2
- sky/utils/db/migration_utils.py +2 -4
- sky/utils/schemas.py +47 -19
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/METADATA +1 -1
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/RECORD +81 -76
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{_r2LwCFLjlWjZDUIJQG_V → f2fEsZwJxryJVOYRNtNKE}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250730.dist-info → skypilot_nightly-1.0.0.dev20250801.dist-info}/top_level.txt +0 -0
sky/jobs/controller.py
CHANGED
@@ -30,6 +30,7 @@ from sky.jobs import recovery_strategy
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
 from sky.jobs import utils as managed_job_utils
+from sky.serve import serve_utils
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
@@ -60,12 +61,13 @@ def _get_dag_and_name(dag_yaml: str) -> Tuple['sky.Dag', str]:
 class JobsController:
     """Each jobs controller manages the life cycle of one managed job."""

-    def __init__(self, job_id: int, dag_yaml: str) -> None:
+    def __init__(self, job_id: int, dag_yaml: str, pool: Optional[str]) -> None:
         self._job_id = job_id
         self._dag, self._dag_name = _get_dag_and_name(dag_yaml)
         logger.info(self._dag)
         # TODO(zhwu): this assumes the specific backend.
         self._backend = cloud_vm_ray_backend.CloudVmRayBackend()
+        self._pool = pool

         # pylint: disable=line-too-long
         # Add a unique identifier to the task environment variables, so that
@@ -99,8 +101,10 @@ class JobsController:
         task.update_envs(task_envs)

     def _download_log_and_stream(
-        self,
-
+        self,
+        task_id: Optional[int],
+        handle: Optional[cloud_vm_ray_backend.CloudVmRayResourceHandle],
+        job_id_on_pool_cluster: Optional[int],
     ) -> None:
         """Downloads and streams the logs of the current job with given task ID.

@@ -113,9 +117,14 @@ class JobsController:
                 'Skipping downloading and streaming the logs.')
             return
         managed_job_logs_dir = os.path.join(constants.SKY_LOGS_DIRECTORY,
-                                            'managed_jobs'
-
-
+                                            'managed_jobs',
+                                            f'job-id-{self._job_id}')
+        log_file = controller_utils.download_and_stream_job_log(
+            self._backend,
+            handle,
+            managed_job_logs_dir,
+            job_ids=[str(job_id_on_pool_cluster)]
+            if job_id_on_pool_cluster is not None else None)
         if log_file is not None:
             # Set the path of the log file for the current task, so it can be
             # accessed even after the job is finished
@@ -123,6 +132,12 @@ class JobsController:
                 log_file)
             logger.info(f'\n== End of logs (ID: {self._job_id}) ==')

+    def _cleanup_cluster(self, cluster_name: Optional[str]) -> None:
+        if cluster_name is None:
+            return
+        if self._pool is None:
+            managed_job_utils.terminate_cluster(cluster_name)
+
     def _run_one_task(self, task_id: int, task: 'sky.Task') -> bool:
         """Busy loop monitoring cluster status and handling recovery.

@@ -193,10 +208,14 @@ class JobsController:
         usage_lib.messages.usage.update_task_id(task_id)
         task_id_env_var = task.envs[constants.TASK_ID_ENV_VAR]
         assert task.name is not None, task
+        # Set the cluster name to None if the job is submitted
+        # to a pool. This will be updated when we later calls the `launch`
+        # or `recover` function from the strategy executor.
         cluster_name = managed_job_utils.generate_managed_job_cluster_name(
-            task.name, self._job_id)
+            task.name, self._job_id) if self._pool is None else None
         self._strategy_executor = recovery_strategy.StrategyExecutor.make(
-            cluster_name, self._backend, task, self._job_id, task_id
+            cluster_name, self._backend, task, self._job_id, task_id,
+            self._pool)
         if not is_resume:
             submitted_at = time.time()
             if task_id == 0:
@@ -226,6 +245,13 @@ class JobsController:
         if not is_resume:
             remote_job_submitted_at = self._strategy_executor.launch()
             assert remote_job_submitted_at is not None, remote_job_submitted_at
+        if self._pool is None:
+            job_id_on_pool_cluster = None
+        else:
+            # Update the cluster name when using cluster pool.
+            cluster_name, job_id_on_pool_cluster = (
+                managed_job_state.get_pool_submit_info(self._job_id))
+            assert cluster_name is not None, (cluster_name, job_id_on_pool_cluster)

         if not is_resume:
             managed_job_state.set_started(job_id=self._job_id,
@@ -279,7 +305,9 @@ class JobsController:
             if not force_transit_to_recovering:
                 try:
                     job_status = managed_job_utils.get_job_status(
-                        self._backend,
+                        self._backend,
+                        cluster_name,
+                        job_id=job_id_on_pool_cluster)
                 except exceptions.FetchClusterInfoError as fetch_e:
                     logger.info(
                         'Failed to fetch the job status. Start recovery.\n'
@@ -288,7 +316,7 @@ class JobsController:

             if job_status == job_lib.JobStatus.SUCCEEDED:
                 success_end_time = managed_job_utils.try_to_get_job_end_time(
-                    self._backend, cluster_name)
+                    self._backend, cluster_name, job_id_on_pool_cluster)
                 # The job is done. Set the job to SUCCEEDED first before start
                 # downloading and streaming the logs to make it more responsive.
                 managed_job_state.set_succeeded(self._job_id,
@@ -299,6 +327,8 @@ class JobsController:
                     f'Managed job {self._job_id} (task: {task_id}) SUCCEEDED. '
                     f'Cleaning up the cluster {cluster_name}.')
                 try:
+                    logger.info(f'Downloading logs on cluster {cluster_name} '
+                                f'and job id {job_id_on_pool_cluster}.')
                     clusters = backend_utils.get_clusters(
                         cluster_names=[cluster_name],
                         refresh=common.StatusRefreshMode.NONE,
@@ -307,7 +337,8 @@ class JobsController:
                     assert len(clusters) == 1, (clusters, cluster_name)
                     handle = clusters[0].get('handle')
                     # Best effort to download and stream the logs.
-                    self._download_log_and_stream(task_id, handle
+                    self._download_log_and_stream(task_id, handle,
+                                                  job_id_on_pool_cluster)
                 except Exception as e:  # pylint: disable=broad-except
                     # We don't want to crash here, so just log and continue.
                     logger.warning(
@@ -316,7 +347,7 @@ class JobsController:
                         exc_info=True)
                 # Only clean up the cluster, not the storages, because tasks may
                 # share storages.
-
+                self._cleanup_cluster(cluster_name)
                 return True

             # For single-node jobs, non-terminated job_status indicates a
@@ -364,13 +395,14 @@ class JobsController:
                     job_status == job_lib.JobStatus.FAILED_DRIVER):
                 # The user code has probably crashed, fail immediately.
                 end_time = managed_job_utils.try_to_get_job_end_time(
-                    self._backend, cluster_name)
+                    self._backend, cluster_name, job_id_on_pool_cluster)
                 logger.info(
                     f'The user job failed ({job_status}). Please check the '
                     'logs below.\n'
                     f'== Logs of the user job (ID: {self._job_id}) ==\n')

-                self._download_log_and_stream(task_id, handle
+                self._download_log_and_stream(task_id, handle,
+                                              job_id_on_pool_cluster)

                 failure_reason = (
                     'To see the details, run: '
@@ -457,7 +489,7 @@ class JobsController:
             # those clusters again may fail.
             logger.info('Cleaning up the preempted or failed cluster'
                         '...')
-
+            self._cleanup_cluster(cluster_name)

             # Try to recover the managed jobs, when the cluster is preempted or
             # failed or the job status is failed to be fetched.
@@ -467,6 +499,10 @@ class JobsController:
                 force_transit_to_recovering=force_transit_to_recovering,
                 callback_func=callback_func)
             recovered_time = self._strategy_executor.recover()
+            if self._pool is not None:
+                cluster_name, job_id_on_pool_cluster = (
+                    managed_job_state.get_pool_submit_info(self._job_id))
+                assert cluster_name is not None
             managed_job_state.set_recovered(self._job_id,
                                             task_id,
                                             recovered_time=recovered_time,
@@ -541,11 +577,11 @@ class JobsController:
                 task=self._dag.tasks[task_id]))


-def _run_controller(job_id: int, dag_yaml: str):
+def _run_controller(job_id: int, dag_yaml: str, pool: Optional[str]):
     """Runs the controller in a remote process for interruption."""
     # The controller needs to be instantiated in the remote process, since
     # the controller is not serializable.
-    jobs_controller = JobsController(job_id, dag_yaml)
+    jobs_controller = JobsController(job_id, dag_yaml, pool)
     jobs_controller.run()


@@ -577,7 +613,7 @@ def _handle_signal(job_id):
                 f'User sent {user_signal.value} signal.')


-def _cleanup(job_id: int, dag_yaml: str):
+def _cleanup(job_id: int, dag_yaml: str, pool: Optional[str]):
     """Clean up the cluster(s) and storages.

     (1) Clean up the succeeded task(s)' ephemeral storage. The storage has
@@ -595,9 +631,18 @@ def _cleanup(job_id: int, dag_yaml: str):
     dag, _ = _get_dag_and_name(dag_yaml)
     for task in dag.tasks:
         assert task.name is not None, task
-
-
-
+        if pool is None:
+            cluster_name = managed_job_utils.generate_managed_job_cluster_name(
+                task.name, job_id)
+            managed_job_utils.terminate_cluster(cluster_name)
+        else:
+            cluster_name, job_id_on_pool_cluster = (
+                managed_job_state.get_pool_submit_info(job_id))
+            if cluster_name is not None:
+                if job_id_on_pool_cluster is not None:
+                    core.cancel(cluster_name=cluster_name,
+                                job_ids=[job_id_on_pool_cluster],
+                                _try_cancel_if_cluster_is_init=True)

     # Clean up Storages with persistent=False.
     # TODO(zhwu): this assumes the specific backend.
@@ -629,7 +674,7 @@ def _cleanup(job_id: int, dag_yaml: str):
                 f'Failed to clean up file mount {file_mount}: {e}')


-def start(job_id, dag_yaml):
+def start(job_id, dag_yaml, pool):
     """Start the controller."""
     controller_process = None
     cancelling = False
@@ -643,7 +688,8 @@ def start(job_id, dag_yaml):
         # So we can only enable daemon after we no longer need to
         # start daemon processes like Ray.
         controller_process = multiprocessing.Process(target=_run_controller,
-                                                      args=(job_id, dag_yaml
+                                                      args=(job_id, dag_yaml,
+                                                            pool))
         controller_process.start()
         while controller_process.is_alive():
             _handle_signal(job_id)
@@ -679,7 +725,7 @@ def start(job_id, dag_yaml):
        # https://unix.stackexchange.com/questions/356408/strange-problem-with-trap-and-sigint
        # But anyway, a clean solution is killing the controller process
        # directly, and then cleanup the cluster job_state.
-        _cleanup(job_id, dag_yaml=dag_yaml)
+        _cleanup(job_id, dag_yaml=dag_yaml, pool=pool)
         logger.info(f'Cluster of managed job {job_id} has been cleaned up.')

     if cancelling:
@@ -717,8 +763,13 @@ if __name__ == '__main__':
     parser.add_argument('dag_yaml',
                         type=str,
                         help='The path to the user job yaml file.')
+    parser.add_argument('--pool',
+                        required=False,
+                        default=None,
+                        type=str,
+                        help='The pool to use for the controller job.')
     args = parser.parse_args()
     # We start process with 'spawn', because 'fork' could result in weird
     # behaviors; 'spawn' is also cross-platform.
     multiprocessing.set_start_method('spawn', force=True)
-    start(args.job_id, args.dag_yaml)
+    start(args.job_id, args.dag_yaml, args.pool)
sky/jobs/recovery_strategy.py
CHANGED
@@ -20,6 +20,7 @@ from sky.backends import backend_utils
 from sky.jobs import scheduler
 from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
+from sky.serve import serve_utils
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
@@ -48,9 +49,9 @@ class StrategyExecutor:

     RETRY_INIT_GAP_SECONDS = 60

-    def __init__(self, cluster_name: str, backend: 'backends.Backend',
+    def __init__(self, cluster_name: Optional[str], backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int, task_id: int) -> None:
+                 job_id: int, task_id: int, pool: Optional[str]) -> None:
         """Initialize the strategy executor.

         Args:
@@ -62,17 +63,23 @@ class StrategyExecutor:
                 'Only CloudVMRayBackend is supported.')
         self.dag = sky.Dag()
         self.dag.add(task)
+        # For jobs submitted to a pool, the cluster name might change after each
+        # recovery. Initially this is set to an empty string to indicate that no
+        # cluster is assigned yet, and in `_launch`, it will be set to one of
+        # the cluster names in the pool.
         self.cluster_name = cluster_name
         self.backend = backend
         self.max_restarts_on_errors = max_restarts_on_errors
         self.job_id = job_id
         self.task_id = task_id
+        self.pool = pool
         self.restart_cnt_on_failure = 0
+        self.job_id_on_pool_cluster: Optional[int] = None

     @classmethod
-    def make(cls, cluster_name: str, backend: 'backends.Backend',
-             task: 'task_lib.Task', job_id: int,
-
+    def make(cls, cluster_name: Optional[str], backend: 'backends.Backend',
+             task: 'task_lib.Task', job_id: int, task_id: int,
+             pool: Optional[str]) -> 'StrategyExecutor':
         """Create a strategy from a task."""

         resource_list = list(task.resources)
@@ -103,7 +110,8 @@ class StrategyExecutor:
             from_str(job_recovery_name))
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
-                                     max_restarts_on_errors, job_id, task_id
+                                     max_restarts_on_errors, job_id, task_id,
+                                     pool)

     def launch(self) -> float:
         """Launch the cluster for the first time.
@@ -131,12 +139,14 @@ class StrategyExecutor:
         """
         raise NotImplementedError

-    def
+    def _try_cancel_jobs(self):
         from sky import core  # pylint: disable=import-outside-toplevel

+        if self.cluster_name is None:
+            return
         handle = global_user_state.get_handle_from_cluster_name(
             self.cluster_name)
-        if handle is None:
+        if handle is None or self.pool is not None:
             return
         try:
             usage_lib.messages.usage.set_internal()
@@ -159,8 +169,13 @@ class StrategyExecutor:
             # should be functional with the `_try_cancel_if_cluster_is_init`
             # flag, i.e. it sends the cancel signal to the head node, which will
             # then kill the user process on remaining worker nodes.
+            # Only cancel the corresponding job for worker pool.
+            if self.pool is None:
+                kwargs = dict(all=True)
+            else:
+                kwargs = dict(job_ids=[self.job_id_on_pool_cluster])
             core.cancel(cluster_name=self.cluster_name,
-
+                        **kwargs,
                         _try_cancel_if_cluster_is_init=True)
         except Exception as e:  # pylint: disable=broad-except
             logger.info('Failed to cancel the job on the cluster. The cluster '
@@ -169,7 +184,7 @@ class StrategyExecutor:
                         f'{common_utils.format_exception(e)}\n'
                         'Terminating the cluster explicitly to ensure no '
                         'remaining job process interferes with recovery.')
-
+            self._cleanup_cluster()

     def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
         """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
@@ -178,6 +193,7 @@ class StrategyExecutor:
             The timestamp of when the job is submitted, or None if failed to
             submit.
         """
+        assert self.cluster_name is not None
         status = None
         job_checking_retry_cnt = 0
         while job_checking_retry_cnt < MAX_JOB_CHECKING_RETRY:
@@ -208,7 +224,9 @@ class StrategyExecutor:

             try:
                 status = managed_job_utils.get_job_status(
-                    self.backend,
+                    self.backend,
+                    self.cluster_name,
+                    job_id=self.job_id_on_pool_cluster)
             except Exception as e:  # pylint: disable=broad-except
                 # If any unexpected error happens, retry the job checking
                 # loop.
@@ -224,7 +242,10 @@ class StrategyExecutor:
             if status is not None and status > job_lib.JobStatus.INIT:
                 try:
                     job_submitted_at = managed_job_utils.get_job_timestamp(
-                        self.backend,
+                        self.backend,
+                        self.cluster_name,
+                        self.job_id_on_pool_cluster,
+                        get_end_time=False)
                     return job_submitted_at
                 except Exception as e:  # pylint: disable=broad-except
                     # If we failed to get the job timestamp, we will retry
@@ -236,6 +257,12 @@ class StrategyExecutor:
             time.sleep(managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
         return None

+    def _cleanup_cluster(self) -> None:
+        if self.cluster_name is None:
+            return
+        if self.pool is None:
+            managed_job_utils.terminate_cluster(self.cluster_name)
+
     def _launch(self,
                 max_retry: Optional[int] = 3,
                 raise_on_failure: bool = True,
@@ -290,19 +317,35 @@ class StrategyExecutor:
                         recovery)
                 try:
                     usage_lib.messages.usage.set_internal()
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    if self.pool is None:
+                        assert self.cluster_name is not None
+                        # Detach setup, so that the setup failure can be
+                        # detected by the controller process (job_status ->
+                        # FAILED_SETUP).
+                        execution.launch(
+                            self.dag,
+                            cluster_name=self.cluster_name,
+                            # We expect to tear down the cluster as soon as
+                            # the job is finished. However, in case the
+                            # controller dies, set autodown to try and avoid
+                            # a resource leak.
+                            idle_minutes_to_autostop=_AUTODOWN_MINUTES,
+                            down=True,
+                            _is_launched_by_jobs_controller=True)
+                    else:
+                        self.cluster_name = (
+                            serve_utils.get_next_cluster_name(
+                                self.pool, self.job_id))
+                        if self.cluster_name is None:
+                            raise exceptions.NoClusterLaunchedError(
+                                'No cluster name found in the pool.')
+                        job_id_on_pool_cluster, _ = execution.exec(
+                            self.dag, cluster_name=self.cluster_name)
+                        assert job_id_on_pool_cluster is not None, (
+                            self.cluster_name, self.job_id)
+                        self.job_id_on_pool_cluster = job_id_on_pool_cluster
+                        state.set_job_id_on_pool_cluster(
+                            self.job_id, job_id_on_pool_cluster)
                     logger.info('Managed job cluster launched.')
                 except (exceptions.InvalidClusterNameError,
                         exceptions.NoCloudAccessError,
@@ -373,7 +416,7 @@ class StrategyExecutor:

                 # If we get here, the launch did not succeed. Tear down the
                 # cluster and retry.
-
+                self._cleanup_cluster()
                 if max_retry is not None and retry_cnt >= max_retry:
                     # Retry forever if max_retry is None.
                     if raise_on_failure:
@@ -398,7 +441,10 @@ class StrategyExecutor:
                 # Update the status to PENDING during backoff.
                 state.set_backoff_pending(self.job_id, self.task_id)
                 # Calculate the backoff time and sleep.
-
+                # We retry immediately for worker pool, since no sky.launch()
+                # is called and the overhead is minimal.
+                gap_seconds = (backoff.current_backoff()
+                               if self.pool is None else 0)
                 logger.info('Retrying to launch the cluster in '
                             f'{gap_seconds:.1f} seconds.')
                 time.sleep(gap_seconds)
@@ -427,11 +473,11 @@ class FailoverStrategyExecutor(StrategyExecutor):

     _MAX_RETRY_CNT = 240  # Retry for 4 hours.

-    def __init__(self, cluster_name: str, backend: 'backends.Backend',
+    def __init__(self, cluster_name: Optional[str], backend: 'backends.Backend',
                  task: 'task_lib.Task', max_restarts_on_errors: int,
-                 job_id: int, task_id: int) -> None:
+                 job_id: int, task_id: int, pool: Optional[str]) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id, task_id)
+                         job_id, task_id, pool)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
@@ -444,7 +490,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
                 recovery: bool = False) -> Optional[float]:
         job_submitted_at = super()._launch(max_retry, raise_on_failure,
                                            recovery)
-        if job_submitted_at is not None:
+        if job_submitted_at is not None and self.cluster_name is not None:
             # Only record the cloud/region if the launch is successful.
             handle = global_user_state.get_handle_from_cluster_name(
                 self.cluster_name)
@@ -464,7 +510,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
         # original user specification.

         # Step 1
-        self.
+        self._try_cancel_jobs()

         while True:
             # Add region constraint to the task, to retry on the same region
@@ -488,7 +534,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
             # Step 2
             logger.debug('Terminating unhealthy cluster and reset cloud '
                          'region.')
-
+            self._cleanup_cluster()

             # Step 3
             logger.debug('Relaunch the cluster without constraining to prior '
@@ -547,7 +593,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):

         # Step 1
         logger.debug('Terminating unhealthy cluster and reset cloud region.')
-
+        self._cleanup_cluster()

         # Step 2
         logger.debug('Relaunch the cluster skipping the previously launched '