skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250910__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +2 -2
- sky/authentication.py +19 -4
- sky/backends/backend_utils.py +35 -1
- sky/backends/cloud_vm_ray_backend.py +2 -2
- sky/client/sdk.py +20 -0
- sky/client/sdk_async.py +18 -16
- sky/clouds/aws.py +3 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-1d7e11230da3ca89.js} +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/storage.py +5 -1
- sky/execution.py +21 -14
- sky/jobs/constants.py +3 -0
- sky/jobs/controller.py +732 -310
- sky/jobs/recovery_strategy.py +251 -129
- sky/jobs/scheduler.py +247 -174
- sky/jobs/server/core.py +20 -4
- sky/jobs/server/utils.py +2 -2
- sky/jobs/state.py +702 -511
- sky/jobs/utils.py +94 -39
- sky/provision/aws/config.py +4 -1
- sky/provision/gcp/config.py +6 -1
- sky/provision/kubernetes/utils.py +17 -8
- sky/provision/provisioner.py +1 -0
- sky/serve/replica_managers.py +0 -7
- sky/serve/serve_utils.py +5 -0
- sky/serve/server/impl.py +1 -2
- sky/serve/service.py +0 -2
- sky/server/common.py +8 -3
- sky/server/config.py +43 -24
- sky/server/constants.py +1 -0
- sky/server/daemons.py +7 -11
- sky/server/requests/serializers/encoders.py +1 -1
- sky/server/server.py +8 -1
- sky/setup_files/dependencies.py +4 -2
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/constants.py +3 -1
- sky/skylet/events.py +2 -10
- sky/utils/command_runner.pyi +3 -3
- sky/utils/common_utils.py +11 -1
- sky/utils/controller_utils.py +5 -0
- sky/utils/db/db_utils.py +31 -2
- sky/utils/rich_utils.py +3 -1
- sky/utils/subprocess_utils.py +9 -0
- sky/volumes/volume.py +2 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/METADATA +39 -37
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/RECORD +67 -67
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py
CHANGED
@@ -5,18 +5,19 @@ In the YAML file, the user can specify the strategy to use for managed jobs.
 resources:
     job_recovery: EAGER_NEXT_REGION
 """
-import
+import asyncio
+import logging
 import traceback
 import typing
-from typing import Optional
+from typing import Optional, Set

 from sky import backends
 from sky import dag as dag_lib
 from sky import exceptions
-from sky import execution
 from sky import global_user_state
 from sky import sky_logging
 from sky.backends import backend_utils
+from sky.client import sdk
 from sky.jobs import scheduler
 from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
@@ -24,6 +25,7 @@ from sky.serve import serve_utils
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
+from sky.utils import context_utils
 from sky.utils import registry
 from sky.utils import status_lib
 from sky.utils import ux_utils
@@ -41,7 +43,7 @@ MAX_JOB_CHECKING_RETRY = 10
 # Minutes to job cluster autodown. This should be significantly larger than
 # managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS, to avoid tearing down the
 # cluster before its status can be updated by the job controller.
-_AUTODOWN_MINUTES =
+_AUTODOWN_MINUTES = 10


 class StrategyExecutor:
@@ -49,15 +51,33 @@ class StrategyExecutor:

     RETRY_INIT_GAP_SECONDS = 60

-    def __init__(
-
-
+    def __init__(
+        self,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        max_restarts_on_errors: int,
+        job_id: int,
+        task_id: int,
+        job_logger: logging.Logger,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> None:
         """Initialize the strategy executor.

         Args:
             cluster_name: The name of the cluster.
             backend: The backend to use. Only CloudVMRayBackend is supported.
             task: The task to execute.
+            max_restarts_on_errors: Maximum number of restarts on errors.
+            job_id: The ID of the job.
+            task_id: The ID of the task.
+            job_logger: Logger instance for this specific job.
+            starting: Set of job IDs that are currently starting.
+            starting_lock: Lock to synchronize starting jobs.
+            starting_signal: Condition to signal when a job can start.
         """
         assert isinstance(backend, backends.CloudVmRayBackend), (
             'Only CloudVMRayBackend is supported.')
@@ -74,12 +94,26 @@ class StrategyExecutor:
         self.task_id = task_id
         self.pool = pool
         self.restart_cnt_on_failure = 0
+        self._logger = job_logger
         self.job_id_on_pool_cluster: Optional[int] = None
+        self.starting = starting
+        self.starting_lock = starting_lock
+        self.starting_signal = starting_signal

     @classmethod
-    def make(
-
-
+    def make(
+        cls,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        job_id: int,
+        task_id: int,
+        job_logger: logging.Logger,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> 'StrategyExecutor':
         """Create a strategy from a task."""

         resource_list = list(task.resources)
@@ -111,9 +145,10 @@ class StrategyExecutor:
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
                                      max_restarts_on_errors, job_id, task_id,
-                                     pool
+                                     job_logger, pool, starting, starting_lock,
+                                     starting_signal)

-    def launch(self) -> float:
+    async def launch(self) -> float:
         """Launch the cluster for the first time.

         It can fail if resource is not available. Need to check the cluster
@@ -125,11 +160,11 @@ class StrategyExecutor:
         Raises: Please refer to the docstring of self._launch().
         """

-        job_submit_at = self._launch(max_retry=None)
+        job_submit_at = await self._launch(max_retry=None)
         assert job_submit_at is not None
         return job_submit_at

-    def recover(self) -> float:
+    async def recover(self) -> float:
         """Relaunch the cluster after failure and wait until job starts.

         When recover() is called the cluster should be in STOPPED status (i.e.
@@ -139,13 +174,11 @@ class StrategyExecutor:
         """
         raise NotImplementedError

-    def _try_cancel_jobs(self):
-        from sky import core  # pylint: disable=import-outside-toplevel
-
+    async def _try_cancel_jobs(self):
         if self.cluster_name is None:
             return
-        handle =
-            self.cluster_name)
+        handle = await context_utils.to_thread(
+            global_user_state.get_handle_from_cluster_name, self.cluster_name)
         if handle is None or self.pool is not None:
             return
         try:
@@ -174,9 +207,16 @@ class StrategyExecutor:
                 kwargs = dict(all=True)
             else:
                 kwargs = dict(job_ids=[self.job_id_on_pool_cluster])
-
-
-
+            request_id = await context_utils.to_thread(
+                sdk.cancel,
+                cluster_name=self.cluster_name,
+                **kwargs,
+                _try_cancel_if_cluster_is_init=True,
+            )
+            await context_utils.to_thread(
+                sdk.get,
+                request_id,
+            )
         except Exception as e:  # pylint: disable=broad-except
             logger.info('Failed to cancel the job on the cluster. The cluster '
                         'might be already down or the head node is preempted.'
@@ -184,9 +224,9 @@ class StrategyExecutor:
                         f'{common_utils.format_exception(e)}\n'
                         'Terminating the cluster explicitly to ensure no '
                         'remaining job process interferes with recovery.')
-            self._cleanup_cluster
+            await context_utils.to_thread(self._cleanup_cluster)

-    def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
+    async def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
         """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster

         Returns:
@@ -200,32 +240,34 @@ class StrategyExecutor:
             # Avoid the infinite loop, if any bug happens.
             job_checking_retry_cnt += 1
             try:
-                cluster_status, _ = (
-                    backend_utils.refresh_cluster_status_handle
-
-
+                cluster_status, _ = (await context_utils.to_thread(
+                    backend_utils.refresh_cluster_status_handle,
+                    self.cluster_name,
+                    force_refresh_statuses=set(status_lib.ClusterStatus)))
             except Exception as e:  # pylint: disable=broad-except
                 # If any unexpected error happens, retry the job checking
                 # loop.
                 # TODO(zhwu): log the unexpected error to usage collection
                 # for future debugging.
-
-
+                self._logger.info(
+                    f'Unexpected exception: {e}\nFailed to get the '
+                    'refresh the cluster status. Retrying.')
                 continue
             if cluster_status != status_lib.ClusterStatus.UP:
                 # The cluster can be preempted before the job is
                 # launched.
                 # Break to let the retry launch kick in.
-
-
+                self._logger.info('The cluster is preempted before the job '
+                                  'is submitted.')
                 # TODO(zhwu): we should recover the preemption with the
                 # recovery strategy instead of the current while loop.
                 break

             try:
-                status = managed_job_utils.get_job_status(
+                status = await managed_job_utils.get_job_status(
                     self.backend,
                     self.cluster_name,
+                    job_logger=self._logger,
                     job_id=self.job_id_on_pool_cluster)
             except Exception as e:  # pylint: disable=broad-except
                 # If any unexpected error happens, retry the job checking
@@ -234,14 +276,16 @@ class StrategyExecutor:
                 # get_job_status, so it should not happen here.
                 # TODO(zhwu): log the unexpected error to usage collection
                 # for future debugging.
-
-
+                self._logger.info(
+                    f'Unexpected exception: {e}\nFailed to get the '
+                    'job status. Retrying.')
                 continue

             # Check the job status until it is not in initialized status
             if status is not None and status > job_lib.JobStatus.INIT:
                 try:
-                    job_submitted_at =
+                    job_submitted_at = await context_utils.to_thread(
+                        managed_job_utils.get_job_timestamp,
                         self.backend,
                         self.cluster_name,
                         self.job_id_on_pool_cluster,
@@ -250,11 +294,13 @@ class StrategyExecutor:
                 except Exception as e:  # pylint: disable=broad-except
                     # If we failed to get the job timestamp, we will retry
                     # job checking loop.
-
-
+                    self._logger.info(
+                        f'Unexpected Exception: {e}\nFailed to get '
+                        'the job start timestamp. Retrying.')
                     continue
             # Wait for the job to be started
-
+            await asyncio.sleep(
+                managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
         return None

     def _cleanup_cluster(self) -> None:
@@ -263,10 +309,10 @@ class StrategyExecutor:
         if self.pool is None:
             managed_job_utils.terminate_cluster(self.cluster_name)

-    def _launch(self,
-
-
-
+    async def _launch(self,
+                      max_retry: Optional[int] = 3,
+                      raise_on_failure: bool = True,
+                      recovery: bool = False) -> Optional[float]:
         """Implementation of launch().

         The function will wait until the job starts running, but will leave the
@@ -307,56 +353,107 @@ class StrategyExecutor:
         while True:
             retry_cnt += 1
             try:
-                with scheduler.scheduled_launch(
+                async with scheduler.scheduled_launch(
+                    self.job_id,
+                    self.starting,
+                    self.starting_lock,
+                    self.starting_signal,
+                    self._logger,
+                ):
                     # The job state may have been PENDING during backoff -
                     # update to STARTING or RECOVERING.
                     # On the first attempt (when retry_cnt is 1), we should
                     # already be in STARTING or RECOVERING.
                     if retry_cnt > 1:
-                        state.
-
+                        await state.set_restarting_async(
+                            self.job_id, self.task_id, recovery)
                     try:
                         usage_lib.messages.usage.set_internal()
                         if self.pool is None:
                             assert self.cluster_name is not None
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                            log_file = _get_logger_file(self._logger)
+                            request_id = None
+                            try:
+                                request_id = await context_utils.to_thread(
+                                    sdk.launch,
+                                    self.dag,
+                                    cluster_name=self.cluster_name,
+                                    # We expect to tear down the cluster as soon
+                                    # as the job is finished. However, in case
+                                    # the controller dies, we may end up with a
+                                    # resource leak.
+                                    # Ideally, we should autodown to be safe,
+                                    # but it's fine to disable it for now, as
+                                    # Nebius doesn't support autodown yet.
+                                    # TODO(kevin): set down=True once Nebius
+                                    # supports autodown.
+                                    # idle_minutes_to_autostop=(
+                                    #     _AUTODOWN_MINUTES),
+                                    # down=True,
+                                    _is_launched_by_jobs_controller=True,
+                                )
+                                if log_file is None:
+                                    raise OSError('Log file is None')
+                                with open(log_file, 'a', encoding='utf-8') as f:
+                                    await context_utils.to_thread(
+                                        sdk.stream_and_get,
+                                        request_id,
+                                        output_stream=f,
+                                    )
+                            except asyncio.CancelledError:
+                                if request_id:
+                                    req = await context_utils.to_thread(
+                                        sdk.api_cancel, request_id)
+                                    try:
+                                        await context_utils.to_thread(
+                                            sdk.get, req)
+                                    except Exception as e:  # pylint: disable=broad-except
+                                        # we must still return a CancelledError
+                                        self._logger.error(
+                                            f'Failed to cancel the job: {e}')
+                                raise
+                            self._logger.info('Managed job cluster launched.')
                         else:
-                            self.cluster_name = (
-                                serve_utils.get_next_cluster_name
-
+                            self.cluster_name = await (context_utils.to_thread(
+                                serve_utils.get_next_cluster_name, self.pool,
+                                self.job_id))
                             if self.cluster_name is None:
                                 raise exceptions.NoClusterLaunchedError(
                                     'No cluster name found in the pool.')
-
-
+                            request_id = None
+                            try:
+                                request_id = await context_utils.to_thread(
+                                    sdk.exec,
+                                    self.dag,
+                                    cluster_name=self.cluster_name,
+                                )
+                                job_id_on_pool_cluster, _ = (
+                                    await context_utils.to_thread(
+                                        sdk.get, request_id))
+                            except asyncio.CancelledError:
+                                if request_id:
+                                    req = await context_utils.to_thread(
+                                        sdk.api_cancel, request_id)
+                                    try:
+                                        await context_utils.to_thread(
+                                            sdk.get, req)
+                                    except Exception as e:  # pylint: disable=broad-except
+                                        # we must still return a CancelledError
+                                        self._logger.error(
+                                            f'Failed to cancel the job: {e}')
+                                raise
                             assert job_id_on_pool_cluster is not None, (
                                 self.cluster_name, self.job_id)
                             self.job_id_on_pool_cluster = job_id_on_pool_cluster
-                            state.
+                            await state.set_job_id_on_pool_cluster_async(
                                 self.job_id, job_id_on_pool_cluster)
-
+                            self._logger.info('Managed job cluster launched.')
                     except (exceptions.InvalidClusterNameError,
                             exceptions.NoCloudAccessError,
                             exceptions.ResourcesMismatchError) as e:
-
-
+                        self._logger.error(
+                            'Failure happened before provisioning. '
+                            f'{common_utils.format_exception(e)}')
                         if raise_on_failure:
                             raise exceptions.ProvisionPrechecksError(
                                 reasons=[e])
@@ -384,28 +481,30 @@ class StrategyExecutor:
                             reasons_str = '; '.join(
                                 common_utils.format_exception(err)
                                 for err in reasons)
-
+                            self._logger.error(
                                 'Failure happened before provisioning. '
                                 f'Failover reasons: {reasons_str}')
                             if raise_on_failure:
                                 raise exceptions.ProvisionPrechecksError(
                                     reasons)
                             return None
-
-
+                        self._logger.info(
+                            'Failed to launch a cluster with error: '
+                            f'{common_utils.format_exception(e)})')
                     except Exception as e:  # pylint: disable=broad-except
                         # If the launch fails, it will be recovered by the
                         # following code.
-
-
+                        self._logger.info(
+                            'Failed to launch a cluster with error: '
+                            f'{common_utils.format_exception(e)})')
                         with ux_utils.enable_traceback():
-
+                            self._logger.info(
                                 f' Traceback: {traceback.format_exc()}')
                     else:  # No exception, the launch succeeds.
                         # At this point, a sky.launch() has succeeded. Cluster
                         # may be UP (no preemption since) or DOWN (newly
                         # preempted).
-                        job_submitted_at = (
+                        job_submitted_at = await (
                             self._wait_until_job_starts_on_cluster())
                         if job_submitted_at is not None:
                             return job_submitted_at
@@ -413,7 +512,7 @@ class StrategyExecutor:
                         # launch.
                         # TODO(zhwu): log the unexpected error to usage
                         # collection for future debugging.
-
+                        self._logger.info(
                            'Failed to successfully submit the job to the '
                            'launched cluster, due to unexpected submission '
                            'errors or the cluster being preempted during '
@@ -421,7 +520,7 @@ class StrategyExecutor:

                 # If we get here, the launch did not succeed. Tear down the
                 # cluster and retry.
-                self._cleanup_cluster
+                await context_utils.to_thread(self._cleanup_cluster)
                 if max_retry is not None and retry_cnt >= max_retry:
                     # Retry forever if max_retry is None.
                     if raise_on_failure:
@@ -444,15 +543,13 @@ class StrategyExecutor:

             except exceptions.NoClusterLaunchedError:
                 # Update the status to PENDING during backoff.
-                state.
+                state.set_backoff_pending_async(self.job_id, self.task_id)
                 # Calculate the backoff time and sleep.
-                # We retry immediately for worker pool, since no sky.launch()
-                # is called and the overhead is minimal.
                 gap_seconds = (backoff.current_backoff()
                                if self.pool is None else 1)
-
-
-
+                self._logger.info('Retrying to launch the cluster in '
+                                  f'{gap_seconds:.1f} seconds.')
+                await asyncio.sleep(gap_seconds)
                 continue
             else:
                 # The inner loop should either return or throw
@@ -478,26 +575,39 @@ class FailoverStrategyExecutor(StrategyExecutor):

     _MAX_RETRY_CNT = 240  # Retry for 4 hours.

-    def __init__(
-
-
+    def __init__(
+        self,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        max_restarts_on_errors: int,
+        job_id: int,
+        task_id: int,
+        job_logger: logging.Logger,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id, task_id, pool
+                         job_id, task_id, job_logger, pool, starting,
+                         starting_lock, starting_signal)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
         # preempted.)
         self._launched_resources: Optional['resources.Resources'] = None

-    def _launch(self,
-
-
-
-        job_submitted_at = super()._launch(max_retry, raise_on_failure,
-
+    async def _launch(self,
+                      max_retry: Optional[int] = 3,
+                      raise_on_failure: bool = True,
+                      recovery: bool = False) -> Optional[float]:
+        job_submitted_at = await super()._launch(max_retry, raise_on_failure,
+                                                 recovery)
         if job_submitted_at is not None and self.cluster_name is not None:
             # Only record the cloud/region if the launch is successful.
-            handle =
+            handle = await context_utils.to_thread(
+                global_user_state.get_handle_from_cluster_name,
                 self.cluster_name)
             assert isinstance(handle, backends.CloudVmRayResourceHandle), (
                 'Cluster should be launched.', handle)
@@ -507,7 +617,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
             self._launched_resources = None
         return job_submitted_at

-    def recover(self) -> float:
+    async def recover(self) -> float:
         # 1. Cancel the jobs and launch the cluster with the STOPPED status,
         #    so that it will try on the current region first until timeout.
         # 2. Tear down the cluster, if the step 1 failed to launch the cluster.
@@ -515,7 +625,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
         #    original user specification.

         # Step 1
-        self._try_cancel_jobs()
+        await self._try_cancel_jobs()

         while True:
             # Add region constraint to the task, to retry on the same region
@@ -529,31 +639,32 @@ class FailoverStrategyExecutor(StrategyExecutor):
                     cloud=launched_cloud, region=launched_region, zone=None)
                 task.set_resources({new_resources})
                 # Not using self.launch to avoid the retry until up logic.
-                job_submitted_at = self._launch(raise_on_failure=False,
-
+                job_submitted_at = await self._launch(raise_on_failure=False,
+                                                      recovery=True)
                 # Restore the original dag, i.e. reset the region constraint.
                 task.set_resources(original_resources)
                 if job_submitted_at is not None:
                     return job_submitted_at

             # Step 2
-
-
-            self._cleanup_cluster
+            self._logger.debug('Terminating unhealthy cluster and reset cloud '
+                               'region.')
+            await context_utils.to_thread(self._cleanup_cluster)

             # Step 3
-
-
+            self._logger.debug(
+                'Relaunch the cluster without constraining to prior '
+                'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-
-
+            job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
+                                                  raise_on_failure=False,
+                                                  recovery=True)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
-
-
-
+                self._logger.info('Retrying to recover the cluster in '
+                                  f'{gap_seconds:.1f} seconds.')
+                await asyncio.sleep(gap_seconds)
                 continue

             return job_submitted_at
@@ -585,7 +696,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
           -> R1Z1 (success)
     """

-    def recover(self) -> float:
+    async def recover(self) -> float:
         # 1. Terminate the current cluster
         # 2. Launch again by explicitly blocking the previously launched region
         #    (this will failover through the entire search space except the
@@ -597,12 +708,14 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
         #    task.resources.

         # Step 1
-
-
+        self._logger.debug(
+            'Terminating unhealthy cluster and reset cloud region.')
+        await context_utils.to_thread(self._cleanup_cluster)

         # Step 2
-
-
+        self._logger.debug(
+            'Relaunch the cluster skipping the previously launched '
+            'cloud/region.')
         if self._launched_resources is not None:
             task = self.dag.tasks[0]
             requested_resources = self._launched_resources
@@ -619,26 +732,35 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
                                          region=launched_region)
             }
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(raise_on_failure=False,
-
+            job_submitted_at = await self._launch(raise_on_failure=False,
+                                                  recovery=True)
             task.blocked_resources = None
             if job_submitted_at is not None:
                 return job_submitted_at

         while True:
             # Step 3
-
-
+            self._logger.debug(
+                'Relaunch the cluster without constraining to prior '
+                'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-
-
+            job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
+                                                  raise_on_failure=False,
+                                                  recovery=True)
            if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
-
-
-
+                self._logger.info('Retrying to recover the cluster in '
+                                  f'{gap_seconds:.1f} seconds.')
+                await asyncio.sleep(gap_seconds)
                 continue

             return job_submitted_at
+
+
+def _get_logger_file(file_logger: logging.Logger) -> Optional[str]:
+    """Gets the file path that the logger writes to."""
+    for handler in file_logger.handlers:
+        if isinstance(handler, logging.FileHandler):
+            return handler.baseFilename
+    return None