skypilot-nightly 1.0.0.dev20250909__py3-none-any.whl → 1.0.0.dev20250910__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of skypilot-nightly might be problematic.
Files changed (67)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +19 -4
  3. sky/backends/backend_utils.py +35 -1
  4. sky/backends/cloud_vm_ray_backend.py +2 -2
  5. sky/client/sdk.py +20 -0
  6. sky/client/sdk_async.py +18 -16
  7. sky/clouds/aws.py +3 -1
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/chunks/{webpack-d4fabc08788e14af.js → webpack-1d7e11230da3ca89.js} +1 -1
  10. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  11. sky/dashboard/out/clusters/[cluster].html +1 -1
  12. sky/dashboard/out/clusters.html +1 -1
  13. sky/dashboard/out/config.html +1 -1
  14. sky/dashboard/out/index.html +1 -1
  15. sky/dashboard/out/infra/[context].html +1 -1
  16. sky/dashboard/out/infra.html +1 -1
  17. sky/dashboard/out/jobs/[job].html +1 -1
  18. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/dashboard/out/users.html +1 -1
  21. sky/dashboard/out/volumes.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/data/storage.py +5 -1
  26. sky/execution.py +21 -14
  27. sky/jobs/constants.py +3 -0
  28. sky/jobs/controller.py +732 -310
  29. sky/jobs/recovery_strategy.py +251 -129
  30. sky/jobs/scheduler.py +247 -174
  31. sky/jobs/server/core.py +20 -4
  32. sky/jobs/server/utils.py +2 -2
  33. sky/jobs/state.py +702 -511
  34. sky/jobs/utils.py +94 -39
  35. sky/provision/aws/config.py +4 -1
  36. sky/provision/gcp/config.py +6 -1
  37. sky/provision/kubernetes/utils.py +17 -8
  38. sky/provision/provisioner.py +1 -0
  39. sky/serve/replica_managers.py +0 -7
  40. sky/serve/serve_utils.py +5 -0
  41. sky/serve/server/impl.py +1 -2
  42. sky/serve/service.py +0 -2
  43. sky/server/common.py +8 -3
  44. sky/server/config.py +43 -24
  45. sky/server/constants.py +1 -0
  46. sky/server/daemons.py +7 -11
  47. sky/server/requests/serializers/encoders.py +1 -1
  48. sky/server/server.py +8 -1
  49. sky/setup_files/dependencies.py +4 -2
  50. sky/skylet/attempt_skylet.py +1 -0
  51. sky/skylet/constants.py +3 -1
  52. sky/skylet/events.py +2 -10
  53. sky/utils/command_runner.pyi +3 -3
  54. sky/utils/common_utils.py +11 -1
  55. sky/utils/controller_utils.py +5 -0
  56. sky/utils/db/db_utils.py +31 -2
  57. sky/utils/rich_utils.py +3 -1
  58. sky/utils/subprocess_utils.py +9 -0
  59. sky/volumes/volume.py +2 -0
  60. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/METADATA +39 -37
  61. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/RECORD +67 -67
  62. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_buildManifest.js +0 -0
  63. /sky/dashboard/out/_next/static/{eWytLgin5zvayQw3Xk46m → 3SYxqNGnvvPS8h3gdD2T7}/_ssgManifest.js +0 -0
  64. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/WHEEL +0 -0
  65. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/entry_points.txt +0 -0
  66. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/licenses/LICENSE +0 -0
  67. {skypilot_nightly-1.0.0.dev20250909.dist-info → skypilot_nightly-1.0.0.dev20250910.dist-info}/top_level.txt +0 -0
@@ -5,18 +5,19 @@ In the YAML file, the user can specify the strategy to use for managed jobs.
  resources:
  job_recovery: EAGER_NEXT_REGION
  """
- import time
+ import asyncio
+ import logging
  import traceback
  import typing
- from typing import Optional
+ from typing import Optional, Set

  from sky import backends
  from sky import dag as dag_lib
  from sky import exceptions
- from sky import execution
  from sky import global_user_state
  from sky import sky_logging
  from sky.backends import backend_utils
+ from sky.client import sdk
  from sky.jobs import scheduler
  from sky.jobs import state
  from sky.jobs import utils as managed_job_utils
@@ -24,6 +25,7 @@ from sky.serve import serve_utils
  from sky.skylet import job_lib
  from sky.usage import usage_lib
  from sky.utils import common_utils
+ from sky.utils import context_utils
  from sky.utils import registry
  from sky.utils import status_lib
  from sky.utils import ux_utils
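The import changes above set the direction for the rest of this file's diff: the recovery path becomes asyncio-based (`asyncio`, `logging`, `Set`), blocking work is pushed onto worker threads via `context_utils.to_thread`, and cluster operations go through `sky.client.sdk` rather than `sky.execution`. As a rough sketch of that pattern only, using the stdlib `asyncio.to_thread` and a made-up `blocking_status_probe` helper (neither is the package's actual code):

```python
import asyncio
import time


def blocking_status_probe(cluster_name: str) -> str:
    """Stand-in for a blocking call such as a cluster status refresh."""
    time.sleep(1)  # simulate network / subprocess latency
    return f'{cluster_name}: UP'


async def controller_step() -> None:
    # Run the blocking call on a worker thread so the event loop stays
    # free to drive other managed jobs, then sleep cooperatively.
    status = await asyncio.to_thread(blocking_status_probe, 'my-cluster')
    print(status)
    await asyncio.sleep(0.1)


asyncio.run(controller_step())
```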
@@ -41,7 +43,7 @@ MAX_JOB_CHECKING_RETRY = 10
  # Minutes to job cluster autodown. This should be significantly larger than
  # managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS, to avoid tearing down the
  # cluster before its status can be updated by the job controller.
- _AUTODOWN_MINUTES = 5
+ _AUTODOWN_MINUTES = 10


  class StrategyExecutor:
@@ -49,15 +51,33 @@ class StrategyExecutor:

  RETRY_INIT_GAP_SECONDS = 60

- def __init__(self, cluster_name: Optional[str], backend: 'backends.Backend',
- task: 'task_lib.Task', max_restarts_on_errors: int,
- job_id: int, task_id: int, pool: Optional[str]) -> None:
+ def __init__(
+ self,
+ cluster_name: Optional[str],
+ backend: 'backends.Backend',
+ task: 'task_lib.Task',
+ max_restarts_on_errors: int,
+ job_id: int,
+ task_id: int,
+ job_logger: logging.Logger,
+ pool: Optional[str],
+ starting: Set[int],
+ starting_lock: asyncio.Lock,
+ starting_signal: asyncio.Condition,
+ ) -> None:
  """Initialize the strategy executor.

  Args:
  cluster_name: The name of the cluster.
  backend: The backend to use. Only CloudVMRayBackend is supported.
  task: The task to execute.
+ max_restarts_on_errors: Maximum number of restarts on errors.
+ job_id: The ID of the job.
+ task_id: The ID of the task.
+ job_logger: Logger instance for this specific job.
+ starting: Set of job IDs that are currently starting.
+ starting_lock: Lock to synchronize starting jobs.
+ starting_signal: Condition to signal when a job can start.
  """
  assert isinstance(backend, backends.CloudVmRayBackend), (
  'Only CloudVMRayBackend is supported.')
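The constructor now threads a shared `starting` set, an `asyncio.Lock`, and an `asyncio.Condition` through every executor, which is the usual shape for capping how many jobs may provision at once. A minimal sketch of that gating pattern is below; the cap `MAX_STARTING` and the `launch_with_slot` helper are illustrative assumptions, and in this release the real gating lives inside `scheduler.scheduled_launch` (seen in a later hunk):

```python
import asyncio
from typing import Set

MAX_STARTING = 4  # hypothetical cap on concurrently-starting jobs


async def launch_with_slot(job_id: int, starting: Set[int],
                           starting_signal: asyncio.Condition) -> None:
    # Claim a "starting" slot: wait until the set has room, then add
    # this job so other coroutines see the slot as taken.
    async with starting_signal:
        await starting_signal.wait_for(lambda: len(starting) < MAX_STARTING)
        starting.add(job_id)
    try:
        await asyncio.sleep(1)  # stand-in for the actual launch work
    finally:
        # Release the slot and wake any coroutine blocked in wait_for().
        async with starting_signal:
            starting.discard(job_id)
            starting_signal.notify_all()
```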
@@ -74,12 +94,26 @@ class StrategyExecutor:
  self.task_id = task_id
  self.pool = pool
  self.restart_cnt_on_failure = 0
+ self._logger = job_logger
  self.job_id_on_pool_cluster: Optional[int] = None
+ self.starting = starting
+ self.starting_lock = starting_lock
+ self.starting_signal = starting_signal

  @classmethod
- def make(cls, cluster_name: Optional[str], backend: 'backends.Backend',
- task: 'task_lib.Task', job_id: int, task_id: int,
- pool: Optional[str]) -> 'StrategyExecutor':
+ def make(
+ cls,
+ cluster_name: Optional[str],
+ backend: 'backends.Backend',
+ task: 'task_lib.Task',
+ job_id: int,
+ task_id: int,
+ job_logger: logging.Logger,
+ pool: Optional[str],
+ starting: Set[int],
+ starting_lock: asyncio.Lock,
+ starting_signal: asyncio.Condition,
+ ) -> 'StrategyExecutor':
  """Create a strategy from a task."""

  resource_list = list(task.resources)
@@ -111,9 +145,10 @@ class StrategyExecutor:
  assert job_recovery_strategy is not None, job_recovery_name
  return job_recovery_strategy(cluster_name, backend, task,
  max_restarts_on_errors, job_id, task_id,
- pool)
+ job_logger, pool, starting, starting_lock,
+ starting_signal)

- def launch(self) -> float:
+ async def launch(self) -> float:
  """Launch the cluster for the first time.

  It can fail if resource is not available. Need to check the cluster
@@ -125,11 +160,11 @@ class StrategyExecutor:
  Raises: Please refer to the docstring of self._launch().
  """

- job_submit_at = self._launch(max_retry=None)
+ job_submit_at = await self._launch(max_retry=None)
  assert job_submit_at is not None
  return job_submit_at

- def recover(self) -> float:
+ async def recover(self) -> float:
  """Relaunch the cluster after failure and wait until job starts.

  When recover() is called the cluster should be in STOPPED status (i.e.
@@ -139,13 +174,11 @@ class StrategyExecutor:
  """
  raise NotImplementedError

- def _try_cancel_jobs(self):
- from sky import core # pylint: disable=import-outside-toplevel
-
+ async def _try_cancel_jobs(self):
  if self.cluster_name is None:
  return
- handle = global_user_state.get_handle_from_cluster_name(
- self.cluster_name)
+ handle = await context_utils.to_thread(
+ global_user_state.get_handle_from_cluster_name, self.cluster_name)
  if handle is None or self.pool is not None:
  return
  try:
@@ -174,9 +207,16 @@ class StrategyExecutor:
  kwargs = dict(all=True)
  else:
  kwargs = dict(job_ids=[self.job_id_on_pool_cluster])
- core.cancel(cluster_name=self.cluster_name,
- **kwargs,
- _try_cancel_if_cluster_is_init=True)
+ request_id = await context_utils.to_thread(
+ sdk.cancel,
+ cluster_name=self.cluster_name,
+ **kwargs,
+ _try_cancel_if_cluster_is_init=True,
+ )
+ await context_utils.to_thread(
+ sdk.get,
+ request_id,
+ )
  except Exception as e: # pylint: disable=broad-except
  logger.info('Failed to cancel the job on the cluster. The cluster '
  'might be already down or the head node is preempted.'
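`_try_cancel_jobs` now goes through the request-based client SDK: the call itself returns a request ID and a separate `sdk.get()` blocks until the server has processed it, so both steps are moved off the event loop. A condensed sketch, assuming the stdlib `asyncio.to_thread` in place of the `context_utils.to_thread` wrapper used in the diff:

```python
import asyncio

from sky.client import sdk


async def cancel_all_jobs(cluster_name: str) -> None:
    # Submit the cancel request from a worker thread, then wait for the
    # server to finish processing it.
    request_id = await asyncio.to_thread(sdk.cancel,
                                         cluster_name=cluster_name,
                                         all=True)
    await asyncio.to_thread(sdk.get, request_id)
```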
@@ -184,9 +224,9 @@ class StrategyExecutor:
  f'{common_utils.format_exception(e)}\n'
  'Terminating the cluster explicitly to ensure no '
  'remaining job process interferes with recovery.')
- self._cleanup_cluster()
+ await context_utils.to_thread(self._cleanup_cluster)

- def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
+ async def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
  """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster

  Returns:
@@ -200,32 +240,34 @@ class StrategyExecutor:
  # Avoid the infinite loop, if any bug happens.
  job_checking_retry_cnt += 1
  try:
- cluster_status, _ = (
- backend_utils.refresh_cluster_status_handle(
- self.cluster_name,
- force_refresh_statuses=set(status_lib.ClusterStatus)))
+ cluster_status, _ = (await context_utils.to_thread(
+ backend_utils.refresh_cluster_status_handle,
+ self.cluster_name,
+ force_refresh_statuses=set(status_lib.ClusterStatus)))
  except Exception as e: # pylint: disable=broad-except
  # If any unexpected error happens, retry the job checking
  # loop.
  # TODO(zhwu): log the unexpected error to usage collection
  # for future debugging.
- logger.info(f'Unexpected exception: {e}\nFailed to get the '
- 'refresh the cluster status. Retrying.')
+ self._logger.info(
+ f'Unexpected exception: {e}\nFailed to get the '
+ 'refresh the cluster status. Retrying.')
  continue
  if cluster_status != status_lib.ClusterStatus.UP:
  # The cluster can be preempted before the job is
  # launched.
  # Break to let the retry launch kick in.
- logger.info('The cluster is preempted before the job '
- 'is submitted.')
+ self._logger.info('The cluster is preempted before the job '
+ 'is submitted.')
  # TODO(zhwu): we should recover the preemption with the
  # recovery strategy instead of the current while loop.
  break

  try:
- status = managed_job_utils.get_job_status(
+ status = await managed_job_utils.get_job_status(
  self.backend,
  self.cluster_name,
+ job_logger=self._logger,
  job_id=self.job_id_on_pool_cluster)
  except Exception as e: # pylint: disable=broad-except
  # If any unexpected error happens, retry the job checking
@@ -234,14 +276,16 @@ class StrategyExecutor:
  # get_job_status, so it should not happen here.
  # TODO(zhwu): log the unexpected error to usage collection
  # for future debugging.
- logger.info(f'Unexpected exception: {e}\nFailed to get the '
- 'job status. Retrying.')
+ self._logger.info(
+ f'Unexpected exception: {e}\nFailed to get the '
+ 'job status. Retrying.')
  continue

  # Check the job status until it is not in initialized status
  if status is not None and status > job_lib.JobStatus.INIT:
  try:
- job_submitted_at = managed_job_utils.get_job_timestamp(
+ job_submitted_at = await context_utils.to_thread(
+ managed_job_utils.get_job_timestamp,
  self.backend,
  self.cluster_name,
  self.job_id_on_pool_cluster,
@@ -250,11 +294,13 @@ class StrategyExecutor:
  except Exception as e: # pylint: disable=broad-except
  # If we failed to get the job timestamp, we will retry
  # job checking loop.
- logger.info(f'Unexpected Exception: {e}\nFailed to get '
- 'the job start timestamp. Retrying.')
+ self._logger.info(
+ f'Unexpected Exception: {e}\nFailed to get '
+ 'the job start timestamp. Retrying.')
  continue
  # Wait for the job to be started
- time.sleep(managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
+ await asyncio.sleep(
+ managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
  return None

  def _cleanup_cluster(self) -> None:
@@ -263,10 +309,10 @@ class StrategyExecutor:
  if self.pool is None:
  managed_job_utils.terminate_cluster(self.cluster_name)

- def _launch(self,
- max_retry: Optional[int] = 3,
- raise_on_failure: bool = True,
- recovery: bool = False) -> Optional[float]:
+ async def _launch(self,
+ max_retry: Optional[int] = 3,
+ raise_on_failure: bool = True,
+ recovery: bool = False) -> Optional[float]:
  """Implementation of launch().

  The function will wait until the job starts running, but will leave the
@@ -307,56 +353,107 @@ class StrategyExecutor:
  while True:
  retry_cnt += 1
  try:
- with scheduler.scheduled_launch(self.job_id):
+ async with scheduler.scheduled_launch(
+ self.job_id,
+ self.starting,
+ self.starting_lock,
+ self.starting_signal,
+ self._logger,
+ ):
  # The job state may have been PENDING during backoff -
  # update to STARTING or RECOVERING.
  # On the first attempt (when retry_cnt is 1), we should
  # already be in STARTING or RECOVERING.
  if retry_cnt > 1:
- state.set_restarting(self.job_id, self.task_id,
- recovery)
+ await state.set_restarting_async(
+ self.job_id, self.task_id, recovery)
  try:
  usage_lib.messages.usage.set_internal()
  if self.pool is None:
  assert self.cluster_name is not None
- # Detach setup, so that the setup failure can be
- # detected by the controller process (job_status ->
- # FAILED_SETUP).
- execution.launch(
- self.dag,
- cluster_name=self.cluster_name,
- # We expect to tear down the cluster as soon as
- # the job is finished. However, in case the
- # controller dies, we may end up with a
- # resource leak.
- # Ideally, we should autodown to be safe,
- # but it's fine to disable it for now, as
- # Nebius doesn't support autodown yet.
- # TODO(kevin): set down=True once Nebius
- # supports autodown.
- # idle_minutes_to_autostop=_AUTODOWN_MINUTES,
- # down=True,
- _is_launched_by_jobs_controller=True)
+ log_file = _get_logger_file(self._logger)
+ request_id = None
+ try:
+ request_id = await context_utils.to_thread(
+ sdk.launch,
+ self.dag,
+ cluster_name=self.cluster_name,
+ # We expect to tear down the cluster as soon
+ # as the job is finished. However, in case
+ # the controller dies, we may end up with a
+ # resource leak.
+ # Ideally, we should autodown to be safe,
+ # but it's fine to disable it for now, as
+ # Nebius doesn't support autodown yet.
+ # TODO(kevin): set down=True once Nebius
+ # supports autodown.
+ # idle_minutes_to_autostop=(
+ # _AUTODOWN_MINUTES),
+ # down=True,
+ _is_launched_by_jobs_controller=True,
+ )
+ if log_file is None:
+ raise OSError('Log file is None')
+ with open(log_file, 'a', encoding='utf-8') as f:
+ await context_utils.to_thread(
+ sdk.stream_and_get,
+ request_id,
+ output_stream=f,
+ )
+ except asyncio.CancelledError:
+ if request_id:
+ req = await context_utils.to_thread(
+ sdk.api_cancel, request_id)
+ try:
+ await context_utils.to_thread(
+ sdk.get, req)
+ except Exception as e: # pylint: disable=broad-except
+ # we must still return a CancelledError
+ self._logger.error(
+ f'Failed to cancel the job: {e}')
+ raise
+ self._logger.info('Managed job cluster launched.')
  else:
- self.cluster_name = (
- serve_utils.get_next_cluster_name(
- self.pool, self.job_id))
+ self.cluster_name = await (context_utils.to_thread(
+ serve_utils.get_next_cluster_name, self.pool,
+ self.job_id))
  if self.cluster_name is None:
  raise exceptions.NoClusterLaunchedError(
  'No cluster name found in the pool.')
- job_id_on_pool_cluster, _ = execution.exec(
- self.dag, cluster_name=self.cluster_name)
+ request_id = None
+ try:
+ request_id = await context_utils.to_thread(
+ sdk.exec,
+ self.dag,
+ cluster_name=self.cluster_name,
+ )
+ job_id_on_pool_cluster, _ = (
+ await context_utils.to_thread(
+ sdk.get, request_id))
+ except asyncio.CancelledError:
+ if request_id:
+ req = await context_utils.to_thread(
+ sdk.api_cancel, request_id)
+ try:
+ await context_utils.to_thread(
+ sdk.get, req)
+ except Exception as e: # pylint: disable=broad-except
+ # we must still return a CancelledError
+ self._logger.error(
+ f'Failed to cancel the job: {e}')
+ raise
  assert job_id_on_pool_cluster is not None, (
  self.cluster_name, self.job_id)
  self.job_id_on_pool_cluster = job_id_on_pool_cluster
- state.set_job_id_on_pool_cluster(
+ await state.set_job_id_on_pool_cluster_async(
  self.job_id, job_id_on_pool_cluster)
- logger.info('Managed job cluster launched.')
+ self._logger.info('Managed job cluster launched.')
  except (exceptions.InvalidClusterNameError,
  exceptions.NoCloudAccessError,
  exceptions.ResourcesMismatchError) as e:
- logger.error('Failure happened before provisioning. '
- f'{common_utils.format_exception(e)}')
+ self._logger.error(
+ 'Failure happened before provisioning. '
+ f'{common_utils.format_exception(e)}')
  if raise_on_failure:
  raise exceptions.ProvisionPrechecksError(
  reasons=[e])
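The rewritten `_launch` body above also adds a cancellation path: if the coroutine is cancelled while a launch or exec request is in flight, it cancels the corresponding API request before re-raising. A distilled sketch of that pattern, again substituting the stdlib `asyncio.to_thread` for the package's `context_utils.to_thread`:

```python
import asyncio

from sky.client import sdk


async def launch_and_wait(dag, cluster_name: str) -> None:
    request_id = None
    try:
        # Submit the launch from a worker thread, then wait on the request.
        request_id = await asyncio.to_thread(sdk.launch, dag,
                                             cluster_name=cluster_name)
        await asyncio.to_thread(sdk.get, request_id)
    except asyncio.CancelledError:
        # If this coroutine is cancelled, also cancel the in-flight API
        # request before letting the CancelledError propagate.
        if request_id is not None:
            await asyncio.to_thread(sdk.api_cancel, request_id)
        raise
```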
@@ -384,28 +481,30 @@ class StrategyExecutor:
  reasons_str = '; '.join(
  common_utils.format_exception(err)
  for err in reasons)
- logger.error(
+ self._logger.error(
  'Failure happened before provisioning. '
  f'Failover reasons: {reasons_str}')
  if raise_on_failure:
  raise exceptions.ProvisionPrechecksError(
  reasons)
  return None
- logger.info('Failed to launch a cluster with error: '
- f'{common_utils.format_exception(e)})')
+ self._logger.info(
+ 'Failed to launch a cluster with error: '
+ f'{common_utils.format_exception(e)})')
  except Exception as e: # pylint: disable=broad-except
  # If the launch fails, it will be recovered by the
  # following code.
- logger.info('Failed to launch a cluster with error: '
- f'{common_utils.format_exception(e)})')
+ self._logger.info(
+ 'Failed to launch a cluster with error: '
+ f'{common_utils.format_exception(e)})')
  with ux_utils.enable_traceback():
- logger.info(
+ self._logger.info(
  f' Traceback: {traceback.format_exc()}')
  else: # No exception, the launch succeeds.
  # At this point, a sky.launch() has succeeded. Cluster
  # may be UP (no preemption since) or DOWN (newly
  # preempted).
- job_submitted_at = (
+ job_submitted_at = await (
  self._wait_until_job_starts_on_cluster())
  if job_submitted_at is not None:
  return job_submitted_at
@@ -413,7 +512,7 @@ class StrategyExecutor:
  # launch.
  # TODO(zhwu): log the unexpected error to usage
  # collection for future debugging.
- logger.info(
+ self._logger.info(
  'Failed to successfully submit the job to the '
  'launched cluster, due to unexpected submission '
  'errors or the cluster being preempted during '
@@ -421,7 +520,7 @@ class StrategyExecutor:

  # If we get here, the launch did not succeed. Tear down the
  # cluster and retry.
- self._cleanup_cluster()
+ await context_utils.to_thread(self._cleanup_cluster)
  if max_retry is not None and retry_cnt >= max_retry:
  # Retry forever if max_retry is None.
  if raise_on_failure:
@@ -444,15 +543,13 @@ class StrategyExecutor:

  except exceptions.NoClusterLaunchedError:
  # Update the status to PENDING during backoff.
- state.set_backoff_pending(self.job_id, self.task_id)
+ state.set_backoff_pending_async(self.job_id, self.task_id)
  # Calculate the backoff time and sleep.
- # We retry immediately for worker pool, since no sky.launch()
- # is called and the overhead is minimal.
  gap_seconds = (backoff.current_backoff()
  if self.pool is None else 1)
- logger.info('Retrying to launch the cluster in '
- f'{gap_seconds:.1f} seconds.')
- time.sleep(gap_seconds)
+ self._logger.info('Retrying to launch the cluster in '
+ f'{gap_seconds:.1f} seconds.')
+ await asyncio.sleep(gap_seconds)
  continue
  else:
  # The inner loop should either return or throw
@@ -478,26 +575,39 @@ class FailoverStrategyExecutor(StrategyExecutor):

  _MAX_RETRY_CNT = 240 # Retry for 4 hours.

- def __init__(self, cluster_name: Optional[str], backend: 'backends.Backend',
- task: 'task_lib.Task', max_restarts_on_errors: int,
- job_id: int, task_id: int, pool: Optional[str]) -> None:
+ def __init__(
+ self,
+ cluster_name: Optional[str],
+ backend: 'backends.Backend',
+ task: 'task_lib.Task',
+ max_restarts_on_errors: int,
+ job_id: int,
+ task_id: int,
+ job_logger: logging.Logger,
+ pool: Optional[str],
+ starting: Set[int],
+ starting_lock: asyncio.Lock,
+ starting_signal: asyncio.Condition,
+ ) -> None:
  super().__init__(cluster_name, backend, task, max_restarts_on_errors,
- job_id, task_id, pool)
+ job_id, task_id, job_logger, pool, starting,
+ starting_lock, starting_signal)
  # Note down the cloud/region of the launched cluster, so that we can
  # first retry in the same cloud/region. (Inside recover() we may not
  # rely on cluster handle, as it can be None if the cluster is
  # preempted.)
  self._launched_resources: Optional['resources.Resources'] = None

- def _launch(self,
- max_retry: Optional[int] = 3,
- raise_on_failure: bool = True,
- recovery: bool = False) -> Optional[float]:
- job_submitted_at = super()._launch(max_retry, raise_on_failure,
- recovery)
+ async def _launch(self,
+ max_retry: Optional[int] = 3,
+ raise_on_failure: bool = True,
+ recovery: bool = False) -> Optional[float]:
+ job_submitted_at = await super()._launch(max_retry, raise_on_failure,
+ recovery)
  if job_submitted_at is not None and self.cluster_name is not None:
  # Only record the cloud/region if the launch is successful.
- handle = global_user_state.get_handle_from_cluster_name(
+ handle = await context_utils.to_thread(
+ global_user_state.get_handle_from_cluster_name,
  self.cluster_name)
  assert isinstance(handle, backends.CloudVmRayResourceHandle), (
  'Cluster should be launched.', handle)
@@ -507,7 +617,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
  self._launched_resources = None
  return job_submitted_at

- def recover(self) -> float:
+ async def recover(self) -> float:
  # 1. Cancel the jobs and launch the cluster with the STOPPED status,
  # so that it will try on the current region first until timeout.
  # 2. Tear down the cluster, if the step 1 failed to launch the cluster.
@@ -515,7 +625,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
  # original user specification.

  # Step 1
- self._try_cancel_jobs()
+ await self._try_cancel_jobs()

  while True:
  # Add region constraint to the task, to retry on the same region
@@ -529,31 +639,32 @@ class FailoverStrategyExecutor(StrategyExecutor):
  cloud=launched_cloud, region=launched_region, zone=None)
  task.set_resources({new_resources})
  # Not using self.launch to avoid the retry until up logic.
- job_submitted_at = self._launch(raise_on_failure=False,
- recovery=True)
+ job_submitted_at = await self._launch(raise_on_failure=False,
+ recovery=True)
  # Restore the original dag, i.e. reset the region constraint.
  task.set_resources(original_resources)
  if job_submitted_at is not None:
  return job_submitted_at

  # Step 2
- logger.debug('Terminating unhealthy cluster and reset cloud '
- 'region.')
- self._cleanup_cluster()
+ self._logger.debug('Terminating unhealthy cluster and reset cloud '
+ 'region.')
+ await context_utils.to_thread(self._cleanup_cluster)

  # Step 3
- logger.debug('Relaunch the cluster without constraining to prior '
- 'cloud/region.')
+ self._logger.debug(
+ 'Relaunch the cluster without constraining to prior '
+ 'cloud/region.')
  # Not using self.launch to avoid the retry until up logic.
- job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
- raise_on_failure=False,
- recovery=True)
+ job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
+ raise_on_failure=False,
+ recovery=True)
  if job_submitted_at is None:
  # Failed to launch the cluster.
  gap_seconds = self.RETRY_INIT_GAP_SECONDS
- logger.info('Retrying to recover the cluster in '
- f'{gap_seconds:.1f} seconds.')
- time.sleep(gap_seconds)
+ self._logger.info('Retrying to recover the cluster in '
+ f'{gap_seconds:.1f} seconds.')
+ await asyncio.sleep(gap_seconds)
  continue

  return job_submitted_at
@@ -585,7 +696,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
  -> R1Z1 (success)
  """

- def recover(self) -> float:
+ async def recover(self) -> float:
  # 1. Terminate the current cluster
  # 2. Launch again by explicitly blocking the previously launched region
  # (this will failover through the entire search space except the
@@ -597,12 +708,14 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
  # task.resources.

  # Step 1
- logger.debug('Terminating unhealthy cluster and reset cloud region.')
- self._cleanup_cluster()
+ self._logger.debug(
+ 'Terminating unhealthy cluster and reset cloud region.')
+ await context_utils.to_thread(self._cleanup_cluster)

  # Step 2
- logger.debug('Relaunch the cluster skipping the previously launched '
- 'cloud/region.')
+ self._logger.debug(
+ 'Relaunch the cluster skipping the previously launched '
+ 'cloud/region.')
  if self._launched_resources is not None:
  task = self.dag.tasks[0]
  requested_resources = self._launched_resources
@@ -619,26 +732,35 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
  region=launched_region)
  }
  # Not using self.launch to avoid the retry until up logic.
- job_submitted_at = self._launch(raise_on_failure=False,
- recovery=True)
+ job_submitted_at = await self._launch(raise_on_failure=False,
+ recovery=True)
  task.blocked_resources = None
  if job_submitted_at is not None:
  return job_submitted_at

  while True:
  # Step 3
- logger.debug('Relaunch the cluster without constraining to prior '
- 'cloud/region.')
+ self._logger.debug(
+ 'Relaunch the cluster without constraining to prior '
+ 'cloud/region.')
  # Not using self.launch to avoid the retry until up logic.
- job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
- raise_on_failure=False,
- recovery=True)
+ job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
+ raise_on_failure=False,
+ recovery=True)
  if job_submitted_at is None:
  # Failed to launch the cluster.
  gap_seconds = self.RETRY_INIT_GAP_SECONDS
- logger.info('Retrying to recover the cluster in '
- f'{gap_seconds:.1f} seconds.')
- time.sleep(gap_seconds)
+ self._logger.info('Retrying to recover the cluster in '
+ f'{gap_seconds:.1f} seconds.')
+ await asyncio.sleep(gap_seconds)
  continue

  return job_submitted_at
+
+
+ def _get_logger_file(file_logger: logging.Logger) -> Optional[str]:
+ """Gets the file path that the logger writes to."""
+ for handler in file_logger.handlers:
+ if isinstance(handler, logging.FileHandler):
+ return handler.baseFilename
+ return None
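The new module-level helper `_get_logger_file` looks through a logger's handlers for a `logging.FileHandler` so that launch output streamed via `sdk.stream_and_get` can be appended to the same per-job log file. A small, hypothetical usage sketch (the `make_job_logger` helper and file name are not part of the package):

```python
import logging


def make_job_logger(job_id: int, log_path: str) -> logging.Logger:
    """Hypothetical per-job logger backed by its own file handler."""
    job_logger = logging.getLogger(f'managed_job.{job_id}')
    job_logger.setLevel(logging.INFO)
    job_logger.addHandler(logging.FileHandler(log_path))
    return job_logger


# _get_logger_file() above would recover the log path from the handler
# list, so further output can be appended to the same file.
job_logger = make_job_logger(7, 'job-7.log')
print(job_logger.handlers[0].baseFilename)
```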