skypilot-nightly 1.0.0.dev20251013__py3-none-any.whl → 1.0.0.dev20251014__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic.

Files changed (52)
  1. sky/__init__.py +2 -2
  2. sky/authentication.py +9 -2
  3. sky/backends/backend_utils.py +33 -25
  4. sky/backends/cloud_vm_ray_backend.py +3 -5
  5. sky/catalog/kubernetes_catalog.py +19 -25
  6. sky/client/cli/command.py +53 -19
  7. sky/client/sdk.py +13 -1
  8. sky/dashboard/out/404.html +1 -1
  9. sky/dashboard/out/_next/static/chunks/{webpack-ac3a34c8f9fef041.js → webpack-66f23594d38c7f16.js} +1 -1
  10. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  11. sky/dashboard/out/clusters/[cluster].html +1 -1
  12. sky/dashboard/out/clusters.html +1 -1
  13. sky/dashboard/out/config.html +1 -1
  14. sky/dashboard/out/index.html +1 -1
  15. sky/dashboard/out/infra/[context].html +1 -1
  16. sky/dashboard/out/infra.html +1 -1
  17. sky/dashboard/out/jobs/[job].html +1 -1
  18. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  19. sky/dashboard/out/jobs.html +1 -1
  20. sky/dashboard/out/users.html +1 -1
  21. sky/dashboard/out/volumes.html +1 -1
  22. sky/dashboard/out/workspace/new.html +1 -1
  23. sky/dashboard/out/workspaces/[name].html +1 -1
  24. sky/dashboard/out/workspaces.html +1 -1
  25. sky/jobs/controller.py +122 -145
  26. sky/jobs/recovery_strategy.py +59 -82
  27. sky/jobs/scheduler.py +5 -5
  28. sky/jobs/state.py +65 -21
  29. sky/jobs/utils.py +58 -22
  30. sky/metrics/utils.py +27 -6
  31. sky/provision/kubernetes/utils.py +44 -39
  32. sky/server/common.py +4 -2
  33. sky/server/requests/executor.py +3 -1
  34. sky/server/server.py +5 -0
  35. sky/sky_logging.py +0 -2
  36. sky/skylet/constants.py +22 -5
  37. sky/skylet/log_lib.py +0 -1
  38. sky/skylet/log_lib.pyi +1 -1
  39. sky/utils/common.py +2 -0
  40. sky/utils/context.py +57 -51
  41. sky/utils/context_utils.py +2 -2
  42. sky/utils/controller_utils.py +35 -8
  43. sky/utils/locks.py +20 -5
  44. sky/utils/subprocess_utils.py +4 -3
  45. {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/METADATA +36 -36
  46. {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/RECORD +52 -52
  47. /sky/dashboard/out/_next/static/{MtlDUf-nH1hhcy7xwbCj3 → 9Fek73R28lDp1A5J4N7g7}/_buildManifest.js +0 -0
  48. /sky/dashboard/out/_next/static/{MtlDUf-nH1hhcy7xwbCj3 → 9Fek73R28lDp1A5J4N7g7}/_ssgManifest.js +0 -0
  49. {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/WHEEL +0 -0
  50. {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/entry_points.txt +0 -0
  51. {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/licenses/LICENSE +0 -0
  52. {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251014.dist-info}/top_level.txt +0 -0
@@ -70,7 +70,6 @@ class StrategyExecutor:
  max_restarts_on_errors: int,
  job_id: int,
  task_id: int,
- job_logger: logging.Logger,
  pool: Optional[str],
  starting: Set[int],
  starting_lock: asyncio.Lock,
@@ -85,7 +84,6 @@ class StrategyExecutor:
  max_restarts_on_errors: Maximum number of restarts on errors.
  job_id: The ID of the job.
  task_id: The ID of the task.
- job_logger: Logger instance for this specific job.
  starting: Set of job IDs that are currently starting.
  starting_lock: Lock to synchronize starting jobs.
  starting_signal: Condition to signal when a job can start.
@@ -105,7 +103,6 @@ class StrategyExecutor:
  self.task_id = task_id
  self.pool = pool
  self.restart_cnt_on_failure = 0
- self._logger = job_logger
  self.job_id_on_pool_cluster: Optional[int] = None
  self.starting = starting
  self.starting_lock = starting_lock
@@ -119,7 +116,6 @@ class StrategyExecutor:
  task: 'task_lib.Task',
  job_id: int,
  task_id: int,
- job_logger: logging.Logger,
  pool: Optional[str],
  starting: Set[int],
  starting_lock: asyncio.Lock,
@@ -156,7 +152,7 @@ class StrategyExecutor:
  assert job_recovery_strategy is not None, job_recovery_name
  return job_recovery_strategy(cluster_name, backend, task,
  max_restarts_on_errors, job_id, task_id,
- job_logger, pool, starting, starting_lock,
+ pool, starting, starting_lock,
  starting_signal)

  async def launch(self) -> float:
@@ -224,7 +220,7 @@ class StrategyExecutor:
  **kwargs,
  _try_cancel_if_cluster_is_init=True,
  )
- self._logger.debug(f'sdk.cancel request ID: {request_id}')
+ logger.debug(f'sdk.cancel request ID: {request_id}')
  await context_utils.to_thread(
  sdk.get,
  request_id,
@@ -261,16 +257,15 @@ class StrategyExecutor:
  # loop.
  # TODO(zhwu): log the unexpected error to usage collection
  # for future debugging.
- self._logger.info(
- f'Unexpected exception: {e}\nFailed to get the '
- 'refresh the cluster status. Retrying.')
+ logger.info(f'Unexpected exception: {e}\nFailed to get the '
+ 'refresh the cluster status. Retrying.')
  continue
  if cluster_status != status_lib.ClusterStatus.UP:
  # The cluster can be preempted before the job is
  # launched.
  # Break to let the retry launch kick in.
- self._logger.info('The cluster is preempted before the job '
- 'is submitted.')
+ logger.info('The cluster is preempted before the job '
+ 'is submitted.')
  # TODO(zhwu): we should recover the preemption with the
  # recovery strategy instead of the current while loop.
  break
@@ -279,7 +274,6 @@ class StrategyExecutor:
  status = await managed_job_utils.get_job_status(
  self.backend,
  self.cluster_name,
- job_logger=self._logger,
  job_id=self.job_id_on_pool_cluster)
  except Exception as e: # pylint: disable=broad-except
  # If any unexpected error happens, retry the job checking
@@ -288,9 +282,8 @@ class StrategyExecutor:
  # get_job_status, so it should not happen here.
  # TODO(zhwu): log the unexpected error to usage collection
  # for future debugging.
- self._logger.info(
- f'Unexpected exception: {e}\nFailed to get the '
- 'job status. Retrying.')
+ logger.info(f'Unexpected exception: {e}\nFailed to get the '
+ 'job status. Retrying.')
  continue

  # Check the job status until it is not in initialized status
@@ -306,9 +299,8 @@ class StrategyExecutor:
  except Exception as e: # pylint: disable=broad-except
  # If we failed to get the job timestamp, we will retry
  # job checking loop.
- self._logger.info(
- f'Unexpected Exception: {e}\nFailed to get '
- 'the job start timestamp. Retrying.')
+ logger.info(f'Unexpected Exception: {e}\nFailed to get '
+ 'the job start timestamp. Retrying.')
  continue
  # Wait for the job to be started
  await asyncio.sleep(
@@ -370,7 +362,6 @@ class StrategyExecutor:
  self.starting,
  self.starting_lock,
  self.starting_signal,
- self._logger,
  ):
  # The job state may have been PENDING during backoff -
  # update to STARTING or RECOVERING.
@@ -394,21 +385,19 @@ class StrategyExecutor:
  for env_var in ENV_VARS_TO_CLEAR:
  vars_to_restore[env_var] = os.environ.pop(
  env_var, None)
- self._logger.debug('Cleared env var: '
- f'{env_var}')
- self._logger.debug('Env vars for api_start: '
- f'{os.environ}')
+ logger.debug('Cleared env var: '
+ f'{env_var}')
+ logger.debug('Env vars for api_start: '
+ f'{os.environ}')
  await context_utils.to_thread(sdk.api_start)
- self._logger.info('API server started.')
+ logger.info('API server started.')
  finally:
  for env_var, value in vars_to_restore.items():
  if value is not None:
- self._logger.debug(
- 'Restored env var: '
- f'{env_var}: {value}')
+ logger.debug('Restored env var: '
+ f'{env_var}: {value}')
  os.environ[env_var] = value

- log_file = _get_logger_file(self._logger)
  request_id = None
  try:
  request_id = await context_utils.to_thread(
@@ -429,31 +418,27 @@ class StrategyExecutor:
  # down=True,
  _is_launched_by_jobs_controller=True,
  )
- self._logger.debug('sdk.launch request ID: '
- f'{request_id}')
- if log_file is None:
- raise OSError('Log file is None')
- with open(log_file, 'a', encoding='utf-8') as f:
- await context_utils.to_thread(
- sdk.stream_and_get,
- request_id,
- output_stream=f,
- )
+ logger.debug('sdk.launch request ID: '
+ f'{request_id}')
+ await context_utils.to_thread(
+ sdk.stream_and_get,
+ request_id,
+ )
  except asyncio.CancelledError:
  if request_id:
  req = await context_utils.to_thread(
  sdk.api_cancel, request_id)
- self._logger.debug('sdk.api_cancel request '
- f'ID: {req}')
+ logger.debug('sdk.api_cancel request '
+ f'ID: {req}')
  try:
  await context_utils.to_thread(
  sdk.get, req)
  except Exception as e: # pylint: disable=broad-except
  # we must still return a CancelledError
- self._logger.error(
+ logger.error(
  f'Failed to cancel the job: {e}')
  raise
- self._logger.info('Managed job cluster launched.')
+ logger.info('Managed job cluster launched.')
  else:
  self.cluster_name = await (context_utils.to_thread(
  serve_utils.get_next_cluster_name, self.pool,
@@ -468,8 +453,8 @@ class StrategyExecutor:
  self.dag,
  cluster_name=self.cluster_name,
  )
- self._logger.debug('sdk.exec request ID: '
- f'{request_id}')
+ logger.debug('sdk.exec request ID: '
+ f'{request_id}')
  job_id_on_pool_cluster, _ = (
  await context_utils.to_thread(
  sdk.get, request_id))
@@ -477,14 +462,14 @@ class StrategyExecutor:
  if request_id:
  req = await context_utils.to_thread(
  sdk.api_cancel, request_id)
- self._logger.debug('sdk.api_cancel request '
- f'ID: {req}')
+ logger.debug('sdk.api_cancel request '
+ f'ID: {req}')
  try:
  await context_utils.to_thread(
  sdk.get, req)
  except Exception as e: # pylint: disable=broad-except
  # we must still return a CancelledError
- self._logger.error(
+ logger.error(
  f'Failed to cancel the job: {e}')
  raise
  assert job_id_on_pool_cluster is not None, (
@@ -492,15 +477,14 @@ class StrategyExecutor:
  self.job_id_on_pool_cluster = job_id_on_pool_cluster
  await state.set_job_id_on_pool_cluster_async(
  self.job_id, job_id_on_pool_cluster)
- self._logger.info('Managed job cluster launched.')
+ logger.info('Managed job cluster launched.')
  except (exceptions.InvalidClusterNameError,
  exceptions.NoCloudAccessError,
  exceptions.ResourcesMismatchError,
  exceptions.StorageSpecError,
  exceptions.StorageError) as e:
- self._logger.error(
- 'Failure happened before provisioning. '
- f'{common_utils.format_exception(e)}')
+ logger.error('Failure happened before provisioning. '
+ f'{common_utils.format_exception(e)}')
  if raise_on_failure:
  raise exceptions.ProvisionPrechecksError(
  reasons=[e])
@@ -528,24 +512,22 @@ class StrategyExecutor:
  reasons_str = '; '.join(
  common_utils.format_exception(err)
  for err in reasons)
- self._logger.error(
+ logger.error(
  'Failure happened before provisioning. '
  f'Failover reasons: {reasons_str}')
  if raise_on_failure:
  raise exceptions.ProvisionPrechecksError(
  reasons)
  return None
- self._logger.info(
- 'Failed to launch a cluster with error: '
- f'{common_utils.format_exception(e)})')
+ logger.info('Failed to launch a cluster with error: '
+ f'{common_utils.format_exception(e)})')
  except Exception as e: # pylint: disable=broad-except
  # If the launch fails, it will be recovered by the
  # following code.
- self._logger.info(
- 'Failed to launch a cluster with error: '
- f'{common_utils.format_exception(e)})')
+ logger.info('Failed to launch a cluster with error: '
+ f'{common_utils.format_exception(e)})')
  with ux_utils.enable_traceback():
- self._logger.info(
+ logger.info(
  f' Traceback: {traceback.format_exc()}')
  else: # No exception, the launch succeeds.
  # At this point, a sky.launch() has succeeded. Cluster
@@ -559,7 +541,7 @@ class StrategyExecutor:
  # launch.
  # TODO(zhwu): log the unexpected error to usage
  # collection for future debugging.
- self._logger.info(
+ logger.info(
  'Failed to successfully submit the job to the '
  'launched cluster, due to unexpected submission '
  'errors or the cluster being preempted during '
@@ -594,8 +576,8 @@ class StrategyExecutor:
  # Calculate the backoff time and sleep.
  gap_seconds = (backoff.current_backoff()
  if self.pool is None else 1)
- self._logger.info('Retrying to launch the cluster in '
- f'{gap_seconds:.1f} seconds.')
+ logger.info('Retrying to launch the cluster in '
+ f'{gap_seconds:.1f} seconds.')
  await asyncio.sleep(gap_seconds)
  continue
  else:
@@ -630,15 +612,14 @@ class FailoverStrategyExecutor(StrategyExecutor):
  max_restarts_on_errors: int,
  job_id: int,
  task_id: int,
- job_logger: logging.Logger,
  pool: Optional[str],
  starting: Set[int],
  starting_lock: asyncio.Lock,
  starting_signal: asyncio.Condition,
  ) -> None:
  super().__init__(cluster_name, backend, task, max_restarts_on_errors,
- job_id, task_id, job_logger, pool, starting,
- starting_lock, starting_signal)
+ job_id, task_id, pool, starting, starting_lock,
+ starting_signal)
  # Note down the cloud/region of the launched cluster, so that we can
  # first retry in the same cloud/region. (Inside recover() we may not
  # rely on cluster handle, as it can be None if the cluster is
@@ -694,14 +675,13 @@ class FailoverStrategyExecutor(StrategyExecutor):
  return job_submitted_at

  # Step 2
- self._logger.debug('Terminating unhealthy cluster and reset cloud '
- 'region.')
+ logger.debug('Terminating unhealthy cluster and reset cloud '
+ 'region.')
  await context_utils.to_thread(self._cleanup_cluster)

  # Step 3
- self._logger.debug(
- 'Relaunch the cluster without constraining to prior '
- 'cloud/region.')
+ logger.debug('Relaunch the cluster without constraining to prior '
+ 'cloud/region.')
  # Not using self.launch to avoid the retry until up logic.
  job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
  raise_on_failure=False,
@@ -709,8 +689,8 @@
  if job_submitted_at is None:
  # Failed to launch the cluster.
  gap_seconds = self.RETRY_INIT_GAP_SECONDS
- self._logger.info('Retrying to recover the cluster in '
- f'{gap_seconds:.1f} seconds.')
+ logger.info('Retrying to recover the cluster in '
+ f'{gap_seconds:.1f} seconds.')
  await asyncio.sleep(gap_seconds)
  continue

@@ -755,14 +735,12 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
  # task.resources.

  # Step 1
- self._logger.debug(
- 'Terminating unhealthy cluster and reset cloud region.')
+ logger.debug('Terminating unhealthy cluster and reset cloud region.')
  await context_utils.to_thread(self._cleanup_cluster)

  # Step 2
- self._logger.debug(
- 'Relaunch the cluster skipping the previously launched '
- 'cloud/region.')
+ logger.debug('Relaunch the cluster skipping the previously launched '
+ 'cloud/region.')
  if self._launched_resources is not None:
  task = self.dag.tasks[0]
  requested_resources = self._launched_resources
@@ -787,9 +765,8 @@

  while True:
  # Step 3
- self._logger.debug(
- 'Relaunch the cluster without constraining to prior '
- 'cloud/region.')
+ logger.debug('Relaunch the cluster without constraining to prior '
+ 'cloud/region.')
  # Not using self.launch to avoid the retry until up logic.
  job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
  raise_on_failure=False,
@@ -797,8 +774,8 @@
  if job_submitted_at is None:
  # Failed to launch the cluster.
  gap_seconds = self.RETRY_INIT_GAP_SECONDS
- self._logger.info('Retrying to recover the cluster in '
- f'{gap_seconds:.1f} seconds.')
+ logger.info('Retrying to recover the cluster in '
+ f'{gap_seconds:.1f} seconds.')
  await asyncio.sleep(gap_seconds)
  continue
sky/jobs/scheduler.py CHANGED
@@ -168,11 +168,12 @@ def start_controller() -> None:
  logs_dir = os.path.expanduser(
  managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
  os.makedirs(logs_dir, exist_ok=True)
- log_path = os.path.join(logs_dir, f'controller_{uuid.uuid4()}.log')
+ controller_uuid = str(uuid.uuid4())
+ log_path = os.path.join(logs_dir, f'controller_{controller_uuid}.log')

  activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
  run_controller_cmd = (f'{sys.executable} -u -m'
- 'sky.jobs.controller')
+ f'sky.jobs.controller {controller_uuid}')

  run_cmd = (f'{activate_python_env_cmd}'
  f'{run_controller_cmd}')
@@ -309,7 +310,6 @@ async def scheduled_launch(
  starting: Set[int],
  starting_lock: asyncio.Lock,
  starting_signal: asyncio.Condition,
- job_logger: 'logging.Logger',
  ):
  """Launch as part of an ongoing job.

@@ -347,10 +347,10 @@
  starting_count = len(starting)
  if starting_count < LAUNCHES_PER_WORKER:
  break
- job_logger.info('Too many jobs starting, waiting for a slot')
+ logger.info('Too many jobs starting, waiting for a slot')
  await starting_signal.wait()

- job_logger.info(f'Starting job {job_id}')
+ logger.info(f'Starting job {job_id}')

  async with starting_lock:
  starting.add(job_id)
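start_controller() now records the controller's UUID once, reuses it in the controller log file name, and appends it to the `sky.jobs.controller` invocation. The controller-side handling is not part of this hunk, so the following is only a hedged sketch of how such an argument could be consumed to rebuild the same log path; the directory constant and the argv handling below are assumptions for illustration, not the actual sky/jobs/controller.py code.

import os
import sys

# Hypothetical stand-in for managed_job_constants.JOBS_CONTROLLER_LOGS_DIR.
JOBS_CONTROLLER_LOGS_DIR = '~/.sky/jobs_controller_logs'


def controller_log_path(controller_uuid: str) -> str:
    """Rebuild the log path the scheduler created for this controller UUID."""
    logs_dir = os.path.expanduser(JOBS_CONTROLLER_LOGS_DIR)
    return os.path.join(logs_dir, f'controller_{controller_uuid}.log')


if __name__ == '__main__':
    # The scheduler appends the UUID as the first positional argument.
    controller_uuid = sys.argv[1] if len(sys.argv) > 1 else 'unknown'
    print(controller_log_path(controller_uuid))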
sky/jobs/state.py CHANGED
@@ -280,6 +280,27 @@ def _init_db(func):
  return wrapper


+ async def _describe_task_transition_failure(session: sql_async.AsyncSession,
+ job_id: int, task_id: int) -> str:
+ """Return a human-readable description when a task transition fails."""
+ details = 'Couldn\'t fetch the task details.'
+ try:
+ debug_result = await session.execute(
+ sqlalchemy.select(spot_table.c.status, spot_table.c.end_at).where(
+ sqlalchemy.and_(spot_table.c.spot_job_id == job_id,
+ spot_table.c.task_id == task_id)))
+ rows = debug_result.mappings().all()
+ details = (f'{len(rows)} rows matched job {job_id} and task '
+ f'{task_id}.')
+ for row in rows:
+ status = row['status']
+ end_at = row['end_at']
+ details += f' Status: {status}, End time: {end_at}.'
+ except Exception as exc: # pylint: disable=broad-except
+ details += f' Error fetching task details: {exc}'
+ return details
+
+
  # job_duration is the time a job actually runs (including the
  # setup duration) before last_recover, excluding the provision
  # and recovery time.
@@ -758,9 +779,12 @@ async def set_backoff_pending_async(job_id: int, task_id: int):
  count = result.rowcount
  await session.commit()
  if count != 1:
- raise exceptions.ManagedJobStatusError(
- 'Failed to set the task back to pending. '
- f'({count} rows updated)')
+ details = await _describe_task_transition_failure(
+ session, job_id, task_id)
+ message = ('Failed to set the task back to pending. '
+ f'({count} rows updated. {details})')
+ logger.error(message)
+ raise exceptions.ManagedJobStatusError(message)
  # Do not call callback_func here, as we don't use the callback for PENDING.


@@ -789,9 +813,12 @@ async def set_restarting_async(job_id: int, task_id: int, recovering: bool):
  await session.commit()
  logger.debug(f'back to {target_status}')
  if count != 1:
- raise exceptions.ManagedJobStatusError(
- f'Failed to set the task back to {target_status}. '
- f'({count} rows updated)')
+ details = await _describe_task_transition_failure(
+ session, job_id, task_id)
+ message = (f'Failed to set the task back to {target_status}. '
+ f'({count} rows updated. {details})')
+ logger.error(message)
+ raise exceptions.ManagedJobStatusError(message)
  # Do not call callback_func here, as it should only be invoked for the
  # initial (pre-`set_backoff_pending`) transition to STARTING or RECOVERING.

@@ -1644,9 +1671,12 @@ async def set_starting_async(job_id: int, task_id: int, run_timestamp: str,
  count = result.rowcount
  await session.commit()
  if count != 1:
- raise exceptions.ManagedJobStatusError(
- 'Failed to set the task to starting. '
- f'({count} rows updated)')
+ details = await _describe_task_transition_failure(
+ session, job_id, task_id)
+ message = ('Failed to set the task to starting. '
+ f'({count} rows updated. {details})')
+ logger.error(message)
+ raise exceptions.ManagedJobStatusError(message)
  await callback_func('SUBMITTED')
  await callback_func('STARTING')

@@ -1676,9 +1706,12 @@ async def set_started_async(job_id: int, task_id: int, start_time: float,
  count = result.rowcount
  await session.commit()
  if count != 1:
- raise exceptions.ManagedJobStatusError(
- f'Failed to set the task to started. '
- f'({count} rows updated)')
+ details = await _describe_task_transition_failure(
+ session, job_id, task_id)
+ message = (f'Failed to set the task to started. '
+ f'({count} rows updated. {details})')
+ logger.error(message)
+ raise exceptions.ManagedJobStatusError(message)
  await callback_func('STARTED')


@@ -1733,9 +1766,14 @@ async def set_recovering_async(job_id: int, task_id: int,
  count = result.rowcount
  await session.commit()
  if count != 1:
- raise exceptions.ManagedJobStatusError(
- f'Failed to set the task to recovering. '
- f'({count} rows updated)')
+ details = await _describe_task_transition_failure(
+ session, job_id, task_id)
+ message = ('Failed to set the task to recovering with '
+ 'force_transit_to_recovering='
+ f'{force_transit_to_recovering}. '
+ f'({count} rows updated. {details})')
+ logger.error(message)
+ raise exceptions.ManagedJobStatusError(message)
  await callback_func('RECOVERING')


@@ -1761,9 +1799,12 @@ async def set_recovered_async(job_id: int, task_id: int, recovered_time: float,
  count = result.rowcount
  await session.commit()
  if count != 1:
- raise exceptions.ManagedJobStatusError(
- f'Failed to set the task to recovered. '
- f'({count} rows updated)')
+ details = await _describe_task_transition_failure(
+ session, job_id, task_id)
+ message = (f'Failed to set the task to recovered. '
+ f'({count} rows updated. {details})')
+ logger.error(message)
+ raise exceptions.ManagedJobStatusError(message)
  logger.info('==== Recovered. ====')
  await callback_func('RECOVERED')

@@ -1788,9 +1829,12 @@ async def set_succeeded_async(job_id: int, task_id: int, end_time: float,
  count = result.rowcount
  await session.commit()
  if count != 1:
- raise exceptions.ManagedJobStatusError(
- f'Failed to set the task to succeeded. '
- f'({count} rows updated)')
+ details = await _describe_task_transition_failure(
+ session, job_id, task_id)
+ message = (f'Failed to set the task to succeeded. '
+ f'({count} rows updated. {details})')
+ logger.error(message)
+ raise exceptions.ManagedJobStatusError(message)
  await callback_func('SUCCEEDED')
  logger.info('Job succeeded.')
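Each of these status setters follows the same guarded-transition pattern: a single UPDATE whose WHERE clause only matches the allowed prior states, followed by a rowcount check; on a mismatch, the new `_describe_task_transition_failure()` helper is consulted so the raised `ManagedJobStatusError` (now also logged) describes what the row actually looked like. The sketch below illustrates that pattern with SQLAlchemy's async API; the table definition, status strings, and function name are illustrative assumptions, not SkyPilot's real schema or code.

import sqlalchemy
from sqlalchemy.ext.asyncio import AsyncSession

metadata = sqlalchemy.MetaData()
# Illustrative table; the real spot_table has many more columns.
spot_table = sqlalchemy.Table(
    'spot', metadata,
    sqlalchemy.Column('spot_job_id', sqlalchemy.Integer),
    sqlalchemy.Column('task_id', sqlalchemy.Integer),
    sqlalchemy.Column('status', sqlalchemy.Text),
    sqlalchemy.Column('end_at', sqlalchemy.Float))


async def set_started_sketch(session: AsyncSession, job_id: int,
                             task_id: int) -> None:
    result = await session.execute(
        sqlalchemy.update(spot_table).where(
            sqlalchemy.and_(
                spot_table.c.spot_job_id == job_id,
                spot_table.c.task_id == task_id,
                # Guard: only a task still in STARTING may become RUNNING.
                spot_table.c.status == 'STARTING')).values(status='RUNNING'))
    await session.commit()
    if result.rowcount != 1:
        # The real code calls _describe_task_transition_failure() here to
        # append the matched rows' status/end_at before logging and raising.
        raise RuntimeError('Failed to set the task to started. '
                           f'({result.rowcount} rows updated)')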