skypilot-nightly 1.0.0.dev20251013__py3-none-any.whl → 1.0.0.dev20251015__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +2 -2
- sky/authentication.py +9 -2
- sky/backends/backend_utils.py +62 -40
- sky/backends/cloud_vm_ray_backend.py +8 -6
- sky/catalog/kubernetes_catalog.py +19 -25
- sky/client/cli/command.py +53 -19
- sky/client/sdk.py +13 -1
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/jobs/controller.py +122 -145
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +5 -5
- sky/jobs/state.py +65 -21
- sky/jobs/utils.py +58 -22
- sky/metrics/utils.py +27 -6
- sky/provision/common.py +2 -0
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/instance.py +34 -10
- sky/provision/kubernetes/utils.py +53 -39
- sky/server/common.py +4 -2
- sky/server/requests/executor.py +3 -1
- sky/server/requests/preconditions.py +2 -4
- sky/server/requests/requests.py +13 -23
- sky/server/server.py +5 -0
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +22 -5
- sky/skylet/log_lib.py +0 -1
- sky/skylet/log_lib.pyi +1 -1
- sky/utils/asyncio_utils.py +18 -0
- sky/utils/common.py +2 -0
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +2 -2
- sky/utils/controller_utils.py +35 -8
- sky/utils/locks.py +20 -5
- sky/utils/subprocess_utils.py +4 -3
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/METADATA +38 -37
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/RECORD +57 -56
- /sky/dashboard/out/_next/static/{MtlDUf-nH1hhcy7xwbCj3 → -bih7JVStsXyeasac-dvQ}/_buildManifest.js +0 -0
- /sky/dashboard/out/_next/static/{MtlDUf-nH1hhcy7xwbCj3 → -bih7JVStsXyeasac-dvQ}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251013.dist-info → skypilot_nightly-1.0.0.dev20251015.dist-info}/top_level.txt +0 -0
sky/jobs/recovery_strategy.py
CHANGED
@@ -70,7 +70,6 @@ class StrategyExecutor:
  max_restarts_on_errors: int,
  job_id: int,
  task_id: int,
- job_logger: logging.Logger,
  pool: Optional[str],
  starting: Set[int],
  starting_lock: asyncio.Lock,

@@ -85,7 +84,6 @@ class StrategyExecutor:
  max_restarts_on_errors: Maximum number of restarts on errors.
  job_id: The ID of the job.
  task_id: The ID of the task.
- job_logger: Logger instance for this specific job.
  starting: Set of job IDs that are currently starting.
  starting_lock: Lock to synchronize starting jobs.
  starting_signal: Condition to signal when a job can start.

@@ -105,7 +103,6 @@ class StrategyExecutor:
  self.task_id = task_id
  self.pool = pool
  self.restart_cnt_on_failure = 0
- self._logger = job_logger
  self.job_id_on_pool_cluster: Optional[int] = None
  self.starting = starting
  self.starting_lock = starting_lock

@@ -119,7 +116,6 @@ class StrategyExecutor:
  task: 'task_lib.Task',
  job_id: int,
  task_id: int,
- job_logger: logging.Logger,
  pool: Optional[str],
  starting: Set[int],
  starting_lock: asyncio.Lock,

@@ -156,7 +152,7 @@ class StrategyExecutor:
  assert job_recovery_strategy is not None, job_recovery_name
  return job_recovery_strategy(cluster_name, backend, task,
  max_restarts_on_errors, job_id, task_id,
- (removed line not shown in this view)
+ pool, starting, starting_lock,
  starting_signal)

  async def launch(self) -> float:

@@ -224,7 +220,7 @@ class StrategyExecutor:
  **kwargs,
  _try_cancel_if_cluster_is_init=True,
  )
- (removed line not shown in this view)
+ logger.debug(f'sdk.cancel request ID: {request_id}')
  await context_utils.to_thread(
  sdk.get,
  request_id,

@@ -261,16 +257,15 @@ class StrategyExecutor:
  # loop.
  # TODO(zhwu): log the unexpected error to usage collection
  # for future debugging.
- (2 removed lines not shown in this view)
- 'refresh the cluster status. Retrying.')
+ logger.info(f'Unexpected exception: {e}\nFailed to get the '
+ 'refresh the cluster status. Retrying.')
  continue
  if cluster_status != status_lib.ClusterStatus.UP:
  # The cluster can be preempted before the job is
  # launched.
  # Break to let the retry launch kick in.
- (2 removed lines not shown in this view)
+ logger.info('The cluster is preempted before the job '
+ 'is submitted.')
  # TODO(zhwu): we should recover the preemption with the
  # recovery strategy instead of the current while loop.
  break

@@ -279,7 +274,6 @@ class StrategyExecutor:
  status = await managed_job_utils.get_job_status(
  self.backend,
  self.cluster_name,
- job_logger=self._logger,
  job_id=self.job_id_on_pool_cluster)
  except Exception as e: # pylint: disable=broad-except
  # If any unexpected error happens, retry the job checking

@@ -288,9 +282,8 @@ class StrategyExecutor:
  # get_job_status, so it should not happen here.
  # TODO(zhwu): log the unexpected error to usage collection
  # for future debugging.
- (2 removed lines not shown in this view)
- 'job status. Retrying.')
+ logger.info(f'Unexpected exception: {e}\nFailed to get the '
+ 'job status. Retrying.')
  continue

  # Check the job status until it is not in initialized status

@@ -306,9 +299,8 @@ class StrategyExecutor:
  except Exception as e: # pylint: disable=broad-except
  # If we failed to get the job timestamp, we will retry
  # job checking loop.
- (2 removed lines not shown in this view)
- 'the job start timestamp. Retrying.')
+ logger.info(f'Unexpected Exception: {e}\nFailed to get '
+ 'the job start timestamp. Retrying.')
  continue
  # Wait for the job to be started
  await asyncio.sleep(

@@ -370,7 +362,6 @@ class StrategyExecutor:
  self.starting,
  self.starting_lock,
  self.starting_signal,
- self._logger,
  ):
  # The job state may have been PENDING during backoff -
  # update to STARTING or RECOVERING.

@@ -394,21 +385,19 @@ class StrategyExecutor:
  for env_var in ENV_VARS_TO_CLEAR:
  vars_to_restore[env_var] = os.environ.pop(
  env_var, None)
- (4 removed lines not shown in this view)
+ logger.debug('Cleared env var: '
+ f'{env_var}')
+ logger.debug('Env vars for api_start: '
+ f'{os.environ}')
  await context_utils.to_thread(sdk.api_start)
- (removed line not shown in this view)
+ logger.info('API server started.')
  finally:
  for env_var, value in vars_to_restore.items():
  if value is not None:
- (2 removed lines not shown in this view)
- f'{env_var}: {value}')
+ logger.debug('Restored env var: '
+ f'{env_var}: {value}')
  os.environ[env_var] = value

- log_file = _get_logger_file(self._logger)
  request_id = None
  try:
  request_id = await context_utils.to_thread(

@@ -429,31 +418,27 @@ class StrategyExecutor:
  # down=True,
  _is_launched_by_jobs_controller=True,
  )
- (6 removed lines not shown in this view)
- sdk.stream_and_get,
- request_id,
- output_stream=f,
- )
+ logger.debug('sdk.launch request ID: '
+ f'{request_id}')
+ await context_utils.to_thread(
+ sdk.stream_and_get,
+ request_id,
+ )
  except asyncio.CancelledError:
  if request_id:
  req = await context_utils.to_thread(
  sdk.api_cancel, request_id)
- (2 removed lines not shown in this view)
+ logger.debug('sdk.api_cancel request '
+ f'ID: {req}')
  try:
  await context_utils.to_thread(
  sdk.get, req)
  except Exception as e: # pylint: disable=broad-except
  # we must still return a CancelledError
- (removed line not shown in this view)
+ logger.error(
  f'Failed to cancel the job: {e}')
  raise
- (removed line not shown in this view)
+ logger.info('Managed job cluster launched.')
  else:
  self.cluster_name = await (context_utils.to_thread(
  serve_utils.get_next_cluster_name, self.pool,

@@ -468,8 +453,8 @@ class StrategyExecutor:
  self.dag,
  cluster_name=self.cluster_name,
  )
- (2 removed lines not shown in this view)
+ logger.debug('sdk.exec request ID: '
+ f'{request_id}')
  job_id_on_pool_cluster, _ = (
  await context_utils.to_thread(
  sdk.get, request_id))

@@ -477,14 +462,14 @@ class StrategyExecutor:
  if request_id:
  req = await context_utils.to_thread(
  sdk.api_cancel, request_id)
- (2 removed lines not shown in this view)
+ logger.debug('sdk.api_cancel request '
+ f'ID: {req}')
  try:
  await context_utils.to_thread(
  sdk.get, req)
  except Exception as e: # pylint: disable=broad-except
  # we must still return a CancelledError
- (removed line not shown in this view)
+ logger.error(
  f'Failed to cancel the job: {e}')
  raise
  assert job_id_on_pool_cluster is not None, (

@@ -492,15 +477,14 @@ class StrategyExecutor:
  self.job_id_on_pool_cluster = job_id_on_pool_cluster
  await state.set_job_id_on_pool_cluster_async(
  self.job_id, job_id_on_pool_cluster)
- (removed line not shown in this view)
+ logger.info('Managed job cluster launched.')
  except (exceptions.InvalidClusterNameError,
  exceptions.NoCloudAccessError,
  exceptions.ResourcesMismatchError,
  exceptions.StorageSpecError,
  exceptions.StorageError) as e:
- (2 removed lines not shown in this view)
- f'{common_utils.format_exception(e)}')
+ logger.error('Failure happened before provisioning. '
+ f'{common_utils.format_exception(e)}')
  if raise_on_failure:
  raise exceptions.ProvisionPrechecksError(
  reasons=[e])

@@ -528,24 +512,22 @@ class StrategyExecutor:
  reasons_str = '; '.join(
  common_utils.format_exception(err)
  for err in reasons)
- (removed line not shown in this view)
+ logger.error(
  'Failure happened before provisioning. '
  f'Failover reasons: {reasons_str}')
  if raise_on_failure:
  raise exceptions.ProvisionPrechecksError(
  reasons)
  return None
- (2 removed lines not shown in this view)
- f'{common_utils.format_exception(e)})')
+ logger.info('Failed to launch a cluster with error: '
+ f'{common_utils.format_exception(e)})')
  except Exception as e: # pylint: disable=broad-except
  # If the launch fails, it will be recovered by the
  # following code.
- (2 removed lines not shown in this view)
- f'{common_utils.format_exception(e)})')
+ logger.info('Failed to launch a cluster with error: '
+ f'{common_utils.format_exception(e)})')
  with ux_utils.enable_traceback():
- (removed line not shown in this view)
+ logger.info(
  f' Traceback: {traceback.format_exc()}')
  else: # No exception, the launch succeeds.
  # At this point, a sky.launch() has succeeded. Cluster

@@ -559,7 +541,7 @@ class StrategyExecutor:
  # launch.
  # TODO(zhwu): log the unexpected error to usage
  # collection for future debugging.
- (removed line not shown in this view)
+ logger.info(
  'Failed to successfully submit the job to the '
  'launched cluster, due to unexpected submission '
  'errors or the cluster being preempted during '

@@ -594,8 +576,8 @@ class StrategyExecutor:
  # Calculate the backoff time and sleep.
  gap_seconds = (backoff.current_backoff()
  if self.pool is None else 1)
- (2 removed lines not shown in this view)
+ logger.info('Retrying to launch the cluster in '
+ f'{gap_seconds:.1f} seconds.')
  await asyncio.sleep(gap_seconds)
  continue
  else:

@@ -630,15 +612,14 @@ class FailoverStrategyExecutor(StrategyExecutor):
  max_restarts_on_errors: int,
  job_id: int,
  task_id: int,
- job_logger: logging.Logger,
  pool: Optional[str],
  starting: Set[int],
  starting_lock: asyncio.Lock,
  starting_signal: asyncio.Condition,
  ) -> None:
  super().__init__(cluster_name, backend, task, max_restarts_on_errors,
- job_id, task_id,
- (removed line not shown in this view)
+ job_id, task_id, pool, starting, starting_lock,
+ starting_signal)
  # Note down the cloud/region of the launched cluster, so that we can
  # first retry in the same cloud/region. (Inside recover() we may not
  # rely on cluster handle, as it can be None if the cluster is

@@ -694,14 +675,13 @@ class FailoverStrategyExecutor(StrategyExecutor):
  return job_submitted_at

  # Step 2
- (2 removed lines not shown in this view)
+ logger.debug('Terminating unhealthy cluster and reset cloud '
+ 'region.')
  await context_utils.to_thread(self._cleanup_cluster)

  # Step 3
- (2 removed lines not shown in this view)
- 'cloud/region.')
+ logger.debug('Relaunch the cluster without constraining to prior '
+ 'cloud/region.')
  # Not using self.launch to avoid the retry until up logic.
  job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
  raise_on_failure=False,

@@ -709,8 +689,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
  if job_submitted_at is None:
  # Failed to launch the cluster.
  gap_seconds = self.RETRY_INIT_GAP_SECONDS
- (2 removed lines not shown in this view)
+ logger.info('Retrying to recover the cluster in '
+ f'{gap_seconds:.1f} seconds.')
  await asyncio.sleep(gap_seconds)
  continue

@@ -755,14 +735,12 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
  # task.resources.

  # Step 1
- (removed line not shown in this view)
- 'Terminating unhealthy cluster and reset cloud region.')
+ logger.debug('Terminating unhealthy cluster and reset cloud region.')
  await context_utils.to_thread(self._cleanup_cluster)

  # Step 2
- (2 removed lines not shown in this view)
- 'cloud/region.')
+ logger.debug('Relaunch the cluster skipping the previously launched '
+ 'cloud/region.')
  if self._launched_resources is not None:
  task = self.dag.tasks[0]
  requested_resources = self._launched_resources

@@ -787,9 +765,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):

  while True:
  # Step 3
- (2 removed lines not shown in this view)
- 'cloud/region.')
+ logger.debug('Relaunch the cluster without constraining to prior '
+ 'cloud/region.')
  # Not using self.launch to avoid the retry until up logic.
  job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
  raise_on_failure=False,

@@ -797,8 +774,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
  if job_submitted_at is None:
  # Failed to launch the cluster.
  gap_seconds = self.RETRY_INIT_GAP_SECONDS
- (2 removed lines not shown in this view)
+ logger.info('Retrying to recover the cluster in '
+ f'{gap_seconds:.1f} seconds.')
  await asyncio.sleep(gap_seconds)
  continue
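Taken together, the recovery_strategy.py hunks remove the per-job job_logger parameter (and the stored self._logger attribute) from StrategyExecutor and its subclasses, routing all messages through the module-level logger instead. A minimal sketch of that pattern, assuming the standard library's logging.getLogger stands in for SkyPilot's own logger setup:

import logging

# Module-level logger; the real module presumably configures this through
# SkyPilot's logging helpers, so treat this setup as an assumption.
logger = logging.getLogger(__name__)


class StrategyExecutorSketch:
    """Hypothetical, trimmed-down stand-in for StrategyExecutor."""

    # Before this release, __init__ also took `job_logger: logging.Logger`
    # and stored it as `self._logger`.
    def __init__(self, job_id: int, task_id: int) -> None:
        self.job_id = job_id
        self.task_id = task_id

    def report_retry(self, gap_seconds: float) -> None:
        # Calls that previously went through `self._logger` now use the
        # shared module-level `logger`.
        logger.info('Retrying to launch the cluster in '
                    f'{gap_seconds:.1f} seconds.')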
sky/jobs/scheduler.py
CHANGED
@@ -168,11 +168,12 @@ def start_controller() -> None:
  logs_dir = os.path.expanduser(
  managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
  os.makedirs(logs_dir, exist_ok=True)
- (removed line not shown in this view)
+ controller_uuid = str(uuid.uuid4())
+ log_path = os.path.join(logs_dir, f'controller_{controller_uuid}.log')

  activate_python_env_cmd = (f'{constants.ACTIVATE_SKY_REMOTE_PYTHON_ENV};')
  run_controller_cmd = (f'{sys.executable} -u -m'
- 'sky.jobs.controller')
+ f'sky.jobs.controller {controller_uuid}')

  run_cmd = (f'{activate_python_env_cmd}'
  f'{run_controller_cmd}')

@@ -309,7 +310,6 @@ async def scheduled_launch(
  starting: Set[int],
  starting_lock: asyncio.Lock,
  starting_signal: asyncio.Condition,
- job_logger: 'logging.Logger',
  ):
  """Launch as part of an ongoing job.

@@ -347,10 +347,10 @@ async def scheduled_launch(
  starting_count = len(starting)
  if starting_count < LAUNCHES_PER_WORKER:
  break
- (removed line not shown in this view)
+ logger.info('Too many jobs starting, waiting for a slot')
  await starting_signal.wait()

- (removed line not shown in this view)
+ logger.info(f'Starting job {job_id}')

  async with starting_lock:
  starting.add(job_id)
sky/jobs/state.py
CHANGED
@@ -280,6 +280,27 @@ def _init_db(func):
  return wrapper


+ async def _describe_task_transition_failure(session: sql_async.AsyncSession,
+ job_id: int, task_id: int) -> str:
+ """Return a human-readable description when a task transition fails."""
+ details = 'Couldn\'t fetch the task details.'
+ try:
+ debug_result = await session.execute(
+ sqlalchemy.select(spot_table.c.status, spot_table.c.end_at).where(
+ sqlalchemy.and_(spot_table.c.spot_job_id == job_id,
+ spot_table.c.task_id == task_id)))
+ rows = debug_result.mappings().all()
+ details = (f'{len(rows)} rows matched job {job_id} and task '
+ f'{task_id}.')
+ for row in rows:
+ status = row['status']
+ end_at = row['end_at']
+ details += f' Status: {status}, End time: {end_at}.'
+ except Exception as exc: # pylint: disable=broad-except
+ details += f' Error fetching task details: {exc}'
+ return details
+
+
  # job_duration is the time a job actually runs (including the
  # setup duration) before last_recover, excluding the provision
  # and recovery time.

@@ -758,9 +779,12 @@ async def set_backoff_pending_async(job_id: int, task_id: int):
  count = result.rowcount
  await session.commit()
  if count != 1:
- (3 removed lines not shown in this view)
+ details = await _describe_task_transition_failure(
+ session, job_id, task_id)
+ message = ('Failed to set the task back to pending. '
+ f'({count} rows updated. {details})')
+ logger.error(message)
+ raise exceptions.ManagedJobStatusError(message)
  # Do not call callback_func here, as we don't use the callback for PENDING.


@@ -789,9 +813,12 @@ async def set_restarting_async(job_id: int, task_id: int, recovering: bool):
  await session.commit()
  logger.debug(f'back to {target_status}')
  if count != 1:
- (3 removed lines not shown in this view)
+ details = await _describe_task_transition_failure(
+ session, job_id, task_id)
+ message = (f'Failed to set the task back to {target_status}. '
+ f'({count} rows updated. {details})')
+ logger.error(message)
+ raise exceptions.ManagedJobStatusError(message)
  # Do not call callback_func here, as it should only be invoked for the
  # initial (pre-`set_backoff_pending`) transition to STARTING or RECOVERING.


@@ -1644,9 +1671,12 @@ async def set_starting_async(job_id: int, task_id: int, run_timestamp: str,
  count = result.rowcount
  await session.commit()
  if count != 1:
- (3 removed lines not shown in this view)
+ details = await _describe_task_transition_failure(
+ session, job_id, task_id)
+ message = ('Failed to set the task to starting. '
+ f'({count} rows updated. {details})')
+ logger.error(message)
+ raise exceptions.ManagedJobStatusError(message)
  await callback_func('SUBMITTED')
  await callback_func('STARTING')


@@ -1676,9 +1706,12 @@ async def set_started_async(job_id: int, task_id: int, start_time: float,
  count = result.rowcount
  await session.commit()
  if count != 1:
- (3 removed lines not shown in this view)
+ details = await _describe_task_transition_failure(
+ session, job_id, task_id)
+ message = (f'Failed to set the task to started. '
+ f'({count} rows updated. {details})')
+ logger.error(message)
+ raise exceptions.ManagedJobStatusError(message)
  await callback_func('STARTED')


@@ -1733,9 +1766,14 @@ async def set_recovering_async(job_id: int, task_id: int,
  count = result.rowcount
  await session.commit()
  if count != 1:
- (3 removed lines not shown in this view)
+ details = await _describe_task_transition_failure(
+ session, job_id, task_id)
+ message = ('Failed to set the task to recovering with '
+ 'force_transit_to_recovering='
+ f'{force_transit_to_recovering}. '
+ f'({count} rows updated. {details})')
+ logger.error(message)
+ raise exceptions.ManagedJobStatusError(message)
  await callback_func('RECOVERING')


@@ -1761,9 +1799,12 @@ async def set_recovered_async(job_id: int, task_id: int, recovered_time: float,
  count = result.rowcount
  await session.commit()
  if count != 1:
- (3 removed lines not shown in this view)
+ details = await _describe_task_transition_failure(
+ session, job_id, task_id)
+ message = (f'Failed to set the task to recovered. '
+ f'({count} rows updated. {details})')
+ logger.error(message)
+ raise exceptions.ManagedJobStatusError(message)
  logger.info('==== Recovered. ====')
  await callback_func('RECOVERED')


@@ -1788,9 +1829,12 @@ async def set_succeeded_async(job_id: int, task_id: int, end_time: float,
  count = result.rowcount
  await session.commit()
  if count != 1:
- (3 removed lines not shown in this view)
+ details = await _describe_task_transition_failure(
+ session, job_id, task_id)
+ message = (f'Failed to set the task to succeeded. '
+ f'({count} rows updated. {details})')
+ logger.error(message)
+ raise exceptions.ManagedJobStatusError(message)
  await callback_func('SUCCEEDED')
  logger.info('Job succeeded.')
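The state.py hunks add _describe_task_transition_failure and call it wherever a status-transition UPDATE touches anything other than exactly one row, logging the assembled message and raising exceptions.ManagedJobStatusError. A minimal sketch of that check-and-raise pattern around an async SQLAlchemy update, assuming a hypothetical table object and a plain RuntimeError in place of SkyPilot's exception class:

import sqlalchemy
from sqlalchemy.ext.asyncio import AsyncSession


async def set_example_status(session: AsyncSession,
                             spot_table: sqlalchemy.Table, job_id: int,
                             task_id: int) -> None:
    # Attempt the transition for exactly one (job, task) row.
    result = await session.execute(
        sqlalchemy.update(spot_table).where(
            sqlalchemy.and_(spot_table.c.spot_job_id == job_id,
                            spot_table.c.task_id == task_id)).values(
                                status='EXAMPLE'))
    count = result.rowcount
    await session.commit()
    if count != 1:
        # The real code builds `details` via _describe_task_transition_failure
        # and raises exceptions.ManagedJobStatusError(message).
        details = f'{count} rows updated for job {job_id}, task {task_id}.'
        raise RuntimeError(f'Failed to set the example status. ({details})')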