skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic.

Files changed (231)
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/state.py CHANGED
@@ -10,8 +10,7 @@ import sqlite3
 import threading
 import time
 import typing
-from typing import (Any, Awaitable, Callable, Dict, List, Optional, Set, Tuple,
-                    Union)
+from typing import Any, Awaitable, Callable, Dict, List, Optional, Tuple, Union
 import urllib.parse
 
 import colorama
@@ -94,6 +93,7 @@ spot_table = sqlalchemy.Table(
     sqlalchemy.Column('specs', sqlalchemy.Text),
     sqlalchemy.Column('local_log_file', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('metadata', sqlalchemy.Text, server_default='{}'),
+    sqlalchemy.Column('logs_cleaned_at', sqlalchemy.Float, server_default=None),
 )
 
 job_info_table = sqlalchemy.Table(
@@ -109,6 +109,8 @@ job_info_table = sqlalchemy.Table(
                       server_default=None),
     sqlalchemy.Column('dag_yaml_path', sqlalchemy.Text),
     sqlalchemy.Column('env_file_path', sqlalchemy.Text),
+    sqlalchemy.Column('dag_yaml_content', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('env_file_content', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('user_hash', sqlalchemy.Text),
     sqlalchemy.Column('workspace', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('priority',
@@ -118,6 +120,9 @@ job_info_table = sqlalchemy.Table(
     sqlalchemy.Column('original_user_yaml_path',
                       sqlalchemy.Text,
                       server_default=None),
+    sqlalchemy.Column('original_user_yaml_content',
+                      sqlalchemy.Text,
+                      server_default=None),
     sqlalchemy.Column('pool', sqlalchemy.Text, server_default=None),
     sqlalchemy.Column('current_cluster_name',
                       sqlalchemy.Text,
@@ -126,6 +131,9 @@ job_info_table = sqlalchemy.Table(
                       sqlalchemy.Integer,
                       server_default=None),
     sqlalchemy.Column('pool_hash', sqlalchemy.Text, server_default=None),
+    sqlalchemy.Column('controller_logs_cleaned_at',
+                      sqlalchemy.Float,
+                      server_default=None),
 )
 
 ha_recovery_script_table = sqlalchemy.Table(
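The five nullable columns introduced above (logs_cleaned_at, dag_yaml_content, env_file_content, original_user_yaml_content, controller_logs_cleaned_at) line up with the new spot_jobs migrations in the file list (004_job_file_contents.py and 005_logs_gc.py). As a rough illustration of what such an additive migration looks like with Alembic, assuming the tables are named 'spot' and 'job_info' as in the SQLAlchemy definitions; the shipped migration files may be organized differently:

    # Illustrative Alembic sketch only; not the contents of
    # sky/schemas/db/spot_jobs/005_logs_gc.py.
    import sqlalchemy as sa
    from alembic import op

    def upgrade():
        # Nullable columns can be added in place without rewriting existing rows.
        op.add_column('spot',
                      sa.Column('logs_cleaned_at', sa.Float(), nullable=True))
        op.add_column('job_info',
                      sa.Column('controller_logs_cleaned_at', sa.Float(),
                                nullable=True))

    def downgrade():
        op.drop_column('job_info', 'controller_logs_cleaned_at')
        op.drop_column('spot', 'logs_cleaned_at')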
@@ -280,6 +288,27 @@ def _init_db(func):
     return wrapper
 
 
+async def _describe_task_transition_failure(session: sql_async.AsyncSession,
+                                            job_id: int, task_id: int) -> str:
+    """Return a human-readable description when a task transition fails."""
+    details = 'Couldn\'t fetch the task details.'
+    try:
+        debug_result = await session.execute(
+            sqlalchemy.select(spot_table.c.status, spot_table.c.end_at).where(
+                sqlalchemy.and_(spot_table.c.spot_job_id == job_id,
+                                spot_table.c.task_id == task_id)))
+        rows = debug_result.mappings().all()
+        details = (f'{len(rows)} rows matched job {job_id} and task '
+                   f'{task_id}.')
+        for row in rows:
+            status = row['status']
+            end_at = row['end_at']
+            details += f' Status: {status}, End time: {end_at}.'
+    except Exception as exc:  # pylint: disable=broad-except
+        details += f' Error fetching task details: {exc}'
+    return details
+
+
 # job_duration is the time a job actually runs (including the
 # setup duration) before last_recover, excluding the provision
 # and recovery time.
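Each async status-transition setter further down now awaits this helper whenever its UPDATE matches an unexpected number of rows, so the raised error carries the conflicting row state instead of only a row count. Assuming a single matching row that has already ended, the message assembled by, for example, set_starting_async would look roughly like this (an illustrative composition of the f-strings shown below, not a captured log):

    details = ('1 rows matched job 7 and task 0. '
               'Status: CANCELLED, End time: 1730000000.0.')
    message = ('Failed to set the task to starting. '
               f'(0 rows updated. {details})')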
@@ -293,42 +322,50 @@ def _init_db(func):
 # column names in the DB and it corresponds to the combined view
 # by joining the spot and job_info tables.
 def _get_jobs_dict(r: 'row.RowMapping') -> Dict[str, Any]:
+    # WARNING: If you update these you may also need to update GetJobTable in
+    # the skylet ManagedJobsServiceImpl.
     return {
-        '_job_id': r['job_id'],  # from spot table
-        '_task_name': r['job_name'],  # deprecated, from spot table
-        'resources': r['resources'],
-        'submitted_at': r['submitted_at'],
-        'status': r['status'],
-        'run_timestamp': r['run_timestamp'],
-        'start_at': r['start_at'],
-        'end_at': r['end_at'],
-        'last_recovered_at': r['last_recovered_at'],
-        'recovery_count': r['recovery_count'],
-        'job_duration': r['job_duration'],
-        'failure_reason': r['failure_reason'],
-        'job_id': r[spot_table.c.spot_job_id],  # ambiguous, use table.column
-        'task_id': r['task_id'],
-        'task_name': r['task_name'],
-        'specs': r['specs'],
-        'local_log_file': r['local_log_file'],
-        'metadata': r['metadata'],
+        '_job_id': r.get('job_id'),  # from spot table
+        '_task_name': r.get('job_name'),  # deprecated, from spot table
+        'resources': r.get('resources'),
+        'submitted_at': r.get('submitted_at'),
+        'status': r.get('status'),
+        'run_timestamp': r.get('run_timestamp'),
+        'start_at': r.get('start_at'),
+        'end_at': r.get('end_at'),
+        'last_recovered_at': r.get('last_recovered_at'),
+        'recovery_count': r.get('recovery_count'),
+        'job_duration': r.get('job_duration'),
+        'failure_reason': r.get('failure_reason'),
+        'job_id': r.get(spot_table.c.spot_job_id
+                       ),  # ambiguous, use table.column
+        'task_id': r.get('task_id'),
+        'task_name': r.get('task_name'),
+        'specs': r.get('specs'),
+        'local_log_file': r.get('local_log_file'),
+        'metadata': r.get('metadata'),
         # columns from job_info table (some may be None for legacy jobs)
-        '_job_info_job_id': r[job_info_table.c.spot_job_id
-                             ],  # ambiguous, use table.column
-        'job_name': r['name'],  # from job_info table
-        'schedule_state': r['schedule_state'],
-        'controller_pid': r['controller_pid'],
-        'dag_yaml_path': r['dag_yaml_path'],
-        'env_file_path': r['env_file_path'],
-        'user_hash': r['user_hash'],
-        'workspace': r['workspace'],
-        'priority': r['priority'],
-        'entrypoint': r['entrypoint'],
-        'original_user_yaml_path': r['original_user_yaml_path'],
-        'pool': r['pool'],
-        'current_cluster_name': r['current_cluster_name'],
-        'job_id_on_pool_cluster': r['job_id_on_pool_cluster'],
-        'pool_hash': r['pool_hash'],
+        '_job_info_job_id': r.get(job_info_table.c.spot_job_id
+                                 ),  # ambiguous, use table.column
+        'job_name': r.get('name'),  # from job_info table
+        'schedule_state': r.get('schedule_state'),
+        'controller_pid': r.get('controller_pid'),
+        # the _path columns are for backwards compatibility, use the _content
+        # columns instead
+        'dag_yaml_path': r.get('dag_yaml_path'),
+        'env_file_path': r.get('env_file_path'),
+        'dag_yaml_content': r.get('dag_yaml_content'),
+        'env_file_content': r.get('env_file_content'),
+        'user_hash': r.get('user_hash'),
+        'workspace': r.get('workspace'),
+        'priority': r.get('priority'),
+        'entrypoint': r.get('entrypoint'),
+        'original_user_yaml_path': r.get('original_user_yaml_path'),
+        'original_user_yaml_content': r.get('original_user_yaml_content'),
+        'pool': r.get('pool'),
+        'current_cluster_name': r.get('current_cluster_name'),
+        'job_id_on_pool_cluster': r.get('job_id_on_pool_cluster'),
+        'pool_hash': r.get('pool_hash'),
     }
 
 
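The switch from r['col'] to r.get('col') matters because the filtered queries added later in this diff can trim the SELECT with with_only_columns(), so a RowMapping may not contain every key. A minimal illustration of the difference under SQLAlchemy 2.0 mapping semantics (not taken from the SkyPilot code):

    # RowMapping behaves like a read-only dict, so absent columns can be
    # absorbed with .get() instead of raising KeyError.
    row = session.execute(
        sqlalchemy.select(spot_table.c.status)).mappings().first()
    row['status']      # selected column -> its value
    row.get('end_at')  # column not selected -> None
    # row['end_at'] would raise KeyError because only `status` was selected.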
@@ -671,8 +708,8 @@ class ManagedJobScheduleState(enum.Enum):
 # === Status transition functions ===
 @_init_db
 def set_job_info_without_job_id(name: str, workspace: str, entrypoint: str,
-                                pool: Optional[str],
-                                pool_hash: Optional[str]) -> int:
+                                pool: Optional[str], pool_hash: Optional[str],
+                                user_hash: Optional[str]) -> int:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -691,6 +728,7 @@ def set_job_info_without_job_id(name: str, workspace: str, entrypoint: str,
             entrypoint=entrypoint,
             pool=pool,
             pool_hash=pool_hash,
+            user_hash=user_hash,
         )
 
         if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -758,9 +796,12 @@ async def set_backoff_pending_async(job_id: int, task_id: int):
         count = result.rowcount
         await session.commit()
         if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                'Failed to set the task back to pending. '
-                f'({count} rows updated)')
+            details = await _describe_task_transition_failure(
+                session, job_id, task_id)
+            message = ('Failed to set the task back to pending. '
+                       f'({count} rows updated. {details})')
+            logger.error(message)
+            raise exceptions.ManagedJobStatusError(message)
     # Do not call callback_func here, as we don't use the callback for PENDING.
 
 
@@ -789,9 +830,12 @@ async def set_restarting_async(job_id: int, task_id: int, recovering: bool):
         await session.commit()
         logger.debug(f'back to {target_status}')
         if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                f'Failed to set the task back to {target_status}. '
-                f'({count} rows updated)')
+            details = await _describe_task_transition_failure(
+                session, job_id, task_id)
+            message = (f'Failed to set the task back to {target_status}. '
+                       f'({count} rows updated. {details})')
+            logger.error(message)
+            raise exceptions.ManagedJobStatusError(message)
     # Do not call callback_func here, as it should only be invoked for the
     # initial (pre-`set_backoff_pending`) transition to STARTING or RECOVERING.
 
@@ -1048,7 +1092,8 @@ def _get_all_task_ids_statuses(
 
 @_init_db
 def get_all_task_ids_names_statuses_logs(
-        job_id: int) -> List[Tuple[int, str, ManagedJobStatus, str]]:
+    job_id: int
+) -> List[Tuple[int, str, ManagedJobStatus, str, Optional[float]]]:
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         id_names = session.execute(
@@ -1057,9 +1102,10 @@ def get_all_task_ids_names_statuses_logs(
                 spot_table.c.task_name,
                 spot_table.c.status,
                 spot_table.c.local_log_file,
+                spot_table.c.logs_cleaned_at,
             ).where(spot_table.c.spot_job_id == job_id).order_by(
                 spot_table.c.task_id.asc())).fetchall()
-        return [(row[0], row[1], ManagedJobStatus(row[2]), row[3])
+        return [(row[0], row[1], ManagedJobStatus(row[2]), row[3], row[4])
                 for row in id_names]
 
 
@@ -1124,8 +1170,8 @@ def get_failure_reason(job_id: int) -> Optional[str]:
 
 
 @_init_db
-def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
-    """Get managed jobs from the database."""
+def get_managed_job_tasks(job_id: int) -> List[Dict[str, Any]]:
+    """Get managed job tasks for a specific managed job id from the database."""
     assert _SQLALCHEMY_ENGINE is not None
 
     # Join spot and job_info tables to get the job name for each task.
@@ -1140,10 +1186,8 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
         spot_table.outerjoin(
             job_info_table,
             spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
-    if job_id is not None:
-        query = query.where(spot_table.c.spot_job_id == job_id)
-    query = query.order_by(spot_table.c.spot_job_id.desc(),
-                           spot_table.c.task_id.asc())
+    query = query.where(spot_table.c.spot_job_id == job_id)
+    query = query.order_by(spot_table.c.task_id.asc())
     rows = None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         rows = session.execute(query).fetchall()
@@ -1158,20 +1202,307 @@ def get_managed_jobs(job_id: Optional[int] = None) -> List[Dict[str, Any]]:
         job_dict['metadata'] = json.loads(job_dict['metadata'])
 
         # Add user YAML content for managed jobs.
-        yaml_path = job_dict.get('original_user_yaml_path')
-        if yaml_path:
-            try:
-                with open(yaml_path, 'r', encoding='utf-8') as f:
-                    job_dict['user_yaml'] = f.read()
-            except (FileNotFoundError, IOError, OSError):
-                job_dict['user_yaml'] = None
-        else:
-            job_dict['user_yaml'] = None
+        job_dict['user_yaml'] = job_dict.get('original_user_yaml_content')
+        if job_dict['user_yaml'] is None:
+            # Backwards compatibility - try to read from file path
+            yaml_path = job_dict.get('original_user_yaml_path')
+            if yaml_path:
+                try:
+                    with open(yaml_path, 'r', encoding='utf-8') as f:
+                        job_dict['user_yaml'] = f.read()
+                except (FileNotFoundError, IOError, OSError) as e:
+                    logger.debug('Failed to read original user YAML for job '
+                                 f'{job_id} from {yaml_path}: {e}')
 
         jobs.append(job_dict)
     return jobs
 
 
+def _map_response_field_to_db_column(field: str):
+    """Map the response field name to an actual SQLAlchemy ColumnElement.
+
+    This ensures we never pass plain strings to SQLAlchemy 2.0 APIs like
+    Select.with_only_columns().
+    """
+    # Explicit aliases differing from actual DB column names
+    alias_mapping = {
+        '_job_id': spot_table.c.job_id,  # spot.job_id
+        '_task_name': spot_table.c.job_name,  # deprecated, from spot table
+        'job_id': spot_table.c.spot_job_id,  # public job id -> spot.spot_job_id
+        '_job_info_job_id': job_info_table.c.spot_job_id,
+        'job_name': job_info_table.c.name,  # public job name -> job_info.name
+    }
+    if field in alias_mapping:
+        return alias_mapping[field]
+
+    # Try direct match on the `spot` table columns
+    if field in spot_table.c:
+        return spot_table.c[field]
+
+    # Try direct match on the `job_info` table columns
+    if field in job_info_table.c:
+        return job_info_table.c[field]
+
+    raise ValueError(f'Unknown field: {field}')
+
+
+@_init_db
+def get_managed_jobs_total() -> int:
+    """Get the total number of managed jobs."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        result = session.execute(
+            sqlalchemy.select(sqlalchemy.func.count()  # pylint: disable=not-callable
+                             ).select_from(spot_table)).fetchone()
+        return result[0] if result else 0
+
+
+@_init_db
+def get_managed_jobs_highest_priority() -> int:
+    """Get the highest priority of the managed jobs."""
+    assert _SQLALCHEMY_ENGINE is not None
+    query = sqlalchemy.select(sqlalchemy.func.max(
+        job_info_table.c.priority)).where(
+            sqlalchemy.and_(
+                job_info_table.c.schedule_state.in_([
+                    ManagedJobScheduleState.LAUNCHING.value,
+                    ManagedJobScheduleState.ALIVE_BACKOFF.value,
+                    ManagedJobScheduleState.WAITING.value,
+                    ManagedJobScheduleState.ALIVE_WAITING.value,
+                ]),
+                job_info_table.c.priority.is_not(None),
+            ))
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        priority = session.execute(query).fetchone()
+        return priority[0] if priority and priority[
+            0] is not None else constants.MIN_PRIORITY
+
+
+def build_managed_jobs_with_filters_no_status_query(
+    fields: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    accessible_workspaces: Optional[List[str]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    skip_finished: bool = False,
+    count_only: bool = False,
+    status_count: bool = False,
+) -> sqlalchemy.Select:
+    """Build a query to get managed jobs from the database with filters."""
+    # Join spot and job_info tables to get the job name for each task.
+    # We use LEFT OUTER JOIN mainly for backward compatibility, as for an
+    # existing controller before #1982, the job_info table may not exist,
+    # and all the managed jobs created before will not present in the
+    # job_info.
+    # Note: we will get the user_hash here, but don't try to call
+    # global_user_state.get_user() on it. This runs on the controller, which may
+    # not have the user info. Prefer to do it on the API server side.
+    if count_only:
+        query = sqlalchemy.select(sqlalchemy.func.count().label('count'))  # pylint: disable=not-callable
+    elif status_count:
+        query = sqlalchemy.select(spot_table.c.status,
+                                  sqlalchemy.func.count().label('count'))  # pylint: disable=not-callable
+    else:
+        query = sqlalchemy.select(spot_table, job_info_table)
+    query = query.select_from(
+        spot_table.outerjoin(
+            job_info_table,
+            spot_table.c.spot_job_id == job_info_table.c.spot_job_id))
+    if skip_finished:
+        # Filter out finished jobs at the DB level. If a multi-task job is
+        # partially finished, include all its tasks. We do this by first
+        # selecting job_ids that have at least one non-terminal task, then
+        # restricting the main query to those job_ids.
+        terminal_status_values = [
+            s.value for s in ManagedJobStatus.terminal_statuses()
+        ]
+        non_terminal_job_ids_subquery = (sqlalchemy.select(
+            spot_table.c.spot_job_id).where(
+                sqlalchemy.or_(
+                    spot_table.c.status.is_(None),
+                    sqlalchemy.not_(
+                        spot_table.c.status.in_(terminal_status_values)),
+                )).distinct())
+        query = query.where(
+            spot_table.c.spot_job_id.in_(non_terminal_job_ids_subquery))
+    if not count_only and not status_count and fields:
+        # Resolve requested field names to explicit ColumnElements from
+        # the joined tables.
+        selected_columns = [_map_response_field_to_db_column(f) for f in fields]
+        query = query.with_only_columns(*selected_columns)
+    if job_ids is not None:
+        query = query.where(spot_table.c.spot_job_id.in_(job_ids))
+    if accessible_workspaces is not None:
+        query = query.where(
+            job_info_table.c.workspace.in_(accessible_workspaces))
+    if workspace_match is not None:
+        query = query.where(
+            job_info_table.c.workspace.like(f'%{workspace_match}%'))
+    if name_match is not None:
+        query = query.where(job_info_table.c.name.like(f'%{name_match}%'))
+    if pool_match is not None:
+        query = query.where(job_info_table.c.pool.like(f'%{pool_match}%'))
+    if user_hashes is not None:
+        query = query.where(job_info_table.c.user_hash.in_(user_hashes))
+    return query
+
+
+def build_managed_jobs_with_filters_query(
+    fields: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    accessible_workspaces: Optional[List[str]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    statuses: Optional[List[str]] = None,
+    skip_finished: bool = False,
+    count_only: bool = False,
+) -> sqlalchemy.Select:
+    """Build a query to get managed jobs from the database with filters."""
+    query = build_managed_jobs_with_filters_no_status_query(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        skip_finished=skip_finished,
+        count_only=count_only,
+    )
+    if statuses is not None:
+        query = query.where(spot_table.c.status.in_(statuses))
+    return query
+
+
+@_init_db
+def get_status_count_with_filters(
+    fields: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    accessible_workspaces: Optional[List[str]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    skip_finished: bool = False,
+) -> Dict[str, int]:
+    """Get the status count of the managed jobs with filters."""
+    query = build_managed_jobs_with_filters_no_status_query(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        skip_finished=skip_finished,
+        status_count=True,
+    )
+    query = query.group_by(spot_table.c.status)
+    results: Dict[str, int] = {}
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(query).fetchall()
+        for status_value, count in rows:
+            # status_value is already a string (enum value)
+            results[str(status_value)] = int(count)
+    return results
+
+
+@_init_db
+def get_managed_jobs_with_filters(
+    fields: Optional[List[str]] = None,
+    job_ids: Optional[List[int]] = None,
+    accessible_workspaces: Optional[List[str]] = None,
+    workspace_match: Optional[str] = None,
+    name_match: Optional[str] = None,
+    pool_match: Optional[str] = None,
+    user_hashes: Optional[List[Optional[str]]] = None,
+    statuses: Optional[List[str]] = None,
+    skip_finished: bool = False,
+    page: Optional[int] = None,
+    limit: Optional[int] = None,
+) -> Tuple[List[Dict[str, Any]], int]:
+    """Get managed jobs from the database with filters.
+
+    Returns:
+        A tuple containing
+        - the list of managed jobs
+        - the total number of managed jobs
+    """
+    assert _SQLALCHEMY_ENGINE is not None
+
+    count_query = build_managed_jobs_with_filters_query(
+        fields=None,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        statuses=statuses,
+        skip_finished=skip_finished,
+        count_only=True,
+    )
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        total = session.execute(count_query).fetchone()[0]
+
+    query = build_managed_jobs_with_filters_query(
+        fields=fields,
+        job_ids=job_ids,
+        accessible_workspaces=accessible_workspaces,
+        workspace_match=workspace_match,
+        name_match=name_match,
+        pool_match=pool_match,
+        user_hashes=user_hashes,
+        statuses=statuses,
+        skip_finished=skip_finished,
+    )
+    query = query.order_by(spot_table.c.spot_job_id.desc(),
+                           spot_table.c.task_id.asc())
+    if page is not None and limit is not None:
+        query = query.offset((page - 1) * limit).limit(limit)
+    rows = None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        rows = session.execute(query).fetchall()
+    jobs = []
+    for row in rows:
+        job_dict = _get_jobs_dict(row._mapping)  # pylint: disable=protected-access
+        if job_dict.get('status') is not None:
+            job_dict['status'] = ManagedJobStatus(job_dict['status'])
+        if job_dict.get('schedule_state') is not None:
+            job_dict['schedule_state'] = ManagedJobScheduleState(
+                job_dict['schedule_state'])
+        if job_dict.get('job_name') is None:
+            job_dict['job_name'] = job_dict.get('task_name')
+        if job_dict.get('metadata') is not None:
+            job_dict['metadata'] = json.loads(job_dict['metadata'])
+
+        # Add user YAML content for managed jobs.
+        job_dict['user_yaml'] = job_dict.get('original_user_yaml_content')
+        if job_dict['user_yaml'] is None:
+            # Backwards compatibility - try to read from file path
+            yaml_path = job_dict.get('original_user_yaml_path')
+            if yaml_path:
+                try:
+                    with open(yaml_path, 'r', encoding='utf-8') as f:
+                        job_dict['user_yaml'] = f.read()
+                except (FileNotFoundError, IOError, OSError) as e:
+                    job_id = job_dict.get('job_id')
+                    if job_id is not None:
+                        logger.debug('Failed to read original user YAML for '
+                                     f'job {job_id} from {yaml_path}: {e}')
+                    else:
+                        logger.debug('Failed to read original user YAML from '
+                                     f'{yaml_path}: {e}')
+
+        jobs.append(job_dict)
+    return jobs, total
+
+
 @_init_db
 def get_task_name(job_id: int, task_id: int) -> str:
     """Get the task name of a job."""
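The query builders and get_managed_jobs_with_filters above back the paginated jobs listing on the API server side (see sky/jobs/server/core.py and sky/server/requests/ in the file list). A hypothetical call site, with filter values invented purely for illustration:

    # Illustrative usage only; argument values are made up.
    jobs, total = get_managed_jobs_with_filters(
        accessible_workspaces=['default'],
        statuses=[ManagedJobStatus.RUNNING.value],
        skip_finished=True,
        page=1,
        limit=50,
    )
    counts = get_status_count_with_filters(accessible_workspaces=['default'])
    print(f'{len(jobs)} of {total} jobs on page 1; per-status counts: {counts}')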
@@ -1212,9 +1543,9 @@ def get_task_specs(job_id: int, task_id: int) -> Dict[str, Any]:
 
 
 @_init_db
-def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
-                          original_user_yaml_path: str, env_file_path: str,
-                          user_hash: str, priority: int):
+def scheduler_set_waiting(job_id: int, dag_yaml_content: str,
+                          original_user_yaml_content: str,
+                          env_file_content: str, priority: int):
     """Do not call without holding the scheduler lock.
 
     Returns: Whether this is a recovery run or not.
@@ -1226,20 +1557,48 @@ def scheduler_set_waiting(job_id: int, dag_yaml_path: str,
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         updated_count = session.query(job_info_table).filter(
-            sqlalchemy.and_(job_info_table.c.spot_job_id == job_id,)
-        ).update({
-            job_info_table.c.schedule_state:
-                ManagedJobScheduleState.WAITING.value,
-            job_info_table.c.dag_yaml_path: dag_yaml_path,
-            job_info_table.c.original_user_yaml_path: original_user_yaml_path,
-            job_info_table.c.env_file_path: env_file_path,
-            job_info_table.c.user_hash: user_hash,
-            job_info_table.c.priority: priority,
-        })
+            sqlalchemy.and_(job_info_table.c.spot_job_id == job_id,)).update({
+                job_info_table.c.schedule_state:
+                    ManagedJobScheduleState.WAITING.value,
+                job_info_table.c.dag_yaml_content: dag_yaml_content,
+                job_info_table.c.original_user_yaml_content:
+                    (original_user_yaml_content),
+                job_info_table.c.env_file_content: env_file_content,
+                job_info_table.c.priority: priority,
+            })
         session.commit()
         assert updated_count <= 1, (job_id, updated_count)
 
 
+@_init_db
+def get_job_file_contents(job_id: int) -> Dict[str, Optional[str]]:
+    """Return file information and stored contents for a managed job."""
+    assert _SQLALCHEMY_ENGINE is not None
+    with orm.Session(_SQLALCHEMY_ENGINE) as session:
+        row = session.execute(
+            sqlalchemy.select(
+                job_info_table.c.dag_yaml_path,
+                job_info_table.c.env_file_path,
+                job_info_table.c.dag_yaml_content,
+                job_info_table.c.env_file_content,
+            ).where(job_info_table.c.spot_job_id == job_id)).fetchone()
+
+        if row is None:
+            return {
+                'dag_yaml_path': None,
+                'env_file_path': None,
+                'dag_yaml_content': None,
+                'env_file_content': None,
+            }
+
+        return {
+            'dag_yaml_path': row[0],
+            'env_file_path': row[1],
+            'dag_yaml_content': row[2],
+            'env_file_content': row[3],
+        }
+
+
 @_init_db
 def get_pool_from_job_id(job_id: int) -> Optional[str]:
     """Get the pool from the job id."""
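A sketch of how a caller might consume get_job_file_contents, preferring the DB-stored content and falling back to the legacy path columns for older jobs; the real consumer appears to be the new sky/jobs/file_content_utils.py from the file list, so this pattern is only assumed:

    # Assumed consumer pattern; not copied from sky/jobs/file_content_utils.py.
    info = get_job_file_contents(job_id)
    dag_yaml = info['dag_yaml_content']
    if dag_yaml is None and info['dag_yaml_path']:
        # Backwards compatibility for jobs submitted before the _content columns.
        with open(info['dag_yaml_path'], 'r', encoding='utf-8') as f:
            dag_yaml = f.read()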
@@ -1251,25 +1610,6 @@ def get_pool_from_job_id(job_id: int) -> Optional[str]:
         return pool[0] if pool else None
 
 
-@_init_db
-def get_pool_and_submit_info_from_job_ids(
-    job_ids: Set[int]
-) -> Dict[int, Tuple[Optional[str], Optional[str], Optional[int]]]:
-    """Get the pool, cluster name, and job id on pool from job id"""
-    assert _SQLALCHEMY_ENGINE is not None
-    with orm.Session(_SQLALCHEMY_ENGINE) as session:
-        rows = session.execute(
-            sqlalchemy.select(
-                job_info_table.c.spot_job_id, job_info_table.c.pool,
-                job_info_table.c.current_cluster_name,
-                job_info_table.c.job_id_on_pool_cluster).where(
-                    job_info_table.c.spot_job_id.in_(job_ids))).fetchall()
-        return {
-            job_id: (pool, cluster_name, job_id_on_pool_cluster)
-            for job_id, pool, cluster_name, job_id_on_pool_cluster in rows
-        }
-
-
 @_init_db
 def set_current_cluster_name(job_id: int, current_cluster_name: str) -> None:
     """Set the current cluster name for a job."""
@@ -1644,9 +1984,12 @@ async def set_starting_async(job_id: int, task_id: int, run_timestamp: str,
         count = result.rowcount
         await session.commit()
         if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                'Failed to set the task to starting. '
-                f'({count} rows updated)')
+            details = await _describe_task_transition_failure(
+                session, job_id, task_id)
+            message = ('Failed to set the task to starting. '
+                       f'({count} rows updated. {details})')
+            logger.error(message)
+            raise exceptions.ManagedJobStatusError(message)
     await callback_func('SUBMITTED')
     await callback_func('STARTING')
 
@@ -1676,9 +2019,12 @@ async def set_started_async(job_id: int, task_id: int, start_time: float,
         count = result.rowcount
         await session.commit()
         if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                f'Failed to set the task to started. '
-                f'({count} rows updated)')
+            details = await _describe_task_transition_failure(
+                session, job_id, task_id)
+            message = (f'Failed to set the task to started. '
+                       f'({count} rows updated. {details})')
+            logger.error(message)
+            raise exceptions.ManagedJobStatusError(message)
     await callback_func('STARTED')
 
 
@@ -1733,9 +2079,14 @@ async def set_recovering_async(job_id: int, task_id: int,
         count = result.rowcount
         await session.commit()
         if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                f'Failed to set the task to recovering. '
-                f'({count} rows updated)')
+            details = await _describe_task_transition_failure(
+                session, job_id, task_id)
+            message = ('Failed to set the task to recovering with '
+                       'force_transit_to_recovering='
+                       f'{force_transit_to_recovering}. '
+                       f'({count} rows updated. {details})')
+            logger.error(message)
+            raise exceptions.ManagedJobStatusError(message)
     await callback_func('RECOVERING')
 
 
@@ -1761,9 +2112,12 @@ async def set_recovered_async(job_id: int, task_id: int, recovered_time: float,
         count = result.rowcount
         await session.commit()
         if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                f'Failed to set the task to recovered. '
-                f'({count} rows updated)')
+            details = await _describe_task_transition_failure(
+                session, job_id, task_id)
+            message = (f'Failed to set the task to recovered. '
+                       f'({count} rows updated. {details})')
+            logger.error(message)
+            raise exceptions.ManagedJobStatusError(message)
     logger.info('==== Recovered. ====')
     await callback_func('RECOVERED')
 
@@ -1788,9 +2142,12 @@ async def set_succeeded_async(job_id: int, task_id: int, end_time: float,
         count = result.rowcount
         await session.commit()
        if count != 1:
-            raise exceptions.ManagedJobStatusError(
-                f'Failed to set the task to succeeded. '
-                f'({count} rows updated)')
+            details = await _describe_task_transition_failure(
+                session, job_id, task_id)
+            message = (f'Failed to set the task to succeeded. '
+                       f'({count} rows updated. {details})')
+            logger.error(message)
+            raise exceptions.ManagedJobStatusError(message)
     await callback_func('SUCCEEDED')
     logger.info('Job succeeded.')
 
@@ -1956,8 +2313,13 @@ async def scheduler_set_done_async(job_id: int,
 
 
 @_init_db
-def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
-                 pool: Optional[str], pool_hash: Optional[str]):
+def set_job_info(job_id: int,
+                 name: str,
+                 workspace: str,
+                 entrypoint: str,
+                 pool: Optional[str],
+                 pool_hash: Optional[str],
+                 user_hash: Optional[str] = None):
     assert _SQLALCHEMY_ENGINE is not None
     with orm.Session(_SQLALCHEMY_ENGINE) as session:
         if (_SQLALCHEMY_ENGINE.dialect.name ==
@@ -1976,6 +2338,7 @@ def set_job_info(job_id: int, name: str, workspace: str, entrypoint: str,
             entrypoint=entrypoint,
             pool=pool,
             pool_hash=pool_hash,
+            user_hash=user_hash,
         )
         session.execute(insert_stmt)
         session.commit()
@@ -2029,3 +2392,118 @@ def get_all_job_ids_by_name(name: Optional[str]) -> List[int]:
         rows = session.execute(query).fetchall()
         job_ids = [row[0] for row in rows if row[0] is not None]
     return job_ids
+
+
+@_init_db_async
+async def get_task_logs_to_clean_async(retention_seconds: int,
+                                       batch_size) -> List[Dict[str, Any]]:
+    """Get the logs of job tasks to clean.
+
+    The logs of a task will only cleaned when:
+    - the job schedule state is DONE
+    - AND the end time of the task is older than the retention period
+    """
+
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        now = time.time()
+        result = await session.execute(
+            sqlalchemy.select(
+                spot_table.c.spot_job_id,
+                spot_table.c.task_id,
+                spot_table.c.local_log_file,
+            ).select_from(
+                spot_table.join(
+                    job_info_table,
+                    spot_table.c.spot_job_id == job_info_table.c.spot_job_id,
+                )).
+            where(
+                sqlalchemy.and_(
+                    job_info_table.c.schedule_state.is_(
+                        ManagedJobScheduleState.DONE.value),
+                    spot_table.c.end_at.isnot(None),
+                    spot_table.c.end_at < (now - retention_seconds),
+                    spot_table.c.logs_cleaned_at.is_(None),
+                    # The local log file is set AFTER the task is finished,
+                    # add this condition to ensure the entire log file has
+                    # been written.
+                    spot_table.c.local_log_file.isnot(None),
+                )).limit(batch_size))
+        rows = result.fetchall()
+        return [{
+            'job_id': row[0],
+            'task_id': row[1],
+            'local_log_file': row[2]
+        } for row in rows]
+
+
+@_init_db_async
+async def get_controller_logs_to_clean_async(
+        retention_seconds: int, batch_size: int) -> List[Dict[str, Any]]:
+    """Get the controller logs to clean.
+
+    The controller logs will only cleaned when:
+    - the job schedule state is DONE
+    - AND the end time of the latest task is older than the retention period
+    """
+
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        now = time.time()
+
+        result = await session.execute(
+            sqlalchemy.select(job_info_table.c.spot_job_id,).select_from(
+                job_info_table.join(
+                    spot_table,
+                    job_info_table.c.spot_job_id == spot_table.c.spot_job_id,
+                )).where(
+                    sqlalchemy.and_(
+                        job_info_table.c.schedule_state.is_(
+                            ManagedJobScheduleState.DONE.value),
+                        spot_table.c.local_log_file.isnot(None),
+                        job_info_table.c.controller_logs_cleaned_at.is_(None),
+                    )).group_by(
+                        job_info_table.c.spot_job_id,
+                        job_info_table.c.current_cluster_name,
+                    ).having(
+                        sqlalchemy.func.max(
+                            spot_table.c.end_at).isnot(None),).having(
+                                sqlalchemy.func.max(spot_table.c.end_at) < (
+                                    now - retention_seconds)).limit(batch_size))
+        rows = result.fetchall()
+        return [{'job_id': row[0]} for row in rows]
+
+
+@_init_db_async
+async def set_task_logs_cleaned_async(tasks: List[Tuple[int, int]],
+                                      logs_cleaned_at: float):
+    """Set the task logs cleaned at."""
+    if not tasks:
+        return
+    # Deduplicate
+    task_keys = list(dict.fromkeys(tasks))
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        await session.execute(
+            sqlalchemy.update(spot_table).where(
+                sqlalchemy.tuple_(spot_table.c.spot_job_id,
+                                  spot_table.c.task_id).in_(task_keys)).values(
+                    logs_cleaned_at=logs_cleaned_at))
+        await session.commit()
+
+
+@_init_db_async
+async def set_controller_logs_cleaned_async(job_ids: List[int],
+                                            logs_cleaned_at: float):
+    """Set the controller logs cleaned at."""
+    if not job_ids:
+        return
+    # Deduplicate
+    job_ids = list(dict.fromkeys(job_ids))
+    assert _SQLALCHEMY_ENGINE_ASYNC is not None
+    async with sql_async.AsyncSession(_SQLALCHEMY_ENGINE_ASYNC) as session:
+        await session.execute(
+            sqlalchemy.update(job_info_table).where(
+                job_info_table.c.spot_job_id.in_(job_ids)).values(
+                    controller_logs_cleaned_at=logs_cleaned_at))
+        await session.commit()
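These four helpers form the storage layer for the new log garbage collection (sky/jobs/log_gc.py and the 005_logs_gc.py migration in the file list). A minimal sketch of a GC pass built on them, with the retention window, batch size, and file-removal step invented for illustration; the shipped implementation may differ:

    import os
    import time

    # Hypothetical GC pass; the real loop lives in sky/jobs/log_gc.py.
    async def gc_task_logs_once(retention_seconds: int = 7 * 24 * 3600,
                                batch_size: int = 100) -> None:
        tasks = await get_task_logs_to_clean_async(retention_seconds, batch_size)
        cleaned = []
        for task in tasks:
            log_file = task['local_log_file']
            if log_file and os.path.exists(log_file):
                os.remove(log_file)
            cleaned.append((task['job_id'], task['task_id']))
        if cleaned:
            # Mark the rows so the next batch does not pick them up again.
            await set_task_logs_cleaned_async(cleaned, time.time())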