skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (231) hide show
  1. sky/__init__.py +6 -2
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/coreweave.py +278 -0
  4. sky/adaptors/kubernetes.py +64 -0
  5. sky/adaptors/shadeform.py +89 -0
  6. sky/admin_policy.py +20 -0
  7. sky/authentication.py +59 -149
  8. sky/backends/backend_utils.py +104 -63
  9. sky/backends/cloud_vm_ray_backend.py +84 -39
  10. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  11. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  12. sky/catalog/kubernetes_catalog.py +24 -28
  13. sky/catalog/runpod_catalog.py +5 -1
  14. sky/catalog/shadeform_catalog.py +165 -0
  15. sky/check.py +25 -13
  16. sky/client/cli/command.py +335 -86
  17. sky/client/cli/flags.py +4 -2
  18. sky/client/cli/table_utils.py +17 -9
  19. sky/client/sdk.py +59 -12
  20. sky/cloud_stores.py +73 -0
  21. sky/clouds/__init__.py +2 -0
  22. sky/clouds/aws.py +71 -16
  23. sky/clouds/azure.py +12 -5
  24. sky/clouds/cloud.py +19 -9
  25. sky/clouds/cudo.py +12 -5
  26. sky/clouds/do.py +4 -1
  27. sky/clouds/fluidstack.py +12 -5
  28. sky/clouds/gcp.py +12 -5
  29. sky/clouds/hyperbolic.py +12 -5
  30. sky/clouds/ibm.py +12 -5
  31. sky/clouds/kubernetes.py +62 -25
  32. sky/clouds/lambda_cloud.py +12 -5
  33. sky/clouds/nebius.py +12 -5
  34. sky/clouds/oci.py +12 -5
  35. sky/clouds/paperspace.py +4 -1
  36. sky/clouds/primeintellect.py +4 -1
  37. sky/clouds/runpod.py +12 -5
  38. sky/clouds/scp.py +12 -5
  39. sky/clouds/seeweb.py +4 -1
  40. sky/clouds/shadeform.py +400 -0
  41. sky/clouds/ssh.py +4 -2
  42. sky/clouds/vast.py +12 -5
  43. sky/clouds/vsphere.py +4 -1
  44. sky/core.py +12 -11
  45. sky/dashboard/out/404.html +1 -1
  46. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  47. sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
  48. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  49. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  50. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  51. sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
  52. sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
  53. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  54. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  55. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  56. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  57. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  58. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  59. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  60. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  61. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  62. sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
  63. sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
  64. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
  65. sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
  66. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  67. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
  68. sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
  69. sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
  70. sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
  71. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
  72. sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
  73. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  74. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  75. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  76. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  77. sky/dashboard/out/clusters/[cluster].html +1 -1
  78. sky/dashboard/out/clusters.html +1 -1
  79. sky/dashboard/out/config.html +1 -1
  80. sky/dashboard/out/index.html +1 -1
  81. sky/dashboard/out/infra/[context].html +1 -1
  82. sky/dashboard/out/infra.html +1 -1
  83. sky/dashboard/out/jobs/[job].html +1 -1
  84. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  85. sky/dashboard/out/jobs.html +1 -1
  86. sky/dashboard/out/users.html +1 -1
  87. sky/dashboard/out/volumes.html +1 -1
  88. sky/dashboard/out/workspace/new.html +1 -1
  89. sky/dashboard/out/workspaces/[name].html +1 -1
  90. sky/dashboard/out/workspaces.html +1 -1
  91. sky/data/data_utils.py +92 -1
  92. sky/data/mounting_utils.py +143 -19
  93. sky/data/storage.py +168 -11
  94. sky/exceptions.py +13 -1
  95. sky/execution.py +13 -0
  96. sky/global_user_state.py +189 -113
  97. sky/jobs/client/sdk.py +32 -10
  98. sky/jobs/client/sdk_async.py +9 -3
  99. sky/jobs/constants.py +3 -1
  100. sky/jobs/controller.py +164 -192
  101. sky/jobs/file_content_utils.py +80 -0
  102. sky/jobs/log_gc.py +201 -0
  103. sky/jobs/recovery_strategy.py +59 -82
  104. sky/jobs/scheduler.py +20 -9
  105. sky/jobs/server/core.py +105 -23
  106. sky/jobs/server/server.py +40 -28
  107. sky/jobs/server/utils.py +32 -11
  108. sky/jobs/state.py +588 -110
  109. sky/jobs/utils.py +442 -209
  110. sky/logs/agent.py +1 -1
  111. sky/metrics/utils.py +45 -6
  112. sky/optimizer.py +1 -1
  113. sky/provision/__init__.py +7 -0
  114. sky/provision/aws/instance.py +2 -1
  115. sky/provision/azure/instance.py +2 -1
  116. sky/provision/common.py +2 -0
  117. sky/provision/cudo/instance.py +2 -1
  118. sky/provision/do/instance.py +2 -1
  119. sky/provision/fluidstack/instance.py +4 -3
  120. sky/provision/gcp/instance.py +2 -1
  121. sky/provision/hyperbolic/instance.py +2 -1
  122. sky/provision/instance_setup.py +10 -2
  123. sky/provision/kubernetes/constants.py +0 -1
  124. sky/provision/kubernetes/instance.py +222 -89
  125. sky/provision/kubernetes/network.py +12 -8
  126. sky/provision/kubernetes/utils.py +114 -53
  127. sky/provision/kubernetes/volume.py +5 -4
  128. sky/provision/lambda_cloud/instance.py +2 -1
  129. sky/provision/nebius/instance.py +2 -1
  130. sky/provision/oci/instance.py +2 -1
  131. sky/provision/paperspace/instance.py +2 -1
  132. sky/provision/provisioner.py +11 -2
  133. sky/provision/runpod/instance.py +2 -1
  134. sky/provision/scp/instance.py +2 -1
  135. sky/provision/seeweb/instance.py +3 -3
  136. sky/provision/shadeform/__init__.py +11 -0
  137. sky/provision/shadeform/config.py +12 -0
  138. sky/provision/shadeform/instance.py +351 -0
  139. sky/provision/shadeform/shadeform_utils.py +83 -0
  140. sky/provision/vast/instance.py +2 -1
  141. sky/provision/vsphere/instance.py +2 -1
  142. sky/resources.py +1 -1
  143. sky/schemas/api/responses.py +9 -5
  144. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  145. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  146. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  147. sky/schemas/generated/jobsv1_pb2.py +52 -52
  148. sky/schemas/generated/jobsv1_pb2.pyi +4 -2
  149. sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
  150. sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
  151. sky/serve/client/impl.py +11 -3
  152. sky/serve/replica_managers.py +5 -2
  153. sky/serve/serve_utils.py +9 -2
  154. sky/serve/server/impl.py +7 -2
  155. sky/serve/server/server.py +18 -15
  156. sky/serve/service.py +2 -2
  157. sky/server/auth/oauth2_proxy.py +2 -5
  158. sky/server/common.py +31 -28
  159. sky/server/constants.py +5 -1
  160. sky/server/daemons.py +27 -19
  161. sky/server/requests/executor.py +138 -74
  162. sky/server/requests/payloads.py +9 -1
  163. sky/server/requests/preconditions.py +13 -10
  164. sky/server/requests/request_names.py +120 -0
  165. sky/server/requests/requests.py +485 -153
  166. sky/server/requests/serializers/decoders.py +26 -13
  167. sky/server/requests/serializers/encoders.py +56 -11
  168. sky/server/requests/threads.py +106 -0
  169. sky/server/rest.py +70 -18
  170. sky/server/server.py +283 -104
  171. sky/server/stream_utils.py +233 -59
  172. sky/server/uvicorn.py +18 -17
  173. sky/setup_files/alembic.ini +4 -0
  174. sky/setup_files/dependencies.py +32 -13
  175. sky/sky_logging.py +0 -2
  176. sky/skylet/constants.py +30 -7
  177. sky/skylet/events.py +7 -0
  178. sky/skylet/log_lib.py +8 -2
  179. sky/skylet/log_lib.pyi +1 -1
  180. sky/skylet/services.py +26 -13
  181. sky/skylet/subprocess_daemon.py +103 -29
  182. sky/skypilot_config.py +87 -75
  183. sky/ssh_node_pools/server.py +9 -8
  184. sky/task.py +67 -54
  185. sky/templates/kubernetes-ray.yml.j2 +8 -1
  186. sky/templates/nebius-ray.yml.j2 +1 -0
  187. sky/templates/shadeform-ray.yml.j2 +72 -0
  188. sky/templates/websocket_proxy.py +142 -12
  189. sky/users/permission.py +8 -1
  190. sky/utils/admin_policy_utils.py +16 -3
  191. sky/utils/asyncio_utils.py +78 -0
  192. sky/utils/auth_utils.py +153 -0
  193. sky/utils/cli_utils/status_utils.py +8 -2
  194. sky/utils/command_runner.py +11 -0
  195. sky/utils/common.py +3 -1
  196. sky/utils/common_utils.py +7 -4
  197. sky/utils/context.py +57 -51
  198. sky/utils/context_utils.py +30 -12
  199. sky/utils/controller_utils.py +35 -8
  200. sky/utils/db/db_utils.py +37 -10
  201. sky/utils/db/migration_utils.py +8 -4
  202. sky/utils/locks.py +24 -6
  203. sky/utils/resource_checker.py +4 -1
  204. sky/utils/resources_utils.py +53 -29
  205. sky/utils/schemas.py +23 -4
  206. sky/utils/subprocess_utils.py +17 -4
  207. sky/volumes/server/server.py +7 -6
  208. sky/workspaces/server.py +13 -12
  209. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
  210. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
  211. sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
  212. sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
  213. sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
  214. sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
  215. sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
  216. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  217. sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
  218. sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
  219. sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
  220. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  221. sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
  222. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
  223. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
  224. sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
  225. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  226. sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
  227. /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  228. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
  229. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  230. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  231. {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/jobs/utils.py CHANGED
@@ -6,9 +6,8 @@ ManagedJobCodeGen.
6
6
  """
7
7
  import asyncio
8
8
  import collections
9
- import datetime
9
+ from datetime import datetime
10
10
  import enum
11
- import logging
12
11
  import os
13
12
  import pathlib
14
13
  import re
@@ -84,6 +83,7 @@ _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
84
83
 
85
84
  _JOB_STATUS_FETCH_MAX_RETRIES = 3
86
85
  _JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
86
+ _JOB_STATUS_FETCH_TIMEOUT_SECONDS = 30
87
87
 
88
88
  _JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
89
89
  'Waiting for task to start[/]'
@@ -101,6 +101,28 @@ _JOB_CANCELLED_MESSAGE = (
101
101
  # update the state.
102
102
  _FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120
103
103
 
104
+ # After enabling consolidation mode, we need to restart the API server to get
105
+ # the jobs refresh daemon and correct number of executors. We use this file to
106
+ # indicate that the API server has been restarted after enabling consolidation
107
+ # mode.
108
+ _JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE = (
109
+ '~/.sky/.jobs_controller_consolidation_reloaded_signal')
110
+
111
+ # The response fields for managed jobs that require cluster handle
112
+ _CLUSTER_HANDLE_FIELDS = [
113
+ 'cluster_resources',
114
+ 'cluster_resources_full',
115
+ 'cloud',
116
+ 'region',
117
+ 'zone',
118
+ 'infra',
119
+ 'accelerators',
120
+ ]
121
+
122
+ # The response fields for managed jobs that are not stored in the database
123
+ # These fields will be mapped to the DB fields in the `_update_fields`.
124
+ _NON_DB_FIELDS = _CLUSTER_HANDLE_FIELDS + ['user_yaml', 'user_name', 'details']
125
+
104
126
 
105
127
  class ManagedJobQueueResultType(enum.Enum):
106
128
  """The type of the managed job queue result."""
@@ -117,9 +139,8 @@ class UserSignal(enum.Enum):
117
139
 
118
140
  # ====== internal functions ======
119
141
  def terminate_cluster(
120
- cluster_name: str,
121
- max_retry: int = 6,
122
- _logger: logging.Logger = logger, # pylint: disable=invalid-name
142
+ cluster_name: str,
143
+ max_retry: int = 6,
123
144
  ) -> None:
124
145
  """Terminate the cluster."""
125
146
  from sky import core # pylint: disable=import-outside-toplevel
@@ -143,18 +164,18 @@ def terminate_cluster(
143
164
  return
144
165
  except exceptions.ClusterDoesNotExist:
145
166
  # The cluster is already down.
146
- _logger.debug(f'The cluster {cluster_name} is already down.')
167
+ logger.debug(f'The cluster {cluster_name} is already down.')
147
168
  return
148
169
  except Exception as e: # pylint: disable=broad-except
149
170
  retry_cnt += 1
150
171
  if retry_cnt >= max_retry:
151
172
  raise RuntimeError(
152
173
  f'Failed to terminate the cluster {cluster_name}.') from e
153
- _logger.error(
174
+ logger.error(
154
175
  f'Failed to terminate the cluster {cluster_name}. Retrying.'
155
176
  f'Details: {common_utils.format_exception(e)}')
156
177
  with ux_utils.enable_traceback():
157
- _logger.error(f' Traceback: {traceback.format_exc()}')
178
+ logger.error(f' Traceback: {traceback.format_exc()}')
158
179
  time.sleep(backoff.current_backoff())
159
180
 
160
181
 
@@ -174,8 +195,8 @@ def _validate_consolidation_mode_config(
174
195
  'terminate the controller cluster first.'
175
196
  f'{colorama.Style.RESET_ALL}')
176
197
  else:
177
- all_jobs = managed_job_state.get_managed_jobs()
178
- if all_jobs:
198
+ total_jobs = managed_job_state.get_managed_jobs_total()
199
+ if total_jobs > 0:
179
200
  nonterminal_jobs = (
180
201
  managed_job_state.get_nonterminal_job_ids_by_name(
181
202
  None, None, all_users=True))
@@ -190,7 +211,7 @@ def _validate_consolidation_mode_config(
190
211
  else:
191
212
  logger.warning(
192
213
  f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
193
- f'but there are {len(all_jobs)} jobs from previous '
214
+ f'but there are {total_jobs} jobs from previous '
194
215
  'consolidation mode. Reset the `jobs.controller.'
195
216
  'consolidation_mode` to `true` and run `sky jobs queue` '
196
217
  'to see those jobs. Switching to normal mode will '
@@ -202,13 +223,39 @@ def _validate_consolidation_mode_config(
202
223
  # API Server. Under the hood, we submit the job monitoring logic as processes
203
224
  # directly in the API Server.
204
225
  # Use LRU Cache so that the check is only done once.
205
- @annotations.lru_cache(scope='request', maxsize=1)
206
- def is_consolidation_mode() -> bool:
226
+ @annotations.lru_cache(scope='request', maxsize=2)
227
+ def is_consolidation_mode(on_api_restart: bool = False) -> bool:
207
228
  if os.environ.get(constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
208
229
  return True
209
230
 
210
- consolidation_mode = skypilot_config.get_nested(
231
+ config_consolidation_mode = skypilot_config.get_nested(
211
232
  ('jobs', 'controller', 'consolidation_mode'), default_value=False)
233
+
234
+ signal_file = pathlib.Path(
235
+ _JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE).expanduser()
236
+
237
+ restart_signal_file_exists = signal_file.exists()
238
+ consolidation_mode = (config_consolidation_mode and
239
+ restart_signal_file_exists)
240
+
241
+ if on_api_restart:
242
+ if config_consolidation_mode:
243
+ signal_file.touch()
244
+ else:
245
+ if not restart_signal_file_exists:
246
+ if config_consolidation_mode:
247
+ logger.warning(f'{colorama.Fore.YELLOW}Consolidation mode for '
248
+ 'managed jobs is enabled in the server config, '
249
+ 'but the API server has not been restarted yet. '
250
+ 'Please restart the API server to enable it.'
251
+ f'{colorama.Style.RESET_ALL}')
252
+ return False
253
+ elif not config_consolidation_mode:
254
+ # Cleanup the signal file if the consolidation mode is disabled in
255
+ # the config. This allow the user to disable the consolidation mode
256
+ # without restarting the API server.
257
+ signal_file.unlink()
258
+
212
259
  # We should only do this check on API server, as the controller will not
213
260
  # have related config and will always seemingly disabled for consolidation
214
261
  # mode. Check #6611 for more details.
@@ -219,6 +266,12 @@ def is_consolidation_mode() -> bool:
219
266
 
220
267
  def ha_recovery_for_consolidation_mode():
221
268
  """Recovery logic for HA mode."""
269
+ # Touch the signal file here to avoid conflict with
270
+ # update_managed_jobs_statuses. Although we run this first and then start
271
+ # the deamon, this function is also called in cancel_jobs_by_id.
272
+ signal_file = pathlib.Path(
273
+ constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE).expanduser()
274
+ signal_file.touch()
222
275
  # No setup recovery is needed in consolidation mode, as the API server
223
276
  # already has all runtime installed. Directly start jobs recovery here.
224
277
  # Refers to sky/templates/kubernetes-ray.yml.j2 for more details.
@@ -229,7 +282,9 @@ def ha_recovery_for_consolidation_mode():
229
282
  encoding='utf-8') as f:
230
283
  start = time.time()
231
284
  f.write(f'Starting HA recovery at {datetime.datetime.now()}\n')
232
- for job in managed_job_state.get_managed_jobs():
285
+ jobs, _ = managed_job_state.get_managed_jobs_with_filters(
286
+ fields=['job_id', 'controller_pid', 'schedule_state', 'status'])
287
+ for job in jobs:
233
288
  job_id = job['job_id']
234
289
  controller_pid = job['controller_pid']
235
290
 
@@ -265,12 +320,12 @@ def ha_recovery_for_consolidation_mode():
265
320
  f'{datetime.datetime.now()}\n')
266
321
  f.write(f'HA recovery completed at {datetime.datetime.now()}\n')
267
322
  f.write(f'Total recovery time: {time.time() - start} seconds\n')
323
+ signal_file.unlink()
268
324
 
269
325
 
270
326
  async def get_job_status(
271
327
  backend: 'backends.CloudVmRayBackend', cluster_name: str,
272
- job_id: Optional[int],
273
- job_logger: logging.Logger) -> Optional['job_lib.JobStatus']:
328
+ job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
274
329
  """Check the status of the job running on a managed job cluster.
275
330
 
276
331
  It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
@@ -282,26 +337,28 @@ async def get_job_status(
282
337
  if handle is None:
283
338
  # This can happen if the cluster was preempted and background status
284
339
  # refresh already noticed and cleaned it up.
285
- job_logger.info(f'Cluster {cluster_name} not found.')
340
+ logger.info(f'Cluster {cluster_name} not found.')
286
341
  return None
287
342
  assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
288
343
  job_ids = None if job_id is None else [job_id]
289
344
  for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
290
345
  try:
291
- job_logger.info('=== Checking the job status... ===')
292
- statuses = await context_utils.to_thread(backend.get_job_status,
293
- handle,
294
- job_ids=job_ids,
295
- stream_logs=False)
346
+ logger.info('=== Checking the job status... ===')
347
+ statuses = await asyncio.wait_for(
348
+ context_utils.to_thread(backend.get_job_status,
349
+ handle,
350
+ job_ids=job_ids,
351
+ stream_logs=False),
352
+ timeout=_JOB_STATUS_FETCH_TIMEOUT_SECONDS)
296
353
  status = list(statuses.values())[0]
297
354
  if status is None:
298
- job_logger.info('No job found.')
355
+ logger.info('No job found.')
299
356
  else:
300
- job_logger.info(f'Job status: {status}')
301
- job_logger.info('=' * 34)
357
+ logger.info(f'Job status: {status}')
358
+ logger.info('=' * 34)
302
359
  return status
303
360
  except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
304
- ValueError, TypeError) as e:
361
+ ValueError, TypeError, asyncio.TimeoutError) as e:
305
362
  # Note: Each of these exceptions has some additional conditions to
306
363
  # limit how we handle it and whether or not we catch it.
307
364
  # Retry on k8s transient network errors. This is useful when using
@@ -322,6 +379,9 @@ async def get_job_status(
322
379
  is_transient_error = True
323
380
  elif isinstance(e, grpc.FutureTimeoutError):
324
381
  detailed_reason = 'Timeout'
382
+ elif isinstance(e, asyncio.TimeoutError):
383
+ detailed_reason = ('Job status check timed out after '
384
+ f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
325
385
  # TODO(cooperc): Gracefully handle these exceptions in the backend.
326
386
  elif isinstance(e, ValueError):
327
387
  # If the cluster yaml is deleted in the middle of getting the
@@ -405,7 +465,7 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
405
465
  """
406
466
  managed_job_state.remove_ha_recovery_script(job_id)
407
467
  error_msg = None
408
- tasks = managed_job_state.get_managed_jobs(job_id)
468
+ tasks = managed_job_state.get_managed_job_tasks(job_id)
409
469
  for task in tasks:
410
470
  pool = task.get('pool', None)
411
471
  if pool is None:
@@ -474,7 +534,7 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
474
534
 
475
535
  for job_id in job_ids:
476
536
  assert job_id is not None
477
- tasks = managed_job_state.get_managed_jobs(job_id)
537
+ tasks = managed_job_state.get_managed_job_tasks(job_id)
478
538
  # Note: controller_pid and schedule_state are in the job_info table
479
539
  # which is joined to the spot table, so all tasks with the same job_id
480
540
  # will have the same value for these columns. This is what lets us just
@@ -494,9 +554,9 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
494
554
  if schedule_state == managed_job_state.ManagedJobScheduleState.DONE:
495
555
  # There are two cases where we could get a job that is DONE.
496
556
  # 1. At query time (get_jobs_to_check_status), the job was not yet
497
- # DONE, but since then (before get_managed_jobs is called) it has
498
- # hit a terminal status, marked itself done, and exited. This is
499
- # fine.
557
+ # DONE, but since then (before get_managed_job_tasks is called)
558
+ # it has hit a terminal status, marked itself done, and exited.
559
+ # This is fine.
500
560
  # 2. The job is DONE, but in a non-terminal status. This is
501
561
  # unexpected. For instance, the task status is RUNNING, but the
502
562
  # job schedule_state is DONE.
@@ -850,6 +910,14 @@ def cancel_jobs_by_pool(pool_name: str,
850
910
  return cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
851
911
 
852
912
 
913
+ def controller_log_file_for_job(job_id: int,
914
+ create_if_not_exists: bool = False) -> str:
915
+ log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
916
+ if create_if_not_exists:
917
+ os.makedirs(log_dir, exist_ok=True)
918
+ return os.path.join(log_dir, f'{job_id}.log')
919
+
920
+
853
921
  def stream_logs_by_id(job_id: int,
854
922
  follow: bool = True,
855
923
  tail: Optional[int] = None) -> Tuple[str, int]:
@@ -882,13 +950,20 @@ def stream_logs_by_id(job_id: int,
882
950
  if managed_job_status.is_failed():
883
951
  job_msg = ('\nFailure reason: '
884
952
  f'{managed_job_state.get_failure_reason(job_id)}')
885
- log_file_exists = False
953
+ log_file_ever_existed = False
886
954
  task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
887
955
  job_id)
888
956
  num_tasks = len(task_info)
889
- for task_id, task_name, task_status, log_file in task_info:
957
+ for (task_id, task_name, task_status, log_file,
958
+ logs_cleaned_at) in task_info:
890
959
  if log_file:
891
- log_file_exists = True
960
+ log_file_ever_existed = True
961
+ if logs_cleaned_at is not None:
962
+ ts_str = datetime.fromtimestamp(
963
+ logs_cleaned_at).strftime('%Y-%m-%d %H:%M:%S')
964
+ print(f'Task {task_name}({task_id}) log has been '
965
+ f'cleaned at {ts_str}.')
966
+ continue
892
967
  task_str = (f'Task {task_name}({task_id})'
893
968
  if task_name else f'Task {task_id}')
894
969
  if num_tasks > 1:
@@ -923,7 +998,7 @@ def stream_logs_by_id(job_id: int,
923
998
  f'{task_str} finished '
924
999
  f'(status: {task_status.value}).'),
925
1000
  flush=True)
926
- if log_file_exists:
1001
+ if log_file_ever_existed:
927
1002
  # Add the "Job finished" message for terminal states
928
1003
  if managed_job_status.is_terminal():
929
1004
  print(ux_utils.finishing_message(
@@ -1151,7 +1226,8 @@ def stream_logs(job_id: Optional[int],
1151
1226
  if controller:
1152
1227
  if job_id is None:
1153
1228
  assert job_name is not None
1154
- managed_jobs = managed_job_state.get_managed_jobs()
1229
+ managed_jobs, _ = managed_job_state.get_managed_jobs_with_filters(
1230
+ name_match=job_name, fields=['job_id', 'job_name', 'status'])
1155
1231
  # We manually filter the jobs by name, instead of using
1156
1232
  # get_nonterminal_job_ids_by_name, as with `controller=True`, we
1157
1233
  # should be able to show the logs for jobs in terminal states.
@@ -1174,9 +1250,7 @@ def stream_logs(job_id: Optional[int],
1174
1250
  job_id = managed_job_ids.pop()
1175
1251
  assert job_id is not None, (job_id, job_name)
1176
1252
 
1177
- controller_log_path = os.path.join(
1178
- os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
1179
- f'{job_id}.log')
1253
+ controller_log_path = controller_log_file_for_job(job_id)
1180
1254
  job_status = None
1181
1255
 
1182
1256
  # Wait for the log file to be written
@@ -1277,11 +1351,87 @@ def dump_managed_job_queue(
1277
1351
  limit: Optional[int] = None,
1278
1352
  user_hashes: Optional[List[Optional[str]]] = None,
1279
1353
  statuses: Optional[List[str]] = None,
1354
+ fields: Optional[List[str]] = None,
1280
1355
  ) -> str:
1281
1356
  return message_utils.encode_payload(
1282
1357
  get_managed_job_queue(skip_finished, accessible_workspaces, job_ids,
1283
1358
  workspace_match, name_match, pool_match, page,
1284
- limit, user_hashes, statuses))
1359
+ limit, user_hashes, statuses, fields))
1360
+
1361
+
1362
+ def _update_fields(fields: List[str],) -> Tuple[List[str], bool]:
1363
+ """Update the fields list to include the necessary fields.
1364
+
1365
+ Args:
1366
+ fields: The fields to update.
1367
+
1368
+ It will:
1369
+ - Add the necessary dependent fields to the list.
1370
+ - Remove the fields that are not in the DB.
1371
+ - Determine if cluster handle is required.
1372
+
1373
+ Returns:
1374
+ A tuple containing the updated fields and a boolean indicating if
1375
+ cluster handle is required.
1376
+ """
1377
+ cluster_handle_required = True
1378
+ if _cluster_handle_not_required(fields):
1379
+ cluster_handle_required = False
1380
+ # Copy the list to avoid modifying the original list
1381
+ new_fields = fields.copy()
1382
+ # status and job_id are always included
1383
+ if 'status' not in new_fields:
1384
+ new_fields.append('status')
1385
+ if 'job_id' not in new_fields:
1386
+ new_fields.append('job_id')
1387
+ # user_hash is required if user_name is present
1388
+ if 'user_name' in new_fields and 'user_hash' not in new_fields:
1389
+ new_fields.append('user_hash')
1390
+ if 'job_duration' in new_fields:
1391
+ if 'last_recovered_at' not in new_fields:
1392
+ new_fields.append('last_recovered_at')
1393
+ if 'end_at' not in new_fields:
1394
+ new_fields.append('end_at')
1395
+ if 'job_name' in new_fields and 'task_name' not in new_fields:
1396
+ new_fields.append('task_name')
1397
+ if 'details' in new_fields:
1398
+ if 'schedule_state' not in new_fields:
1399
+ new_fields.append('schedule_state')
1400
+ if 'priority' not in new_fields:
1401
+ new_fields.append('priority')
1402
+ if 'failure_reason' not in new_fields:
1403
+ new_fields.append('failure_reason')
1404
+ if 'user_yaml' in new_fields:
1405
+ if 'original_user_yaml_path' not in new_fields:
1406
+ new_fields.append('original_user_yaml_path')
1407
+ if 'original_user_yaml_content' not in new_fields:
1408
+ new_fields.append('original_user_yaml_content')
1409
+ if cluster_handle_required:
1410
+ if 'task_name' not in new_fields:
1411
+ new_fields.append('task_name')
1412
+ if 'current_cluster_name' not in new_fields:
1413
+ new_fields.append('current_cluster_name')
1414
+ # Remove _NON_DB_FIELDS
1415
+ # These fields have been mapped to the DB fields in the above code, so we
1416
+ # don't need to include them in the updated fields.
1417
+ for field in _NON_DB_FIELDS:
1418
+ if field in new_fields:
1419
+ new_fields.remove(field)
1420
+ return new_fields, cluster_handle_required
1421
+
1422
+
1423
+ def _cluster_handle_not_required(fields: List[str]) -> bool:
1424
+ """Determine if cluster handle is not required.
1425
+
1426
+ Args:
1427
+ fields: The fields to check if they contain any of the cluster handle
1428
+ fields.
1429
+
1430
+ Returns:
1431
+ True if the fields do not contain any of the cluster handle fields,
1432
+ False otherwise.
1433
+ """
1434
+ return not any(field in fields for field in _CLUSTER_HANDLE_FIELDS)
1285
1435
 
1286
1436
 
1287
1437
  def get_managed_job_queue(
@@ -1295,146 +1445,153 @@ def get_managed_job_queue(
1295
1445
  limit: Optional[int] = None,
1296
1446
  user_hashes: Optional[List[Optional[str]]] = None,
1297
1447
  statuses: Optional[List[str]] = None,
1448
+ fields: Optional[List[str]] = None,
1298
1449
  ) -> Dict[str, Any]:
1299
- # Make sure to get all jobs - some logic below (e.g. high priority job
1300
- # detection) requires a full view of the jobs table.
1301
- jobs = managed_job_state.get_managed_jobs()
1450
+ """Get the managed job queue.
1302
1451
 
1303
- # Figure out what the highest priority blocking job is. We need to know in
1304
- # order to determine if other jobs are blocked by a higher priority job, or
1305
- # just by the limited controller resources.
1306
- highest_blocking_priority = constants.MIN_PRIORITY
1307
- for job in jobs:
1308
- if job['schedule_state'] not in (
1309
- # LAUNCHING and ALIVE_BACKOFF jobs will block other jobs with
1310
- # lower priority.
1311
- managed_job_state.ManagedJobScheduleState.LAUNCHING,
1312
- managed_job_state.ManagedJobScheduleState.ALIVE_BACKOFF,
1313
- # It's possible for a WAITING/ALIVE_WAITING job to be ready to
1314
- # launch, but the scheduler just hasn't run yet.
1315
- managed_job_state.ManagedJobScheduleState.WAITING,
1316
- managed_job_state.ManagedJobScheduleState.ALIVE_WAITING):
1317
- # This job will not block others.
1318
- continue
1319
-
1320
- priority = job.get('priority')
1321
- if priority is not None and priority > highest_blocking_priority:
1322
- highest_blocking_priority = priority
1452
+ Args:
1453
+ skip_finished: Whether to skip finished jobs.
1454
+ accessible_workspaces: The accessible workspaces.
1455
+ job_ids: The job ids.
1456
+ workspace_match: The workspace name to match.
1457
+ name_match: The job name to match.
1458
+ pool_match: The pool name to match.
1459
+ page: The page number.
1460
+ limit: The limit number.
1461
+ user_hashes: The user hashes.
1462
+ statuses: The statuses.
1463
+ fields: The fields to include in the response.
1323
1464
 
1324
- total_no_filter = len(jobs)
1465
+ Returns:
1466
+ A dictionary containing the managed job queue.
1467
+ """
1468
+ cluster_handle_required = True
1469
+ updated_fields = None
1470
+ # The caller only need to specify the fields in the
1471
+ # `class ManagedJobRecord` in `response.py`, and the `_update_fields`
1472
+ # function will add the necessary dependent fields to the list, for
1473
+ # example, if the caller specifies `['user_name']`, the `_update_fields`
1474
+ # function will add `['user_hash']` to the list.
1475
+ if fields:
1476
+ updated_fields, cluster_handle_required = _update_fields(fields)
1477
+
1478
+ total_no_filter = managed_job_state.get_managed_jobs_total()
1479
+
1480
+ status_counts = managed_job_state.get_status_count_with_filters(
1481
+ fields=fields,
1482
+ job_ids=job_ids,
1483
+ accessible_workspaces=accessible_workspaces,
1484
+ workspace_match=workspace_match,
1485
+ name_match=name_match,
1486
+ pool_match=pool_match,
1487
+ user_hashes=user_hashes,
1488
+ skip_finished=skip_finished,
1489
+ )
1490
+
1491
+ jobs, total = managed_job_state.get_managed_jobs_with_filters(
1492
+ fields=updated_fields,
1493
+ job_ids=job_ids,
1494
+ accessible_workspaces=accessible_workspaces,
1495
+ workspace_match=workspace_match,
1496
+ name_match=name_match,
1497
+ pool_match=pool_match,
1498
+ user_hashes=user_hashes,
1499
+ statuses=statuses,
1500
+ skip_finished=skip_finished,
1501
+ page=page,
1502
+ limit=limit,
1503
+ )
1504
+
1505
+ if cluster_handle_required:
1506
+ # Fetch the cluster name to handle map for managed clusters only.
1507
+ cluster_name_to_handle = (
1508
+ global_user_state.get_cluster_name_to_handle_map(is_managed=True))
1325
1509
 
1326
- if user_hashes:
1327
- jobs = [
1328
- job for job in jobs if job.get('user_hash', None) in user_hashes
1329
- ]
1330
- if accessible_workspaces:
1331
- jobs = [
1332
- job for job in jobs
1333
- if job.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE) in
1334
- accessible_workspaces
1335
- ]
1336
- if skip_finished:
1337
- # Filter out the finished jobs. If a multi-task job is partially
1338
- # finished, we will include all its tasks.
1339
- non_finished_tasks = list(
1340
- filter(
1341
- lambda job: not managed_job_state.ManagedJobStatus(job[
1342
- 'status']).is_terminal(), jobs))
1343
- non_finished_job_ids = {job['job_id'] for job in non_finished_tasks}
1344
- jobs = list(
1345
- filter(lambda job: job['job_id'] in non_finished_job_ids, jobs))
1346
- if job_ids:
1347
- jobs = [job for job in jobs if job['job_id'] in job_ids]
1348
-
1349
- jobs, total, status_counts = filter_jobs(jobs,
1350
- workspace_match,
1351
- name_match,
1352
- pool_match,
1353
- page,
1354
- limit,
1355
- statuses=statuses)
1356
-
1357
- job_ids = set(job['job_id'] for job in jobs)
1358
- job_id_to_pool_info = (
1359
- managed_job_state.get_pool_and_submit_info_from_job_ids(job_ids))
1360
- cluster_names: Dict[int, str] = {}
1361
- for job in jobs:
1362
- # pool info is (pool, cluster_name, job_id_on_pool_cluster)
1363
- pool_info = job_id_to_pool_info.get(job['job_id'], None)
1364
- if pool_info and pool_info[0]:
1365
- cluster_name = pool_info[1]
1366
- else:
1367
- cluster_name = generate_managed_job_cluster_name(
1368
- job['task_name'], job['job_id'])
1369
- cluster_names[job['job_id']] = cluster_name
1370
- cluster_name_to_handles = global_user_state.get_handles_from_cluster_names(
1371
- set(cluster_names.values()))
1510
+ highest_blocking_priority = constants.MIN_PRIORITY
1511
+ if not fields or 'details' in fields:
1512
+ # Figure out what the highest priority blocking job is. We need to know
1513
+ # in order to determine if other jobs are blocked by a higher priority
1514
+ # job, or just by the limited controller resources.
1515
+ highest_blocking_priority = (
1516
+ managed_job_state.get_managed_jobs_highest_priority())
1372
1517
 
1373
1518
  for job in jobs:
1374
- end_at = job['end_at']
1375
- if end_at is None:
1376
- end_at = time.time()
1377
-
1378
- job_submitted_at = job['last_recovered_at'] - job['job_duration']
1379
- if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
1380
- # When job is recovering, the duration is exact job['job_duration']
1381
- job_duration = job['job_duration']
1382
- elif job_submitted_at > 0:
1383
- job_duration = end_at - job_submitted_at
1384
- else:
1385
- # When job_start_at <= 0, that means the last_recovered_at is not
1386
- # set yet, i.e. the job is not started.
1387
- job_duration = 0
1388
- job['job_duration'] = job_duration
1519
+ if not fields or 'job_duration' in fields:
1520
+ end_at = job['end_at']
1521
+ if end_at is None:
1522
+ end_at = time.time()
1523
+
1524
+ job_submitted_at = job['last_recovered_at'] - job['job_duration']
1525
+ if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
1526
+ # When job is recovering, the duration is exact
1527
+ # job['job_duration']
1528
+ job_duration = job['job_duration']
1529
+ elif job_submitted_at > 0:
1530
+ job_duration = end_at - job_submitted_at
1531
+ else:
1532
+ # When job_start_at <= 0, that means the last_recovered_at
1533
+ # is not set yet, i.e. the job is not started.
1534
+ job_duration = 0
1535
+ job['job_duration'] = job_duration
1389
1536
  job['status'] = job['status'].value
1390
- job['schedule_state'] = job['schedule_state'].value
1391
-
1392
- cluster_name = cluster_names[job['job_id']]
1393
- handle = cluster_name_to_handles.get(cluster_name, None)
1394
- if isinstance(handle, backends.CloudVmRayResourceHandle):
1395
- resources_str = resources_utils.get_readable_resources_repr(
1396
- handle, simplify=True)
1397
- resources_str_full = resources_utils.get_readable_resources_repr(
1398
- handle, simplify=False)
1399
- job['cluster_resources'] = resources_str
1400
- job['cluster_resources_full'] = resources_str_full
1401
- job['cloud'] = str(handle.launched_resources.cloud)
1402
- job['region'] = handle.launched_resources.region
1403
- job['zone'] = handle.launched_resources.zone
1404
- job['infra'] = infra_utils.InfraInfo(
1405
- str(handle.launched_resources.cloud),
1406
- handle.launched_resources.region,
1407
- handle.launched_resources.zone).formatted_str()
1408
- job['accelerators'] = handle.launched_resources.accelerators
1537
+ if not fields or 'schedule_state' in fields:
1538
+ job['schedule_state'] = job['schedule_state'].value
1409
1539
  else:
1410
- # FIXME(zongheng): display the last cached values for these.
1411
- job['cluster_resources'] = '-'
1412
- job['cluster_resources_full'] = '-'
1413
- job['cloud'] = '-'
1414
- job['region'] = '-'
1415
- job['zone'] = '-'
1416
- job['infra'] = '-'
1417
-
1418
- # Add details about schedule state / backoff.
1419
- state_details = None
1420
- if job['schedule_state'] == 'ALIVE_BACKOFF':
1421
- state_details = 'In backoff, waiting for resources'
1422
- elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
1423
- priority = job.get('priority')
1424
- if (priority is not None and priority < highest_blocking_priority):
1425
- # Job is lower priority than some other blocking job.
1426
- state_details = 'Waiting for higher priority jobs to launch'
1540
+ job['schedule_state'] = None
1541
+
1542
+ if cluster_handle_required:
1543
+ cluster_name = job.get('current_cluster_name', None)
1544
+ if cluster_name is None:
1545
+ cluster_name = generate_managed_job_cluster_name(
1546
+ job['task_name'], job['job_id'])
1547
+ handle = cluster_name_to_handle.get(
1548
+ cluster_name, None) if cluster_name is not None else None
1549
+ if isinstance(handle, backends.CloudVmRayResourceHandle):
1550
+ resources_str_simple, resources_str_full = (
1551
+ resources_utils.get_readable_resources_repr(
1552
+ handle, simplified_only=False))
1553
+ assert resources_str_full is not None
1554
+ job['cluster_resources'] = resources_str_simple
1555
+ job['cluster_resources_full'] = resources_str_full
1556
+ job['cloud'] = str(handle.launched_resources.cloud)
1557
+ job['region'] = handle.launched_resources.region
1558
+ job['zone'] = handle.launched_resources.zone
1559
+ job['infra'] = infra_utils.InfraInfo(
1560
+ str(handle.launched_resources.cloud),
1561
+ handle.launched_resources.region,
1562
+ handle.launched_resources.zone).formatted_str()
1563
+ job['accelerators'] = handle.launched_resources.accelerators
1427
1564
  else:
1428
- state_details = 'Waiting for other jobs to launch'
1429
-
1430
- if state_details and job['failure_reason']:
1431
- job['details'] = f'{state_details} - {job["failure_reason"]}'
1432
- elif state_details:
1433
- job['details'] = state_details
1434
- elif job['failure_reason']:
1435
- job['details'] = f'Failure: {job["failure_reason"]}'
1436
- else:
1437
- job['details'] = None
1565
+ # FIXME(zongheng): display the last cached values for these.
1566
+ job['cluster_resources'] = '-'
1567
+ job['cluster_resources_full'] = '-'
1568
+ job['cloud'] = '-'
1569
+ job['region'] = '-'
1570
+ job['zone'] = '-'
1571
+ job['infra'] = '-'
1572
+
1573
+ if not fields or 'details' in fields:
1574
+ # Add details about schedule state / backoff.
1575
+ state_details = None
1576
+ if job['schedule_state'] == 'ALIVE_BACKOFF':
1577
+ state_details = 'In backoff, waiting for resources'
1578
+ elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
1579
+ priority = job.get('priority')
1580
+ if (priority is not None and
1581
+ priority < highest_blocking_priority):
1582
+ # Job is lower priority than some other blocking job.
1583
+ state_details = 'Waiting for higher priority jobs to launch'
1584
+ else:
1585
+ state_details = 'Waiting for other jobs to launch'
1586
+
1587
+ if state_details and job['failure_reason']:
1588
+ job['details'] = f'{state_details} - {job["failure_reason"]}'
1589
+ elif state_details:
1590
+ job['details'] = state_details
1591
+ elif job['failure_reason']:
1592
+ job['details'] = f'Failure: {job["failure_reason"]}'
1593
+ else:
1594
+ job['details'] = None
1438
1595
 
1439
1596
  return {
1440
1597
  'jobs': jobs,
@@ -1545,21 +1702,14 @@ def load_managed_job_queue(
1545
1702
  total_no_filter = total
1546
1703
  result_type = ManagedJobQueueResultType.LIST
1547
1704
 
1548
- job_id_to_user_hash: Dict[int, str] = {}
1705
+ all_users = global_user_state.get_all_users()
1706
+ all_users_map = {user.id: user.name for user in all_users}
1549
1707
  for job in jobs:
1708
+ job['status'] = managed_job_state.ManagedJobStatus(job['status'])
1550
1709
  if 'user_hash' in job and job['user_hash'] is not None:
1551
1710
  # Skip jobs that do not have user_hash info.
1552
1711
  # TODO(cooperc): Remove check before 0.12.0.
1553
- job_id_to_user_hash[job['job_id']] = job['user_hash']
1554
- user_hash_to_user = global_user_state.get_users(
1555
- job_id_to_user_hash.values())
1556
-
1557
- for job in jobs:
1558
- job['status'] = managed_job_state.ManagedJobStatus(job['status'])
1559
- if job['job_id'] in job_id_to_user_hash:
1560
- user_hash = job_id_to_user_hash[job['job_id']]
1561
- user = user_hash_to_user.get(user_hash, None)
1562
- job['user_name'] = user.name if user is not None else None
1712
+ job['user_name'] = all_users_map.get(job['user_hash'])
1563
1713
  return jobs, total, result_type, total_no_filter, status_counts
1564
1714
 
1565
1715
 
@@ -1584,29 +1734,40 @@ def _get_job_status_from_tasks(
1584
1734
 
1585
1735
 
1586
1736
  @typing.overload
1587
- def format_job_table(tasks: List[Dict[str, Any]],
1588
- show_all: bool,
1589
- show_user: bool,
1590
- return_rows: Literal[False] = False,
1591
- max_jobs: Optional[int] = None) -> str:
1737
+ def format_job_table(
1738
+ tasks: List[Dict[str, Any]],
1739
+ show_all: bool,
1740
+ show_user: bool,
1741
+ return_rows: Literal[False] = False,
1742
+ pool_status: Optional[List[Dict[str, Any]]] = None,
1743
+ max_jobs: Optional[int] = None,
1744
+ job_status_counts: Optional[Dict[str, int]] = None,
1745
+ ) -> str:
1592
1746
  ...
1593
1747
 
1594
1748
 
1595
1749
  @typing.overload
1596
- def format_job_table(tasks: List[Dict[str, Any]],
1597
- show_all: bool,
1598
- show_user: bool,
1599
- return_rows: Literal[True],
1600
- max_jobs: Optional[int] = None) -> List[List[str]]:
1750
+ def format_job_table(
1751
+ tasks: List[Dict[str, Any]],
1752
+ show_all: bool,
1753
+ show_user: bool,
1754
+ return_rows: Literal[True],
1755
+ pool_status: Optional[List[Dict[str, Any]]] = None,
1756
+ max_jobs: Optional[int] = None,
1757
+ job_status_counts: Optional[Dict[str, int]] = None,
1758
+ ) -> List[List[str]]:
1601
1759
  ...
1602
1760
 
1603
1761
 
1604
1762
  def format_job_table(
1605
- tasks: List[Dict[str, Any]],
1606
- show_all: bool,
1607
- show_user: bool,
1608
- return_rows: bool = False,
1609
- max_jobs: Optional[int] = None) -> Union[str, List[List[str]]]:
1763
+ tasks: List[Dict[str, Any]],
1764
+ show_all: bool,
1765
+ show_user: bool,
1766
+ return_rows: bool = False,
1767
+ pool_status: Optional[List[Dict[str, Any]]] = None,
1768
+ max_jobs: Optional[int] = None,
1769
+ job_status_counts: Optional[Dict[str, int]] = None,
1770
+ ) -> Union[str, List[List[str]]]:
1610
1771
  """Returns managed jobs as a formatted string.
1611
1772
 
1612
1773
  Args:
@@ -1615,6 +1776,8 @@ def format_job_table(
1615
1776
  max_jobs: The maximum number of jobs to show in the table.
1616
1777
  return_rows: If True, return the rows as a list of strings instead of
1617
1778
  all rows concatenated into a single string.
1779
+ pool_status: List of pool status dictionaries with replica_info.
1780
+ job_status_counts: The counts of each job status.
1618
1781
 
1619
1782
  Returns: A formatted string of managed jobs, if not `return_rows`; otherwise
1620
1783
  a list of "rows" (each of which is a list of str).
@@ -1631,17 +1794,37 @@ def format_job_table(
1631
1794
  return (task['user'], task['job_id'])
1632
1795
  return task['job_id']
1633
1796
 
1797
+ def _get_job_id_to_worker_map(
1798
+ pool_status: Optional[List[Dict[str, Any]]]) -> Dict[int, int]:
1799
+ """Create a mapping from job_id to worker replica_id.
1800
+
1801
+ Args:
1802
+ pool_status: List of pool status dictionaries with replica_info.
1803
+
1804
+ Returns:
1805
+ Dictionary mapping job_id to replica_id (worker ID).
1806
+ """
1807
+ job_to_worker: Dict[int, int] = {}
1808
+ if pool_status is None:
1809
+ return job_to_worker
1810
+ for pool in pool_status:
1811
+ replica_info = pool.get('replica_info', [])
1812
+ for replica in replica_info:
1813
+ used_by = replica.get('used_by')
1814
+ if used_by is not None:
1815
+ job_to_worker[used_by] = replica.get('replica_id')
1816
+ return job_to_worker
1817
+
1818
+ # Create mapping from job_id to worker replica_id
1819
+ job_to_worker = _get_job_id_to_worker_map(pool_status)
1820
+
1634
1821
  for task in tasks:
1635
1822
  # The tasks within the same job_id are already sorted
1636
1823
  # by the task_id.
1637
1824
  jobs[get_hash(task)].append(task)
1638
1825
 
1639
- status_counts: Dict[str, int] = collections.defaultdict(int)
1640
1826
  workspaces = set()
1641
1827
  for job_tasks in jobs.values():
1642
- managed_job_status = _get_job_status_from_tasks(job_tasks)[0]
1643
- if not managed_job_status.is_terminal():
1644
- status_counts[managed_job_status.value] += 1
1645
1828
  workspaces.add(job_tasks[0].get('workspace',
1646
1829
  constants.SKYPILOT_DEFAULT_WORKSPACE))
1647
1830
 
@@ -1684,9 +1867,15 @@ def format_job_table(
1684
1867
  job_table = log_utils.create_table(columns)
1685
1868
 
1686
1869
  status_counts: Dict[str, int] = collections.defaultdict(int)
1687
- for task in tasks:
1688
- if not task['status'].is_terminal():
1689
- status_counts[task['status'].value] += 1
1870
+ if job_status_counts:
1871
+ for status_value, count in job_status_counts.items():
1872
+ status = managed_job_state.ManagedJobStatus(status_value)
1873
+ if not status.is_terminal():
1874
+ status_counts[status_value] = count
1875
+ else:
1876
+ for task in tasks:
1877
+ if not task['status'].is_terminal():
1878
+ status_counts[task['status'].value] += 1
1690
1879
 
1691
1880
  all_tasks = tasks
1692
1881
  if max_jobs is not None:
@@ -1772,7 +1961,12 @@ def format_job_table(
1772
1961
  if pool is None:
1773
1962
  pool = '-'
1774
1963
 
1964
+ # Add worker information if job is assigned to a worker
1775
1965
  job_id = job_hash[1] if tasks_have_k8s_user else job_hash
1966
+ # job_id is now always an integer, use it to look up worker
1967
+ if job_id in job_to_worker and pool != '-':
1968
+ pool = f'{pool} (worker={job_to_worker[job_id]})'
1969
+
1776
1970
  job_values = [
1777
1971
  job_id,
1778
1972
  '',
@@ -1815,6 +2009,12 @@ def format_job_table(
1815
2009
  pool = task.get('pool')
1816
2010
  if pool is None:
1817
2011
  pool = '-'
2012
+
2013
+ # Add worker information if task is assigned to a worker
2014
+ task_job_id = task['job_id']
2015
+ if task_job_id in job_to_worker and pool != '-':
2016
+ pool = f'{pool} (worker={job_to_worker[task_job_id]})'
2017
+
1818
2018
  values = [
1819
2019
  task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
1820
2020
  task['task_id'] if len(job_tasks) > 1 else '-',
@@ -1934,7 +2134,8 @@ def _job_proto_to_dict(
1934
2134
  # and Protobuf encodes int64 as decimal strings in JSON,
1935
2135
  # so we need to convert them back to ints.
1936
2136
  # https://protobuf.dev/programming-guides/json/#field-representation
1937
- if field.type == descriptor.FieldDescriptor.TYPE_INT64:
2137
+ if (field.type == descriptor.FieldDescriptor.TYPE_INT64 and
2138
+ job_dict.get(field.name) is not None):
1938
2139
  job_dict[field.name] = int(job_dict[field.name])
1939
2140
  job_dict['status'] = managed_job_state.ManagedJobStatus.from_protobuf(
1940
2141
  job_dict['status'])
@@ -1978,6 +2179,7 @@ class ManagedJobCodeGen:
1978
2179
  limit: Optional[int] = None,
1979
2180
  user_hashes: Optional[List[Optional[str]]] = None,
1980
2181
  statuses: Optional[List[str]] = None,
2182
+ fields: Optional[List[str]] = None,
1981
2183
  ) -> str:
1982
2184
  code = textwrap.dedent(f"""\
1983
2185
  if managed_job_version < 9:
@@ -1996,7 +2198,7 @@ class ManagedJobCodeGen:
1996
2198
  page={page!r},
1997
2199
  limit={limit!r},
1998
2200
  user_hashes={user_hashes!r})
1999
- else:
2201
+ elif managed_job_version < 12:
2000
2202
  job_table = utils.dump_managed_job_queue(
2001
2203
  skip_finished={skip_finished},
2002
2204
  accessible_workspaces={accessible_workspaces!r},
@@ -2008,6 +2210,19 @@ class ManagedJobCodeGen:
2008
2210
  limit={limit!r},
2009
2211
  user_hashes={user_hashes!r},
2010
2212
  statuses={statuses!r})
2213
+ else:
2214
+ job_table = utils.dump_managed_job_queue(
2215
+ skip_finished={skip_finished},
2216
+ accessible_workspaces={accessible_workspaces!r},
2217
+ job_ids={job_ids!r},
2218
+ workspace_match={workspace_match!r},
2219
+ name_match={name_match!r},
2220
+ pool_match={pool_match!r},
2221
+ page={page!r},
2222
+ limit={limit!r},
2223
+ user_hashes={user_hashes!r},
2224
+ statuses={statuses!r},
2225
+ fields={fields!r})
2011
2226
  print(job_table, flush=True)
2012
2227
  """)
2013
2228
  return cls._build(code)
@@ -2075,6 +2290,18 @@ class ManagedJobCodeGen:
2075
2290
  """)
2076
2291
  return cls._build(code)
2077
2292
 
2293
+ @classmethod
2294
+ def get_version(cls) -> str:
2295
+ """Generate code to get controller version."""
2296
+ code = textwrap.dedent("""\
2297
+ from sky.skylet import constants as controller_constants
2298
+
2299
+ # Get controller version
2300
+ controller_version = controller_constants.SKYLET_VERSION
2301
+ print(f"controller_version:{controller_version}", flush=True)
2302
+ """)
2303
+ return cls._build(code)
2304
+
2078
2305
  @classmethod
2079
2306
  def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
2080
2307
  code = textwrap.dedent(f"""\
@@ -2112,8 +2339,12 @@ class ManagedJobCodeGen:
2112
2339
  return cls._build(code)
2113
2340
 
2114
2341
  @classmethod
2115
- def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag',
2116
- workspace: str, entrypoint: str) -> str:
2342
+ def set_pending(cls,
2343
+ job_id: int,
2344
+ managed_job_dag: 'dag_lib.Dag',
2345
+ workspace: str,
2346
+ entrypoint: str,
2347
+ user_hash: Optional[str] = None) -> str:
2117
2348
  dag_name = managed_job_dag.name
2118
2349
  pool = managed_job_dag.pool
2119
2350
  # Add the managed job to queue table.
@@ -2130,6 +2361,8 @@ class ManagedJobCodeGen:
2130
2361
  pool_hash = serve_state.get_service_hash({pool!r})
2131
2362
  set_job_info_kwargs['pool'] = {pool!r}
2132
2363
  set_job_info_kwargs['pool_hash'] = pool_hash
2364
+ if managed_job_version >= 11:
2365
+ set_job_info_kwargs['user_hash'] = {user_hash!r}
2133
2366
  managed_job_state.set_job_info(
2134
2367
  {job_id}, {dag_name!r}, **set_job_info_kwargs)
2135
2368
  """)