skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/utils.py CHANGED
@@ -4,60 +4,86 @@ NOTE: whenever an API change is made in this file, we need to bump the
 jobs.constants.MANAGED_JOBS_VERSION and handle the API change in the
 ManagedJobCodeGen.
 """
+import asyncio
 import collections
+from datetime import datetime
 import enum
 import os
 import pathlib
+import re
 import shlex
 import textwrap
 import time
 import traceback
 import typing
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
+from typing import (Any, Deque, Dict, Iterable, List, Literal, Optional, Set,
+                    TextIO, Tuple, Union)
 
 import colorama
 import filelock
-from typing_extensions import Literal
 
 from sky import backends
 from sky import exceptions
 from sky import global_user_state
 from sky import sky_logging
+from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
 from sky.backends import backend_utils
+from sky.backends import cloud_vm_ray_backend
 from sky.jobs import constants as managed_job_constants
 from sky.jobs import scheduler
 from sky.jobs import state as managed_job_state
+from sky.schemas.api import responses
 from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.skylet import log_lib
 from sky.usage import usage_lib
+from sky.utils import annotations
 from sky.utils import common_utils
+from sky.utils import context_utils
+from sky.utils import controller_utils
+from sky.utils import infra_utils
 from sky.utils import log_utils
 from sky.utils import message_utils
+from sky.utils import resources_utils
 from sky.utils import rich_utils
 from sky.utils import subprocess_utils
 from sky.utils import ux_utils
 
 if typing.TYPE_CHECKING:
+    from google.protobuf import descriptor
+    from google.protobuf import json_format
+    import grpc
     import psutil
 
     import sky
     from sky import dag as dag_lib
+    from sky.schemas.generated import jobsv1_pb2
+    from sky.schemas.generated import managed_jobsv1_pb2
 else:
+    json_format = adaptors_common.LazyImport('google.protobuf.json_format')
+    descriptor = adaptors_common.LazyImport('google.protobuf.descriptor')
     psutil = adaptors_common.LazyImport('psutil')
+    grpc = adaptors_common.LazyImport('grpc')
+    jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
+    managed_jobsv1_pb2 = adaptors_common.LazyImport(
+        'sky.schemas.generated.managed_jobsv1_pb2')
 
 logger = sky_logging.init_logger(__name__)
 
-SIGNAL_FILE_PREFIX = '/tmp/sky_jobs_controller_signal_{}'
 # Controller checks its job's status every this many seconds.
-JOB_STATUS_CHECK_GAP_SECONDS = 20
+# This is a tradeoff between the latency and the resource usage.
+JOB_STATUS_CHECK_GAP_SECONDS = 15
 
 # Controller checks if its job has started every this many seconds.
 JOB_STARTED_STATUS_CHECK_GAP_SECONDS = 5
 
 _LOG_STREAM_CHECK_CONTROLLER_GAP_SECONDS = 5
 
+_JOB_STATUS_FETCH_MAX_RETRIES = 3
+_JOB_K8S_TRANSIENT_NW_MSG = 'Unable to connect to the server: dial tcp'
+_JOB_STATUS_FETCH_TIMEOUT_SECONDS = 30
+
 _JOB_WAITING_STATUS_MESSAGE = ux_utils.spinner_message(
     'Waiting for task to start[/]'
     '{status_str}. It may take a few minutes.\n'
@@ -72,7 +98,35 @@ _JOB_CANCELLED_MESSAGE = (
 # blocking for a long time. This should be significantly longer than the
 # JOB_STATUS_CHECK_GAP_SECONDS to avoid timing out before the controller can
 # update the state.
-_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 40
+_FINAL_JOB_STATUS_WAIT_TIMEOUT_SECONDS = 120
+
+# After enabling consolidation mode, we need to restart the API server to get
+# the jobs refresh daemon and the correct number of executors. We use this
+# file to indicate that the API server has been restarted after enabling
+# consolidation mode.
+_JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE = (
+    '~/.sky/.jobs_controller_consolidation_reloaded_signal')
+
+# The response fields for managed jobs that require the cluster handle.
+_CLUSTER_HANDLE_FIELDS = [
+    'cluster_resources',
+    'cluster_resources_full',
+    'cloud',
+    'region',
+    'zone',
+    'infra',
+    'accelerators',
+]
+
+# The response fields for managed jobs that are not stored in the database.
+# These fields will be mapped to the DB fields in `_update_fields`.
+_NON_DB_FIELDS = _CLUSTER_HANDLE_FIELDS + ['user_yaml', 'user_name', 'details']
+
+
+class ManagedJobQueueResultType(enum.Enum):
+    """The type of the managed job queue result."""
+    DICT = 'DICT'
+    LIST = 'LIST'
 
 
 class UserSignal(enum.Enum):
@@ -83,7 +137,10 @@ class UserSignal(enum.Enum):
 
 
 # ====== internal functions ======
-def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
+def terminate_cluster(
+    cluster_name: str,
+    max_retry: int = 6,
+) -> None:
     """Terminate the cluster."""
     from sky import core  # pylint: disable=import-outside-toplevel
     retry_cnt = 0
@@ -121,43 +178,313 @@ def terminate_cluster(cluster_name: str, max_retry: int = 6) -> None:
             time.sleep(backoff.current_backoff())
 
 
-def get_job_status(backend: 'backends.CloudVmRayBackend',
-                   cluster_name: str) -> Optional['job_lib.JobStatus']:
+def _validate_consolidation_mode_config(
+        current_is_consolidation_mode: bool) -> None:
+    """Validate the consolidation mode config."""
+    # Check whether the consolidation mode config is changed.
+    if current_is_consolidation_mode:
+        controller_cn = (
+            controller_utils.Controllers.JOBS_CONTROLLER.value.cluster_name)
+        if global_user_state.cluster_with_name_exists(controller_cn):
+            logger.warning(
+                f'{colorama.Fore.RED}Consolidation mode for jobs is enabled, '
+                f'but the controller cluster {controller_cn} is still running. '
+                'Please terminate the controller cluster first.'
+                f'{colorama.Style.RESET_ALL}')
+    else:
+        total_jobs = managed_job_state.get_managed_jobs_total()
+        if total_jobs > 0:
+            nonterminal_jobs = (
+                managed_job_state.get_nonterminal_job_ids_by_name(
+                    None, None, all_users=True))
+            if nonterminal_jobs:
+                logger.warning(
+                    f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
+                    f'but there are still {len(nonterminal_jobs)} managed jobs '
+                    'running. Please terminate those jobs first.'
+                    f'{colorama.Style.RESET_ALL}')
+            else:
+                logger.warning(
+                    f'{colorama.Fore.YELLOW}Consolidation mode is disabled, '
+                    f'but there are {total_jobs} jobs from previous '
+                    'consolidation mode. Reset the `jobs.controller.'
+                    'consolidation_mode` to `true` and run `sky jobs queue` '
+                    'to see those jobs. Switching to normal mode will '
+                    f'lose the job history.{colorama.Style.RESET_ALL}')
+
+
+# Whether to use consolidation mode or not. When this is enabled, the managed
+# jobs controller will not be running on a separate cluster, but locally on the
+# API Server. Under the hood, we submit the job monitoring logic as processes
+# directly in the API Server.
+# Use LRU Cache so that the check is only done once.
+@annotations.lru_cache(scope='request', maxsize=2)
+def is_consolidation_mode(on_api_restart: bool = False) -> bool:
+    if os.environ.get(constants.OVERRIDE_CONSOLIDATION_MODE) is not None:
+        return True
+
+    config_consolidation_mode = skypilot_config.get_nested(
+        ('jobs', 'controller', 'consolidation_mode'), default_value=False)
+
+    signal_file = pathlib.Path(
+        _JOBS_CONSOLIDATION_RELOADED_SIGNAL_FILE).expanduser()
+
+    if on_api_restart:
+        if config_consolidation_mode:
+            signal_file.touch()
+    else:
+        restart_signal_file_exists = signal_file.exists()
+        if not restart_signal_file_exists:
+            if config_consolidation_mode:
+                logger.warning(f'{colorama.Fore.YELLOW}Consolidation mode for '
+                               'managed jobs is enabled in the server config, '
+                               'but the API server has not been restarted yet. '
+                               'Please restart the API server to enable it.'
+                               f'{colorama.Style.RESET_ALL}')
+            return False
+        elif not config_consolidation_mode:
+            # Clean up the signal file if consolidation mode is disabled in
+            # the config. This allows the user to disable consolidation mode
+            # without restarting the API server.
+            signal_file.unlink()
+
+    # We should only do this check on the API server, as the controller will
+    # not have the related config and would always appear to have consolidation
+    # mode disabled. Check #6611 for more details.
+    if os.environ.get(constants.ENV_VAR_IS_SKYPILOT_SERVER) is not None:
+        _validate_consolidation_mode_config(config_consolidation_mode)
+    return config_consolidation_mode
+
+
+def ha_recovery_for_consolidation_mode() -> None:
+    """Recovery logic for consolidation mode.
+
+    This should only be called from the managed-job-status-refresh daemon, so
+    that we have the correct ordering: recovery -> controller start -> job
+    status updates. This also ensures correct operation during a rolling
+    update.
+    """
+    # No setup recovery is needed in consolidation mode, as the API server
+    # already has all runtime installed. Directly start jobs recovery here.
+    # Refer to sky/templates/kubernetes-ray.yml.j2 for more details.
+    scheduler.maybe_start_controllers()
+    with open(constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format('jobs_'),
+              'a',
+              encoding='utf-8') as f:
+        start = time.time()
+        f.write(f'Starting HA recovery at {datetime.now()}\n')
+        jobs, _ = managed_job_state.get_managed_jobs_with_filters(fields=[
+            'job_id', 'controller_pid', 'controller_pid_started_at',
+            'schedule_state', 'status'
+        ])
+        for job in jobs:
+            job_id = job['job_id']
+            controller_pid = job['controller_pid']
+            controller_pid_started_at = job.get('controller_pid_started_at')
+
+            # In consolidation mode, it is possible that only the API server
+            # process is restarted, and the controller process is not. In that
+            # case, we don't need to do anything and the controller process
+            # will just keep running. However, in most cases, the controller
+            # process will also be stopped - either by a pod restart of the
+            # k8s API server, or by `sky api stop`, which will stop
+            # controllers.
+            # TODO(cooperc): Make sure we cannot have a controller process
+            # running across API server restarts for consistency.
+            if controller_pid is not None:
+                try:
+                    # Note: We provide the legacy job id to
+                    # controller_process_alive just in case, but we shouldn't
+                    # have a running legacy job controller process at this
+                    # point.
+                    if controller_process_alive(
+                            managed_job_state.ControllerPidRecord(
+                                pid=controller_pid,
+                                started_at=controller_pid_started_at), job_id):
+                        message = (f'Controller pid {controller_pid} for '
+                                   f'job {job_id} is still running. '
+                                   'Skipping recovery.\n')
+                        logger.debug(message)
+                        f.write(message)
+                        continue
+                except Exception:  # pylint: disable=broad-except
+                    # controller_process_alive may raise if psutil fails; we
+                    # should not crash the recovery logic because of this.
+                    message = ('Error checking controller pid '
+                               f'{controller_pid} for job {job_id}\n')
+                    logger.warning(message, exc_info=True)
+                    f.write(message)
+
+            # Controller process is not set or not alive.
+            if job['schedule_state'] not in [
+                    managed_job_state.ManagedJobScheduleState.DONE,
+                    managed_job_state.ManagedJobScheduleState.WAITING,
+                    # INACTIVE job may be mid-submission, don't set to WAITING.
+                    managed_job_state.ManagedJobScheduleState.INACTIVE,
+            ]:
+                managed_job_state.reset_job_for_recovery(job_id)
+                message = (f'Job {job_id} completed recovery at '
+                           f'{datetime.now()}\n')
+                logger.info(message)
+                f.write(message)
+        f.write(f'HA recovery completed at {datetime.now()}\n')
+        f.write(f'Total recovery time: {time.time() - start} seconds\n')
+
+
+async def get_job_status(
+        backend: 'backends.CloudVmRayBackend', cluster_name: str,
+        job_id: Optional[int]) -> Optional['job_lib.JobStatus']:
     """Check the status of the job running on a managed job cluster.
 
     It can be None, INIT, RUNNING, SUCCEEDED, FAILED, FAILED_DRIVER,
     FAILED_SETUP or CANCELLED.
     """
-    handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+    # TODO(luca) make this async
+    handle = await context_utils.to_thread(
+        global_user_state.get_handle_from_cluster_name, cluster_name)
     if handle is None:
         # This can happen if the cluster was preempted and background status
         # refresh already noticed and cleaned it up.
         logger.info(f'Cluster {cluster_name} not found.')
         return None
     assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
-    status = None
+    job_ids = None if job_id is None else [job_id]
+    for i in range(_JOB_STATUS_FETCH_MAX_RETRIES):
+        try:
+            logger.info('=== Checking the job status... ===')
+            statuses = await asyncio.wait_for(
+                context_utils.to_thread(backend.get_job_status,
+                                        handle,
+                                        job_ids=job_ids,
+                                        stream_logs=False),
+                timeout=_JOB_STATUS_FETCH_TIMEOUT_SECONDS)
+            status = list(statuses.values())[0]
+            if status is None:
+                logger.info('No job found.')
+            else:
+                logger.info(f'Job status: {status}')
+            logger.info('=' * 34)
+            return status
+        except (exceptions.CommandError, grpc.RpcError, grpc.FutureTimeoutError,
+                ValueError, TypeError, asyncio.TimeoutError) as e:
+            # Note: Each of these exceptions has some additional conditions to
+            # limit how we handle it and whether or not we catch it.
+            # Retry on k8s transient network errors. This is useful when using
+            # CoreWeave, which may sometimes have transient network issues.
+            is_transient_error = False
+            detailed_reason = None
+            if isinstance(e, exceptions.CommandError):
+                detailed_reason = e.detailed_reason
+                if (detailed_reason is not None and
+                        _JOB_K8S_TRANSIENT_NW_MSG in detailed_reason):
+                    is_transient_error = True
+            elif isinstance(e, grpc.RpcError):
+                detailed_reason = e.details()
+                if e.code() in [
+                        grpc.StatusCode.UNAVAILABLE,
+                        grpc.StatusCode.DEADLINE_EXCEEDED
+                ]:
+                    is_transient_error = True
+            elif isinstance(e, grpc.FutureTimeoutError):
+                detailed_reason = 'Timeout'
+            elif isinstance(e, asyncio.TimeoutError):
+                detailed_reason = ('Job status check timed out after '
+                                   f'{_JOB_STATUS_FETCH_TIMEOUT_SECONDS}s')
+            # TODO(cooperc): Gracefully handle these exceptions in the backend.
+            elif isinstance(e, ValueError):
+                # If the cluster yaml is deleted in the middle of getting the
+                # SSH credentials, we could see this. See
+                # sky/global_user_state.py get_cluster_yaml_dict.
+                if re.search(r'Cluster yaml .* not found', str(e)):
+                    detailed_reason = 'Cluster yaml was deleted'
+                else:
+                    raise
+            elif isinstance(e, TypeError):
+                # We will grab the SSH credentials from the cluster yaml, but if
+                # handle.cluster_yaml is None, we will just return an empty dict
+                # for the credentials. See
+                # backend_utils.ssh_credential_from_yaml. Then, the credentials
+                # are passed as kwargs to SSHCommandRunner.__init__ - see
+                # cloud_vm_ray_backend.get_command_runners. So we can hit this
+                # TypeError if the cluster yaml is removed from the handle right
+                # when we pull it, before the cluster is fully deleted.
+                error_msg_to_check = (
+                    'SSHCommandRunner.__init__() missing 2 required positional '
+                    'arguments: \'ssh_user\' and \'ssh_private_key\'')
+                if str(e) == error_msg_to_check:
+                    detailed_reason = 'SSH credentials were already cleaned up'
+                else:
+                    raise
+            if is_transient_error:
+                logger.info('Failed to connect to the cluster. Retrying '
+                            f'({i + 1}/{_JOB_STATUS_FETCH_MAX_RETRIES})...')
+                logger.info('=' * 34)
+                await asyncio.sleep(1)
+            else:
+                logger.info(f'Failed to get job status: {detailed_reason}')
+                logger.info('=' * 34)
+                return None
+    return None
+
+
+def controller_process_alive(record: managed_job_state.ControllerPidRecord,
+                             legacy_job_id: Optional[int] = None,
+                             quiet: bool = True) -> bool:
+    """Check if the controller process is alive.
+
+    If legacy_job_id is provided, this will also return True for a legacy
+    single-job controller process with that job id, based on the cmdline. This
+    is how the old check worked before #7051.
+    """
     try:
-        logger.info('=== Checking the job status... ===')
-        statuses = backend.get_job_status(handle, stream_logs=False)
-        status = list(statuses.values())[0]
-        if status is None:
-            logger.info('No job found.')
+        process = psutil.Process(record.pid)
+
+        if record.started_at is not None:
+            if process.create_time() != record.started_at:
+                if not quiet:
+                    logger.debug(f'Controller process {record.pid} has started '
+                                 f'at {record.started_at} but process has '
+                                 f'started at {process.create_time()}')
+                return False
         else:
-            logger.info(f'Job status: {status}')
-    except exceptions.CommandError:
-        logger.info('Failed to connect to the cluster.')
-    logger.info('=' * 34)
-    return status
+            # If we can't check the create_time, check the cmdline instead.
+            cmd_str = ' '.join(process.cmdline())
+            # pylint: disable=line-too-long
+            # Pre-#7051 cmdline: /path/to/python -u -m sky.jobs.controller <dag.yaml_path> --job-id <job_id>
+            # Post-#7051 cmdline: /path/to/python -u -msky.jobs.controller
+            # pylint: enable=line-too-long
+            if ('-m sky.jobs.controller' not in cmd_str and
+                    '-msky.jobs.controller' not in cmd_str):
+                if not quiet:
+                    logger.debug(f'Process {record.pid} is not a controller '
+                                 'process - missing "-m sky.jobs.controller" '
+                                 f'from cmdline: {cmd_str}')
+                return False
+            if (legacy_job_id is not None and '--job-id' in cmd_str and
+                    f'--job-id {legacy_job_id}' not in cmd_str):
+                if not quiet:
+                    logger.debug(f'Controller process {record.pid} has the '
+                                 f'wrong --job-id (expected {legacy_job_id}) '
+                                 f'in cmdline: {cmd_str}')
+                return False
+
+        # On Linux, psutil.Process(pid) will return a valid process object
+        # even if the pid is actually a thread ID within the process. This
+        # hugely inflates the number of valid-looking pids, increasing the
+        # chance that we will falsely believe a controller is alive. The pid
+        # file should never contain thread IDs, just process IDs. We can
+        # check this with psutil.pid_exists(pid), which is false for TIDs.
+        # See pid_exists in psutil/_pslinux.py.
+        if not psutil.pid_exists(record.pid):
+            if not quiet:
+                logger.debug(
+                    f'Controller process {record.pid} is not a valid '
+                    'process id.')
+            return False
 
+        return process.is_running()
 
-def _controller_process_alive(pid: int, job_id: int) -> bool:
-    """Check if the controller process is alive."""
-    try:
-        process = psutil.Process(pid)
-        # The last two args of the command line should be --job-id <id>
-        job_args = process.cmdline()[-2:]
-        return process.is_running() and job_args == ['--job-id', str(job_id)]
-    except psutil.NoSuchProcess:
+    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess,
+            OSError) as e:
+        if not quiet:
+            logger.debug(f'Controller process {record.pid} is not running: {e}')
         return False
 
 
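The new controller_process_alive above combines three liveness signals: the process must exist, its create time must match the recorded start time (guarding against PID reuse), and psutil.pid_exists must accept the pid so that Linux thread IDs are not mistaken for controller processes. Below is a minimal standalone sketch of that idea, assuming only that psutil is installed; pid_alive and expected_start_time are illustrative names, not SkyPilot APIs.

import subprocess

import psutil


def pid_alive(pid: int, expected_start_time: float) -> bool:
    """Return True only if `pid` is running and started when we recorded it.

    Comparing psutil's create_time() against the timestamp captured at launch
    protects against PID reuse: a different process that later receives the
    same PID will report a different create time.
    """
    try:
        # pid_exists() is False for thread IDs on Linux, filtering out TIDs
        # that psutil.Process() would otherwise happily wrap.
        if not psutil.pid_exists(pid):
            return False
        proc = psutil.Process(pid)
        return proc.create_time() == expected_start_time and proc.is_running()
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return False


if __name__ == '__main__':
    child = subprocess.Popen(['sleep', '2'])
    started_at = psutil.Process(child.pid).create_time()
    print(pid_alive(child.pid, started_at))  # True while the child runs
    child.wait()
    print(pid_alive(child.pid, started_at))  # False once it has exited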
@@ -173,6 +500,17 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
     Note: we expect that job_id, if provided, refers to a nonterminal job or a
     job that has not completed its cleanup (schedule state not DONE).
     """
+    # This signal file suggests that the controller is recovering from a
+    # failure. See sky/templates/kubernetes-ray.yml.j2 for more details.
+    # When restarting the controller processes, we don't want this event to
+    # set the job status to FAILED_CONTROLLER.
+    # TODO(tian): Change this to restart the controller process. For now we
+    # disable it during recovery to avoid the caveat of infinitely restarting
+    # a controller process that has fully occupied the controller VM.
+    if os.path.exists(
+            os.path.expanduser(
+                constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE)):
+        return
 
     def _cleanup_job_clusters(job_id: int) -> Optional[str]:
         """Clean up clusters for a job. Returns error message if any.
@@ -181,15 +519,22 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
         capture the error message, and log/return it.
         """
         error_msg = None
-        tasks = managed_job_state.get_managed_jobs(job_id)
+        tasks = managed_job_state.get_managed_job_tasks(job_id)
         for task in tasks:
-            task_name = task['job_name']
-            cluster_name = generate_managed_job_cluster_name(task_name, job_id)
+            pool = task.get('pool', None)
+            if pool is None:
+                task_name = task['job_name']
+                cluster_name = generate_managed_job_cluster_name(
+                    task_name, job_id)
+            else:
+                cluster_name, _ = (
+                    managed_job_state.get_pool_submit_info(job_id))
             handle = global_user_state.get_handle_from_cluster_name(
                 cluster_name)
             if handle is not None:
                 try:
-                    terminate_cluster(cluster_name)
+                    if pool is None:
+                        terminate_cluster(cluster_name)
                 except Exception as e:  # pylint: disable=broad-except
                     error_msg = (
                         f'Failed to terminate cluster {cluster_name}: '
@@ -197,43 +542,6 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
                     logger.exception(error_msg, exc_info=e)
         return error_msg
 
-    # For backwards compatible jobs
-    # TODO(cooperc): Remove before 0.11.0.
-    def _handle_legacy_job(job_id: int):
-        controller_status = job_lib.get_status(job_id)
-        if controller_status is None or controller_status.is_terminal():
-            logger.error(f'Controller process for legacy job {job_id} is '
-                         'in an unexpected state.')
-
-            cleanup_error = _cleanup_job_clusters(job_id)
-            if cleanup_error:
-                # Unconditionally set the job to failed_controller if the
-                # cleanup fails.
-                managed_job_state.set_failed(
-                    job_id,
-                    task_id=None,
-                    failure_type=managed_job_state.ManagedJobStatus.
-                    FAILED_CONTROLLER,
-                    failure_reason=
-                    'Legacy controller process has exited abnormally, and '
-                    f'cleanup failed: {cleanup_error}. For more details, run: '
-                    f'sky jobs logs --controller {job_id}',
-                    override_terminal=True)
-                return
-
-            # It's possible for the job to have transitioned to
-            # another terminal state while between when we checked its
-            # state and now. In that case, set_failed won't do
-            # anything, which is fine.
-            managed_job_state.set_failed(
-                job_id,
-                task_id=None,
-                failure_type=managed_job_state.ManagedJobStatus.
-                FAILED_CONTROLLER,
-                failure_reason=(
-                    'Legacy controller process has exited abnormally. For '
-                    f'more details, run: sky jobs logs --controller {job_id}'))
-
 
     # Get jobs that need checking (non-terminal or not DONE)
     job_ids = managed_job_state.get_jobs_to_check_status(job_id)
@@ -242,29 +550,23 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
         return
 
     for job_id in job_ids:
-        tasks = managed_job_state.get_managed_jobs(job_id)
+        assert job_id is not None
+        tasks = managed_job_state.get_managed_job_tasks(job_id)
         # Note: controller_pid and schedule_state are in the job_info table
         # which is joined to the spot table, so all tasks with the same job_id
         # will have the same value for these columns. This is what lets us just
         # take tasks[0]['controller_pid'] and tasks[0]['schedule_state'].
         schedule_state = tasks[0]['schedule_state']
 
-        # Backwards compatibility: this job was submitted when ray was still
-        # used for managing the parallelism of job controllers, before #4485.
-        # TODO(cooperc): Remove before 0.11.0.
-        if (schedule_state is
-                managed_job_state.ManagedJobScheduleState.INVALID):
-            _handle_legacy_job(job_id)
-            continue
-
         # Handle jobs with schedule state (non-legacy jobs):
         pid = tasks[0]['controller_pid']
+        pid_started_at = tasks[0].get('controller_pid_started_at')
         if schedule_state == managed_job_state.ManagedJobScheduleState.DONE:
             # There are two cases where we could get a job that is DONE.
             # 1. At query time (get_jobs_to_check_status), the job was not yet
-            # DONE, but since then (before get_managed_jobs is called) it has
-            # hit a terminal status, marked itself done, and exited. This is
-            # fine.
+            # DONE, but since then (before get_managed_job_tasks is called)
+            # it has hit a terminal status, marked itself done, and exited.
+            # This is fine.
             # 2. The job is DONE, but in a non-terminal status. This is
             # unexpected. For instance, the task status is RUNNING, but the
             # job schedule_state is DONE.
@@ -311,7 +613,9 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
             failure_reason = f'No controller pid set for {schedule_state.value}'
         else:
             logger.debug(f'Checking controller pid {pid}')
-            if _controller_process_alive(pid, job_id):
+            if controller_process_alive(
+                    managed_job_state.ControllerPidRecord(
+                        pid=pid, started_at=pid_started_at), job_id):
                 # The controller is still running, so this job is fine.
                 continue
 
@@ -369,11 +673,34 @@ def update_managed_jobs_statuses(job_id: Optional[int] = None):
 
 
 def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
-                      get_end_time: bool) -> float:
+                      job_id: Optional[int], get_end_time: bool) -> float:
     """Get the submitted/ended time of the job."""
-    code = job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
-        job_id=None, get_ended_time=get_end_time)
     handle = global_user_state.get_handle_from_cluster_name(cluster_name)
+    assert handle is not None, (
+        f'handle for cluster {cluster_name!r} should not be None')
+    if handle.is_grpc_enabled_with_flag:
+        try:
+            if get_end_time:
+                end_ts_request = jobsv1_pb2.GetJobEndedTimestampRequest(
+                    job_id=job_id)
+                end_ts_response = backend_utils.invoke_skylet_with_retries(
+                    lambda: cloud_vm_ray_backend.SkyletClient(
+                        handle.get_grpc_channel()).get_job_ended_timestamp(
+                            end_ts_request))
+                return end_ts_response.timestamp
+            else:
+                submit_ts_request = jobsv1_pb2.GetJobSubmittedTimestampRequest(
+                    job_id=job_id)
+                submit_ts_response = backend_utils.invoke_skylet_with_retries(
+                    lambda: cloud_vm_ray_backend.SkyletClient(
+                        handle.get_grpc_channel()).get_job_submitted_timestamp(
+                            submit_ts_request))
+                return submit_ts_response.timestamp
+        except exceptions.SkyletMethodNotImplementedError:
+            pass
+
+    code = (job_lib.JobLibCodeGen.get_job_submitted_or_ended_timestamp_payload(
+        job_id=job_id, get_ended_time=get_end_time))
     returncode, stdout, stderr = backend.run_on_head(handle,
                                                      code,
                                                      stream_logs=False,
@@ -386,16 +713,24 @@ def get_job_timestamp(backend: 'backends.CloudVmRayBackend', cluster_name: str,
386
713
 
387
714
 
388
715
  def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
389
- cluster_name: str) -> float:
716
+ cluster_name: str, job_id: Optional[int]) -> float:
390
717
  """Try to get the end time of the job.
391
718
 
392
719
  If the job is preempted or we can't connect to the instance for whatever
393
720
  reason, fall back to the current time.
394
721
  """
395
722
  try:
396
- return get_job_timestamp(backend, cluster_name, get_end_time=True)
397
- except exceptions.CommandError as e:
398
- if e.returncode == 255:
723
+ return get_job_timestamp(backend,
724
+ cluster_name,
725
+ job_id=job_id,
726
+ get_end_time=True)
727
+ except (exceptions.CommandError, grpc.RpcError,
728
+ grpc.FutureTimeoutError) as e:
729
+ if isinstance(e, exceptions.CommandError) and e.returncode == 255 or \
730
+ (isinstance(e, grpc.RpcError) and e.code() in [
731
+ grpc.StatusCode.UNAVAILABLE,
732
+ grpc.StatusCode.DEADLINE_EXCEEDED,
733
+ ]) or isinstance(e, grpc.FutureTimeoutError):
399
734
  # Failed to connect - probably the instance was preempted since the
400
735
  # job completed. We shouldn't crash here, so just log and use the
401
736
  # current time.
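
The widened `except` clause above treats an SSH exit code of 255, gRPC `UNAVAILABLE`/`DEADLINE_EXCEEDED`, and a gRPC future timeout as the same condition: the instance is unreachable. A hedged sketch of pulling that classification into a single predicate (the helper name is illustrative):

    import grpc


    def is_transient_connection_error(exc: BaseException) -> bool:
        """Heuristic: does this error mean the VM could not be reached?"""
        # SSH exits with 255 when the connection itself fails (for example the
        # instance was preempted), as opposed to the remote command failing.
        if getattr(exc, 'returncode', None) == 255:
            return True
        if isinstance(exc, grpc.RpcError):
            return exc.code() in (grpc.StatusCode.UNAVAILABLE,
                                  grpc.StatusCode.DEADLINE_EXCEEDED)
        return isinstance(exc, grpc.FutureTimeoutError)
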
@@ -407,7 +742,9 @@ def try_to_get_job_end_time(backend: 'backends.CloudVmRayBackend',
407
742
  raise
408
743
 
409
744
 
410
- def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
745
+ def event_callback_func(
746
+ job_id: int, task_id: Optional[int],
747
+ task: Optional['sky.Task']) -> managed_job_state.AsyncCallbackType:
411
748
  """Run event callback for the task."""
412
749
 
413
750
  def callback_func(status: str):
@@ -415,8 +752,12 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
415
752
  if event_callback is None or task is None:
416
753
  return
417
754
  event_callback = event_callback.strip()
418
- cluster_name = generate_managed_job_cluster_name(
419
- task.name, job_id) if task.name else None
755
+ pool = managed_job_state.get_pool_from_job_id(job_id)
756
+ if pool is not None:
757
+ cluster_name, _ = (managed_job_state.get_pool_submit_info(job_id))
758
+ else:
759
+ cluster_name = generate_managed_job_cluster_name(
760
+ task.name, job_id) if task.name else None
420
761
  logger.info(f'=== START: event callback for {status!r} ===')
421
762
  log_path = os.path.join(constants.SKY_LOGS_DIRECTORY,
422
763
  'managed_job_event',
@@ -442,7 +783,10 @@ def event_callback_func(job_id: int, task_id: int, task: 'sky.Task'):
442
783
  f'Bash:{event_callback},log_path:{log_path},result:{result}')
443
784
  logger.info(f'=== END: event callback for {status!r} ===')
444
785
 
445
- return callback_func
786
+ async def async_callback_func(status: str):
787
+ return await context_utils.to_thread(callback_func, status)
788
+
789
+ return async_callback_func
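
`callback_func` stays synchronous (it shells out to the user's event callback), so the new wrapper pushes it onto a worker thread before returning an awaitable. A self-contained sketch of the same pattern using the standard library's `asyncio.to_thread`, which `context_utils.to_thread` is assumed to resemble; the environment variable name below is illustrative:

    import asyncio
    import subprocess


    def run_hook(command: str, status: str) -> int:
        """Blocking helper: run a user-provided shell hook for a status change."""
        completed = subprocess.run(command,
                                   shell=True,
                                   env={'JOB_STATUS': status},  # illustrative
                                   check=False)
        return completed.returncode


    async def run_hook_async(command: str, status: str) -> int:
        # Off-load the blocking subprocess call so the controller's event loop
        # stays responsive while the hook runs.
        return await asyncio.to_thread(run_hook, command, status)


    if __name__ == '__main__':
        print(asyncio.run(run_hook_async('echo "status=$JOB_STATUS"', 'RUNNING')))
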
446
790
 
447
791
 
448
792
  # ======== user functions ========
@@ -461,20 +805,24 @@ def generate_managed_job_cluster_name(task_name: str, job_id: int) -> str:
461
805
 
462
806
 
463
807
  def cancel_jobs_by_id(job_ids: Optional[List[int]],
464
- all_users: bool = False) -> str:
808
+ all_users: bool = False,
809
+ current_workspace: Optional[str] = None,
810
+ user_hash: Optional[str] = None) -> str:
465
811
  """Cancel jobs by id.
466
812
 
467
813
  If job_ids is None, cancel all jobs.
468
814
  """
469
815
  if job_ids is None:
470
816
  job_ids = managed_job_state.get_nonterminal_job_ids_by_name(
471
- None, all_users)
817
+ None, user_hash, all_users)
472
818
  job_ids = list(set(job_ids))
473
819
  if not job_ids:
474
820
  return 'No job to cancel.'
475
- job_id_str = ', '.join(map(str, job_ids))
476
- logger.info(f'Cancelling jobs {job_id_str}.')
821
+ if current_workspace is None:
822
+ current_workspace = constants.SKYPILOT_DEFAULT_WORKSPACE
823
+
477
824
  cancelled_job_ids: List[int] = []
825
+ wrong_workspace_job_ids: List[int] = []
478
826
  for job_id in job_ids:
479
827
  # Check the status of the managed job status. If it is in
480
828
  # terminal state, we can safely skip it.
@@ -486,30 +834,70 @@ def cancel_jobs_by_id(job_ids: Optional[List[int]],
486
834
  logger.info(f'Job {job_id} is already in terminal state '
487
835
  f'{job_status.value}. Skipped.')
488
836
  continue
837
+ elif job_status == managed_job_state.ManagedJobStatus.PENDING:
838
+ # the "if PENDING" is a short circuit, this will be atomic.
839
+ cancelled = managed_job_state.set_pending_cancelled(job_id)
840
+ if cancelled:
841
+ cancelled_job_ids.append(job_id)
842
+ continue
489
843
 
490
844
  update_managed_jobs_statuses(job_id)
491
845
 
492
- # Send the signal to the jobs controller.
493
- signal_file = pathlib.Path(SIGNAL_FILE_PREFIX.format(job_id))
494
- # Filelock is needed to prevent race condition between signal
495
- # check/removal and signal writing.
496
- with filelock.FileLock(str(signal_file) + '.lock'):
497
- with signal_file.open('w', encoding='utf-8') as f:
498
- f.write(UserSignal.CANCEL.value)
499
- f.flush()
846
+ job_workspace = managed_job_state.get_workspace(job_id)
847
+ if current_workspace is not None and job_workspace != current_workspace:
848
+ wrong_workspace_job_ids.append(job_id)
849
+ continue
850
+
851
+ if managed_job_state.is_legacy_controller_process(job_id):
852
+ # The job is running on a legacy single-job controller process.
853
+ # TODO(cooperc): Remove this handling for 0.13.0
854
+
855
+ # Send the signal to the jobs controller.
856
+ signal_file = (pathlib.Path(
857
+ managed_job_constants.SIGNAL_FILE_PREFIX.format(job_id)))
858
+ # Filelock is needed to prevent race condition between signal
859
+ # check/removal and signal writing.
860
+ with filelock.FileLock(str(signal_file) + '.lock'):
861
+ with signal_file.open('w', encoding='utf-8') as f:
862
+ f.write(UserSignal.CANCEL.value)
863
+ f.flush()
864
+ else:
865
+ # New controller process.
866
+ try:
867
+ signal_file = pathlib.Path(
868
+ managed_job_constants.CONSOLIDATED_SIGNAL_PATH, f'{job_id}')
869
+ signal_file.touch()
870
+ except OSError as e:
871
+ logger.error(f'Failed to cancel job {job_id}: {e}')
872
+ # Don't add it to the to be cancelled job ids
873
+ continue
874
+
500
875
  cancelled_job_ids.append(job_id)
501
876
 
877
+ wrong_workspace_job_str = ''
878
+ if wrong_workspace_job_ids:
879
+ plural = 's' if len(wrong_workspace_job_ids) > 1 else ''
880
+ plural_verb = 'are' if len(wrong_workspace_job_ids) > 1 else 'is'
881
+ wrong_workspace_job_str = (
882
+ f' Job{plural} with ID{plural}'
883
+ f' {", ".join(map(str, wrong_workspace_job_ids))} '
884
+ f'{plural_verb} skipped as they are not in the active workspace '
885
+ f'{current_workspace!r}. Check the workspace of the job with: '
886
+ f'sky jobs queue')
887
+
502
888
  if not cancelled_job_ids:
503
- return 'No job to cancel.'
889
+ return f'No job to cancel.{wrong_workspace_job_str}'
504
890
  identity_str = f'Job with ID {cancelled_job_ids[0]} is'
505
891
  if len(cancelled_job_ids) > 1:
506
892
  cancelled_job_ids_str = ', '.join(map(str, cancelled_job_ids))
507
893
  identity_str = f'Jobs with IDs {cancelled_job_ids_str} are'
508
894
 
509
- return f'{identity_str} scheduled to be cancelled.'
895
+ msg = f'{identity_str} scheduled to be cancelled.{wrong_workspace_job_str}'
896
+ return msg
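
So a legacy single-job controller is still cancelled through the filelock-guarded signal file, while the consolidated controller only needs a per-job file to be touched under `CONSOLIDATED_SIGNAL_PATH`. A rough sketch of that touch-and-poll handshake, with an illustrative signal directory (not SkyPilot's actual path):

    import pathlib

    SIGNAL_DIR = pathlib.Path('~/.jobs/signals').expanduser()  # illustrative


    def request_cancel(job_id: int) -> bool:
        """Client side: ask the controller to cancel a job."""
        try:
            SIGNAL_DIR.mkdir(parents=True, exist_ok=True)
            (SIGNAL_DIR / str(job_id)).touch()
            return True
        except OSError:
            return False


    def cancel_requested(job_id: int) -> bool:
        """Controller side: check for, and consume, the cancel signal."""
        signal_file = SIGNAL_DIR / str(job_id)
        if signal_file.exists():
            signal_file.unlink(missing_ok=True)
            return True
        return False
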
510
897
 
511
898
 
512
- def cancel_job_by_name(job_name: str) -> str:
899
+ def cancel_job_by_name(job_name: str,
900
+ current_workspace: Optional[str] = None) -> str:
513
901
  """Cancel a job by name."""
514
902
  job_ids = managed_job_state.get_nonterminal_job_ids_by_name(job_name)
515
903
  if not job_ids:
@@ -518,11 +906,30 @@ def cancel_job_by_name(job_name: str) -> str:
518
906
  return (f'{colorama.Fore.RED}Multiple running jobs found '
519
907
  f'with name {job_name!r}.\n'
520
908
  f'Job IDs: {job_ids}{colorama.Style.RESET_ALL}')
521
- cancel_jobs_by_id(job_ids)
522
- return f'Job {job_name!r} is scheduled to be cancelled.'
909
+ msg = cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
910
+ return f'{job_name!r} {msg}'
523
911
 
524
912
 
525
- def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:
913
+ def cancel_jobs_by_pool(pool_name: str,
914
+ current_workspace: Optional[str] = None) -> str:
915
+ """Cancel all jobs in a pool."""
916
+ job_ids = managed_job_state.get_nonterminal_job_ids_by_pool(pool_name)
917
+ if not job_ids:
918
+ return f'No running job found in pool {pool_name!r}.'
919
+ return cancel_jobs_by_id(job_ids, current_workspace=current_workspace)
920
+
921
+
922
+ def controller_log_file_for_job(job_id: int,
923
+ create_if_not_exists: bool = False) -> str:
924
+ log_dir = os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR)
925
+ if create_if_not_exists:
926
+ os.makedirs(log_dir, exist_ok=True)
927
+ return os.path.join(log_dir, f'{job_id}.log')
928
+
929
+
930
+ def stream_logs_by_id(job_id: int,
931
+ follow: bool = True,
932
+ tail: Optional[int] = None) -> Tuple[str, int]:
526
933
  """Stream logs by job id.
527
934
 
528
935
  Returns:
@@ -552,18 +959,60 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:
552
959
  if managed_job_status.is_failed():
553
960
  job_msg = ('\nFailure reason: '
554
961
  f'{managed_job_state.get_failure_reason(job_id)}')
555
- log_file = managed_job_state.get_local_log_file(job_id, None)
556
- if log_file is not None:
557
- with open(os.path.expanduser(log_file), 'r',
558
- encoding='utf-8') as f:
559
- # Stream the logs to the console without reading the whole
560
- # file into memory.
561
- start_streaming = False
562
- for line in f:
563
- if log_lib.LOG_FILE_START_STREAMING_AT in line:
962
+ log_file_ever_existed = False
963
+ task_info = managed_job_state.get_all_task_ids_names_statuses_logs(
964
+ job_id)
965
+ num_tasks = len(task_info)
966
+ for (task_id, task_name, task_status, log_file,
967
+ logs_cleaned_at) in task_info:
968
+ if log_file:
969
+ log_file_ever_existed = True
970
+ if logs_cleaned_at is not None:
971
+ ts_str = datetime.fromtimestamp(
972
+ logs_cleaned_at).strftime('%Y-%m-%d %H:%M:%S')
973
+ print(f'Task {task_name}({task_id}) log has been '
974
+ f'cleaned at {ts_str}.')
975
+ continue
976
+ task_str = (f'Task {task_name}({task_id})'
977
+ if task_name else f'Task {task_id}')
978
+ if num_tasks > 1:
979
+ print(f'=== {task_str} ===')
980
+ with open(os.path.expanduser(log_file),
981
+ 'r',
982
+ encoding='utf-8') as f:
983
+ # Stream the logs to the console without reading the
984
+ # whole file into memory.
985
+ start_streaming = False
986
+ read_from: Union[TextIO, Deque[str]] = f
987
+ if tail is not None:
988
+ assert tail > 0
989
+ # Read only the last 'tail' lines using deque
990
+ read_from = collections.deque(f, maxlen=tail)
991
+ # We set start_streaming to True here in case
992
+ # truncating the log file removes the line that
993
+ # contains LOG_FILE_START_STREAMING_AT. This does
994
+ # not cause issues for log files shorter than tail
995
+ # because tail_logs in sky/skylet/log_lib.py also
996
+ # handles LOG_FILE_START_STREAMING_AT.
564
997
  start_streaming = True
565
- if start_streaming:
566
- print(line, end='', flush=True)
998
+ for line in read_from:
999
+ if log_lib.LOG_FILE_START_STREAMING_AT in line:
1000
+ start_streaming = True
1001
+ if start_streaming:
1002
+ print(line, end='', flush=True)
1003
+ if num_tasks > 1:
1004
+ # Add the "Task finished" message for terminal states
1005
+ if task_status.is_terminal():
1006
+ print(ux_utils.finishing_message(
1007
+ f'{task_str} finished '
1008
+ f'(status: {task_status.value}).'),
1009
+ flush=True)
1010
+ if log_file_ever_existed:
1011
+ # Add the "Job finished" message for terminal states
1012
+ if managed_job_status.is_terminal():
1013
+ print(ux_utils.finishing_message(
1014
+ f'Job finished (status: {managed_job_status.value}).'),
1015
+ flush=True)
567
1016
  return '', exceptions.JobExitCode.from_managed_job_status(
568
1017
  managed_job_status)
569
1018
  return (f'{colorama.Fore.YELLOW}'
@@ -585,12 +1034,19 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:
585
1034
 
586
1035
  while should_keep_logging(managed_job_status):
587
1036
  handle = None
1037
+ job_id_to_tail = None
588
1038
  if task_id is not None:
589
- task_name = managed_job_state.get_task_name(job_id, task_id)
590
- cluster_name = generate_managed_job_cluster_name(
591
- task_name, job_id)
592
- handle = global_user_state.get_handle_from_cluster_name(
593
- cluster_name)
1039
+ pool = managed_job_state.get_pool_from_job_id(job_id)
1040
+ if pool is not None:
1041
+ cluster_name, job_id_to_tail = (
1042
+ managed_job_state.get_pool_submit_info(job_id))
1043
+ else:
1044
+ task_name = managed_job_state.get_task_name(job_id, task_id)
1045
+ cluster_name = generate_managed_job_cluster_name(
1046
+ task_name, job_id)
1047
+ if cluster_name is not None:
1048
+ handle = global_user_state.get_handle_from_cluster_name(
1049
+ cluster_name)
594
1050
 
595
1051
  # Check the handle: The cluster can be preempted and removed from
596
1052
  # the table before the managed job state is updated by the
@@ -620,10 +1076,12 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:
620
1076
  managed_job_state.ManagedJobStatus.RUNNING)
621
1077
  assert isinstance(handle, backends.CloudVmRayResourceHandle), handle
622
1078
  status_display.stop()
1079
+ tail_param = tail if tail is not None else 0
623
1080
  returncode = backend.tail_logs(handle,
624
- job_id=None,
1081
+ job_id=job_id_to_tail,
625
1082
  managed_job_id=job_id,
626
- follow=follow)
1083
+ follow=follow,
1084
+ tail=tail_param)
627
1085
  if returncode in [rc.value for rc in exceptions.JobExitCode]:
628
1086
  # If the log tailing exits with a known exit code we can safely
629
1087
  # break the loop because it indicates the tailing process
@@ -760,7 +1218,8 @@ def stream_logs_by_id(job_id: int, follow: bool = True) -> Tuple[str, int]:
760
1218
  def stream_logs(job_id: Optional[int],
761
1219
  job_name: Optional[str],
762
1220
  controller: bool = False,
763
- follow: bool = True) -> Tuple[str, int]:
1221
+ follow: bool = True,
1222
+ tail: Optional[int] = None) -> Tuple[str, int]:
764
1223
  """Stream logs by job id or job name.
765
1224
 
766
1225
  Returns:
@@ -776,7 +1235,8 @@ def stream_logs(job_id: Optional[int],
776
1235
  if controller:
777
1236
  if job_id is None:
778
1237
  assert job_name is not None
779
- managed_jobs = managed_job_state.get_managed_jobs()
1238
+ managed_jobs, _ = managed_job_state.get_managed_jobs_with_filters(
1239
+ name_match=job_name, fields=['job_id', 'job_name', 'status'])
780
1240
  # We manually filter the jobs by name, instead of using
781
1241
  # get_nonterminal_job_ids_by_name, as with `controller=True`, we
782
1242
  # should be able to show the logs for jobs in terminal states.
@@ -799,9 +1259,7 @@ def stream_logs(job_id: Optional[int],
799
1259
  job_id = managed_job_ids.pop()
800
1260
  assert job_id is not None, (job_id, job_name)
801
1261
 
802
- controller_log_path = os.path.join(
803
- os.path.expanduser(managed_job_constants.JOBS_CONTROLLER_LOGS_DIR),
804
- f'{job_id}.log')
1262
+ controller_log_path = controller_log_file_for_job(job_id)
805
1263
  job_status = None
806
1264
 
807
1265
  # Wait for the log file to be written
@@ -831,7 +1289,12 @@ def stream_logs(job_id: Optional[int],
831
1289
  with open(controller_log_path, 'r', newline='', encoding='utf-8') as f:
832
1290
  # Note: we do not need to care about start_stream_at here, since
833
1291
  # that should be in the job log printed above.
834
- for line in f:
1292
+ read_from: Union[TextIO, Deque[str]] = f
1293
+ if tail is not None:
1294
+ assert tail > 0
1295
+ # Read only the last 'tail' lines efficiently using deque
1296
+ read_from = collections.deque(f, maxlen=tail)
1297
+ for line in read_from:
835
1298
  print(line, end='')
836
1299
  # Flush.
837
1300
  print(end='', flush=True)
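
Both the per-task log streaming above and this controller-log path use the same trick for tailing: wrap the open file in `collections.deque(f, maxlen=tail)`, which iterates the file once but only keeps the newest `tail` lines in memory. A standalone sketch:

    import collections
    from typing import List


    def tail_lines(path: str, tail: int = 0) -> List[str]:
        """Return the last `tail` lines of a text file (all lines if tail <= 0)."""
        with open(path, 'r', encoding='utf-8') as f:
            if tail > 0:
                # The deque drops older lines as newer ones arrive, so memory
                # use is bounded by `tail` regardless of the file size.
                return list(collections.deque(f, maxlen=tail))
            return list(f)


    # Example: print the last 100 lines of a controller log.
    # for line in tail_lines('/tmp/controller.log', tail=100):
    #     print(line, end='')
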
@@ -883,61 +1346,384 @@ def stream_logs(job_id: Optional[int],
883
1346
  f'Multiple running jobs found with name {job_name!r}.')
884
1347
  job_id = job_ids[0]
885
1348
 
886
- return stream_logs_by_id(job_id, follow)
1349
+ return stream_logs_by_id(job_id, follow, tail)
1350
+
1351
+
1352
+ def dump_managed_job_queue(
1353
+ skip_finished: bool = False,
1354
+ accessible_workspaces: Optional[List[str]] = None,
1355
+ job_ids: Optional[List[int]] = None,
1356
+ workspace_match: Optional[str] = None,
1357
+ name_match: Optional[str] = None,
1358
+ pool_match: Optional[str] = None,
1359
+ page: Optional[int] = None,
1360
+ limit: Optional[int] = None,
1361
+ user_hashes: Optional[List[Optional[str]]] = None,
1362
+ statuses: Optional[List[str]] = None,
1363
+ fields: Optional[List[str]] = None,
1364
+ ) -> str:
1365
+ return message_utils.encode_payload(
1366
+ get_managed_job_queue(skip_finished, accessible_workspaces, job_ids,
1367
+ workspace_match, name_match, pool_match, page,
1368
+ limit, user_hashes, statuses, fields))
887
1369
 
888
1370
 
889
- def dump_managed_job_queue() -> str:
890
- jobs = managed_job_state.get_managed_jobs()
1371
+ def _update_fields(fields: List[str],) -> Tuple[List[str], bool]:
1372
+ """Update the fields list to include the necessary fields.
1373
+
1374
+ Args:
1375
+ fields: The fields to update.
1376
+
1377
+ It will:
1378
+ - Add the necessary dependent fields to the list.
1379
+ - Remove the fields that are not in the DB.
1380
+ - Determine if cluster handle is required.
1381
+
1382
+ Returns:
1383
+ A tuple containing the updated fields and a boolean indicating if
1384
+ cluster handle is required.
1385
+ """
1386
+ cluster_handle_required = True
1387
+ if _cluster_handle_not_required(fields):
1388
+ cluster_handle_required = False
1389
+ # Copy the list to avoid modifying the original list
1390
+ new_fields = fields.copy()
1391
+ # status and job_id are always included
1392
+ if 'status' not in new_fields:
1393
+ new_fields.append('status')
1394
+ if 'job_id' not in new_fields:
1395
+ new_fields.append('job_id')
1396
+ # user_hash is required if user_name is present
1397
+ if 'user_name' in new_fields and 'user_hash' not in new_fields:
1398
+ new_fields.append('user_hash')
1399
+ if 'job_duration' in new_fields:
1400
+ if 'last_recovered_at' not in new_fields:
1401
+ new_fields.append('last_recovered_at')
1402
+ if 'end_at' not in new_fields:
1403
+ new_fields.append('end_at')
1404
+ if 'job_name' in new_fields and 'task_name' not in new_fields:
1405
+ new_fields.append('task_name')
1406
+ if 'details' in new_fields:
1407
+ if 'schedule_state' not in new_fields:
1408
+ new_fields.append('schedule_state')
1409
+ if 'priority' not in new_fields:
1410
+ new_fields.append('priority')
1411
+ if 'failure_reason' not in new_fields:
1412
+ new_fields.append('failure_reason')
1413
+ if 'user_yaml' in new_fields:
1414
+ if 'original_user_yaml_path' not in new_fields:
1415
+ new_fields.append('original_user_yaml_path')
1416
+ if 'original_user_yaml_content' not in new_fields:
1417
+ new_fields.append('original_user_yaml_content')
1418
+ if cluster_handle_required:
1419
+ if 'task_name' not in new_fields:
1420
+ new_fields.append('task_name')
1421
+ if 'current_cluster_name' not in new_fields:
1422
+ new_fields.append('current_cluster_name')
1423
+ # Remove _NON_DB_FIELDS
1424
+ # These fields have been mapped to the DB fields in the above code, so we
1425
+ # don't need to include them in the updated fields.
1426
+ for field in _NON_DB_FIELDS:
1427
+ if field in new_fields:
1428
+ new_fields.remove(field)
1429
+ return new_fields, cluster_handle_required
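
A simplified, self-contained illustration of the dependency expansion that `_update_fields` performs; the real `_NON_DB_FIELDS` and `_CLUSTER_HANDLE_FIELDS` lists live elsewhere in this module, so the lists below are assumptions for the sketch only:

    from typing import Dict, List

    # Assumed, simplified stand-ins for the module-level constants.
    _DEPENDENCIES: Dict[str, List[str]] = {
        'user_name': ['user_hash'],
        'job_duration': ['last_recovered_at', 'end_at'],
        'job_name': ['task_name'],
    }
    _ALWAYS_INCLUDED = ['status', 'job_id']
    _DERIVED_ONLY = ['user_name', 'job_duration', 'job_name']  # not DB columns


    def expand_fields(fields: List[str]) -> List[str]:
        expanded = list(fields)
        for required in _ALWAYS_INCLUDED:
            if required not in expanded:
                expanded.append(required)
        for field, deps in _DEPENDENCIES.items():
            if field in fields:
                expanded.extend(dep for dep in deps if dep not in expanded)
        # Derived fields are computed from DB columns, so they are dropped
        # before the query is built.
        return [f for f in expanded if f not in _DERIVED_ONLY]


    print(expand_fields(['user_name', 'job_duration']))
    # -> ['status', 'job_id', 'user_hash', 'last_recovered_at', 'end_at']
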
1430
+
1431
+
1432
+ def _cluster_handle_not_required(fields: List[str]) -> bool:
1433
+ """Determine if cluster handle is not required.
1434
+
1435
+ Args:
1436
+ fields: The fields to check if they contain any of the cluster handle
1437
+ fields.
1438
+
1439
+ Returns:
1440
+ True if the fields do not contain any of the cluster handle fields,
1441
+ False otherwise.
1442
+ """
1443
+ return not any(field in fields for field in _CLUSTER_HANDLE_FIELDS)
1444
+
1445
+
1446
+ def get_managed_job_queue(
1447
+ skip_finished: bool = False,
1448
+ accessible_workspaces: Optional[List[str]] = None,
1449
+ job_ids: Optional[List[int]] = None,
1450
+ workspace_match: Optional[str] = None,
1451
+ name_match: Optional[str] = None,
1452
+ pool_match: Optional[str] = None,
1453
+ page: Optional[int] = None,
1454
+ limit: Optional[int] = None,
1455
+ user_hashes: Optional[List[Optional[str]]] = None,
1456
+ statuses: Optional[List[str]] = None,
1457
+ fields: Optional[List[str]] = None,
1458
+ ) -> Dict[str, Any]:
1459
+ """Get the managed job queue.
1460
+
1461
+ Args:
1462
+ skip_finished: Whether to skip finished jobs.
1463
+ accessible_workspaces: The accessible workspaces.
1464
+ job_ids: The job ids.
1465
+ workspace_match: The workspace name to match.
1466
+ name_match: The job name to match.
1467
+ pool_match: The pool name to match.
1468
+ page: The page number.
1469
+ limit: The limit number.
1470
+ user_hashes: The user hashes.
1471
+ statuses: The statuses.
1472
+ fields: The fields to include in the response.
1473
+
1474
+ Returns:
1475
+ A dictionary containing the managed job queue.
1476
+ """
1477
+ cluster_handle_required = True
1478
+ updated_fields = None
1479
+ # The caller only needs to specify the fields in the
1481
+ # `class ManagedJobRecord` in `response.py`; the `_update_fields`
1482
+ # function will add the necessary dependent fields to the list. For
1483
+ # example, if the caller specifies `['user_name']`, `_update_fields`
1484
+ # will add `'user_hash'` to the list.
1484
+ if fields:
1485
+ updated_fields, cluster_handle_required = _update_fields(fields)
1486
+
1487
+ total_no_filter = managed_job_state.get_managed_jobs_total()
1488
+
1489
+ status_counts = managed_job_state.get_status_count_with_filters(
1490
+ fields=fields,
1491
+ job_ids=job_ids,
1492
+ accessible_workspaces=accessible_workspaces,
1493
+ workspace_match=workspace_match,
1494
+ name_match=name_match,
1495
+ pool_match=pool_match,
1496
+ user_hashes=user_hashes,
1497
+ skip_finished=skip_finished,
1498
+ )
1499
+
1500
+ jobs, total = managed_job_state.get_managed_jobs_with_filters(
1501
+ fields=updated_fields,
1502
+ job_ids=job_ids,
1503
+ accessible_workspaces=accessible_workspaces,
1504
+ workspace_match=workspace_match,
1505
+ name_match=name_match,
1506
+ pool_match=pool_match,
1507
+ user_hashes=user_hashes,
1508
+ statuses=statuses,
1509
+ skip_finished=skip_finished,
1510
+ page=page,
1511
+ limit=limit,
1512
+ )
1513
+
1514
+ if cluster_handle_required:
1515
+ # Fetch the cluster name to handle map for managed clusters only.
1516
+ cluster_name_to_handle = (
1517
+ global_user_state.get_cluster_name_to_handle_map(is_managed=True))
1518
+
1519
+ highest_blocking_priority = constants.MIN_PRIORITY
1520
+ if not fields or 'details' in fields:
1521
+ # Figure out what the highest priority blocking job is. We need to know
1522
+ # this in order to determine if other jobs are blocked by a higher-priority
1523
+ # job, or just by the limited controller resources.
1524
+ highest_blocking_priority = (
1525
+ managed_job_state.get_managed_jobs_highest_priority())
891
1526
 
892
1527
  for job in jobs:
893
- end_at = job['end_at']
894
- if end_at is None:
895
- end_at = time.time()
896
-
897
- job_submitted_at = job['last_recovered_at'] - job['job_duration']
898
- if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
899
- # When job is recovering, the duration is exact job['job_duration']
900
- job_duration = job['job_duration']
901
- elif job_submitted_at > 0:
902
- job_duration = end_at - job_submitted_at
903
- else:
904
- # When job_start_at <= 0, that means the last_recovered_at is not
905
- # set yet, i.e. the job is not started.
906
- job_duration = 0
907
- job['job_duration'] = job_duration
1528
+ if not fields or 'job_duration' in fields:
1529
+ end_at = job['end_at']
1530
+ if end_at is None:
1531
+ end_at = time.time()
1532
+
1533
+ job_submitted_at = job['last_recovered_at'] - job['job_duration']
1534
+ if job['status'] == managed_job_state.ManagedJobStatus.RECOVERING:
1535
+ # When job is recovering, the duration is exact
1536
+ # job['job_duration']
1537
+ job_duration = job['job_duration']
1538
+ elif job_submitted_at > 0:
1539
+ job_duration = end_at - job_submitted_at
1540
+ else:
1541
+ # When job_start_at <= 0, that means the last_recovered_at
1542
+ # is not set yet, i.e. the job is not started.
1543
+ job_duration = 0
1544
+ job['job_duration'] = job_duration
908
1545
  job['status'] = job['status'].value
909
- job['schedule_state'] = job['schedule_state'].value
910
-
911
- cluster_name = generate_managed_job_cluster_name(
912
- job['task_name'], job['job_id'])
913
- handle = global_user_state.get_handle_from_cluster_name(cluster_name)
914
- if handle is not None:
915
- assert isinstance(handle, backends.CloudVmRayResourceHandle)
916
- job['cluster_resources'] = (
917
- f'{handle.launched_nodes}x {handle.launched_resources}')
918
- job['region'] = handle.launched_resources.region
1546
+ if not fields or 'schedule_state' in fields:
1547
+ job['schedule_state'] = job['schedule_state'].value
919
1548
  else:
920
- # FIXME(zongheng): display the last cached values for these.
921
- job['cluster_resources'] = '-'
922
- job['region'] = '-'
1549
+ job['schedule_state'] = None
923
1550
 
924
- return message_utils.encode_payload(jobs)
1551
+ if cluster_handle_required:
1552
+ cluster_name = job.get('current_cluster_name', None)
1553
+ if cluster_name is None:
1554
+ cluster_name = generate_managed_job_cluster_name(
1555
+ job['task_name'], job['job_id'])
1556
+ handle = cluster_name_to_handle.get(
1557
+ cluster_name, None) if cluster_name is not None else None
1558
+ if isinstance(handle, backends.CloudVmRayResourceHandle):
1559
+ resources_str_simple, resources_str_full = (
1560
+ resources_utils.get_readable_resources_repr(
1561
+ handle, simplified_only=False))
1562
+ assert resources_str_full is not None
1563
+ job['cluster_resources'] = resources_str_simple
1564
+ job['cluster_resources_full'] = resources_str_full
1565
+ job['cloud'] = str(handle.launched_resources.cloud)
1566
+ job['region'] = handle.launched_resources.region
1567
+ job['zone'] = handle.launched_resources.zone
1568
+ job['infra'] = infra_utils.InfraInfo(
1569
+ str(handle.launched_resources.cloud),
1570
+ handle.launched_resources.region,
1571
+ handle.launched_resources.zone).formatted_str()
1572
+ job['accelerators'] = handle.launched_resources.accelerators
1573
+ else:
1574
+ # FIXME(zongheng): display the last cached values for these.
1575
+ job['cluster_resources'] = '-'
1576
+ job['cluster_resources_full'] = '-'
1577
+ job['cloud'] = '-'
1578
+ job['region'] = '-'
1579
+ job['zone'] = '-'
1580
+ job['infra'] = '-'
1581
+
1582
+ if not fields or 'details' in fields:
1583
+ # Add details about schedule state / backoff.
1584
+ state_details = None
1585
+ if job['schedule_state'] == 'ALIVE_BACKOFF':
1586
+ state_details = 'In backoff, waiting for resources'
1587
+ elif job['schedule_state'] in ('WAITING', 'ALIVE_WAITING'):
1588
+ priority = job.get('priority')
1589
+ if (priority is not None and
1590
+ priority < highest_blocking_priority):
1591
+ # Job is lower priority than some other blocking job.
1592
+ state_details = 'Waiting for higher priority jobs to launch'
1593
+ else:
1594
+ state_details = 'Waiting for other jobs to launch'
1595
+
1596
+ if state_details and job['failure_reason']:
1597
+ job['details'] = f'{state_details} - {job["failure_reason"]}'
1598
+ elif state_details:
1599
+ job['details'] = state_details
1600
+ elif job['failure_reason']:
1601
+ job['details'] = f'Failure: {job["failure_reason"]}'
1602
+ else:
1603
+ job['details'] = None
1604
+
1605
+ return {
1606
+ 'jobs': jobs,
1607
+ 'total': total,
1608
+ 'total_no_filter': total_no_filter,
1609
+ 'status_counts': status_counts
1610
+ }
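
A hedged usage sketch of the paginated queue API (runnable only on the jobs controller, where this module and its database are available; the filter values below are purely illustrative):

    # Page 2 of non-finished RUNNING jobs whose workspace contains 'default',
    # 50 rows per page, fetching only the columns needed for a compact table.
    result = get_managed_job_queue(
        skip_finished=True,
        workspace_match='default',
        statuses=['RUNNING'],
        page=2,
        limit=50,
        fields=['job_name', 'user_name', 'status', 'job_duration'],
    )
    print(result['total'])            # jobs matching the filters, all pages
    print(result['total_no_filter'])  # every job in the DB, ignoring filters
    print(result['status_counts'])    # e.g. {'RUNNING': 12, 'PENDING': 3}
    for job in result['jobs']:
        print(job['job_id'], job['status'], job.get('job_duration'))
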
1611
+
1612
+
1613
+ def filter_jobs(
1614
+ jobs: List[Dict[str, Any]],
1615
+ workspace_match: Optional[str],
1616
+ name_match: Optional[str],
1617
+ pool_match: Optional[str],
1618
+ page: Optional[int],
1619
+ limit: Optional[int],
1620
+ user_match: Optional[str] = None,
1621
+ enable_user_match: bool = False,
1622
+ statuses: Optional[List[str]] = None,
1623
+ ) -> Tuple[List[Dict[str, Any]], int, Dict[str, int]]:
1624
+ """Filter jobs based on the given criteria.
1625
+
1626
+ Args:
1627
+ jobs: List of jobs to filter.
1628
+ workspace_match: Workspace name to filter.
1629
+ name_match: Job name to filter.
1630
+ pool_match: Pool name to filter.
1631
+ page: Page to filter.
1632
+ limit: Limit to filter.
1633
+ user_match: User name to filter.
1634
+ enable_user_match: Whether to enable user match.
1635
+ statuses: Statuses to filter.
1636
+
1637
+ Returns:
1638
+ List of filtered jobs
1639
+ Total number of jobs
1640
+ Dictionary of status counts
1641
+ """
1642
+
1643
+ # TODO(hailong): refactor the whole function including the
1644
+ # `dump_managed_job_queue()` to use DB filtering.
1645
+
1646
+ def _pattern_matches(job: Dict[str, Any], key: str,
1647
+ pattern: Optional[str]) -> bool:
1648
+ if pattern is None:
1649
+ return True
1650
+ if key not in job:
1651
+ return False
1652
+ value = job[key]
1653
+ if not value:
1654
+ return False
1655
+ return pattern in str(value)
1656
+
1657
+ def _handle_page_and_limit(
1658
+ result: List[Dict[str, Any]],
1659
+ page: Optional[int],
1660
+ limit: Optional[int],
1661
+ ) -> List[Dict[str, Any]]:
1662
+ if page is None and limit is None:
1663
+ return result
1664
+ assert page is not None and limit is not None, (page, limit)
1665
+ # page starts from 1
1666
+ start = (page - 1) * limit
1667
+ end = min(start + limit, len(result))
1668
+ return result[start:end]
925
1669
 
1670
+ status_counts: Dict[str, int] = collections.defaultdict(int)
1671
+ result = []
1672
+ checks = [
1673
+ ('workspace', workspace_match),
1674
+ ('job_name', name_match),
1675
+ ('pool', pool_match),
1676
+ ]
1677
+ if enable_user_match:
1678
+ checks.append(('user_name', user_match))
1679
+
1680
+ for job in jobs:
1681
+ if not all(
1682
+ _pattern_matches(job, key, pattern) for key, pattern in checks):
1683
+ continue
1684
+ status_counts[job['status'].value] += 1
1685
+ if statuses:
1686
+ if job['status'].value not in statuses:
1687
+ continue
1688
+ result.append(job)
1689
+
1690
+ total = len(result)
1691
+
1692
+ return _handle_page_and_limit(result, page, limit), total, status_counts
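
A self-contained toy version of the same substring-match-then-paginate behaviour, with plain string statuses so the sketch runs on its own (the real function also tallies `status_counts` before applying the status filter):

    from typing import Any, Dict, List, Optional, Tuple


    def filter_and_paginate(jobs: List[Dict[str, Any]],
                            name_match: Optional[str] = None,
                            statuses: Optional[List[str]] = None,
                            page: Optional[int] = None,
                            limit: Optional[int] = None
                           ) -> Tuple[List[Dict[str, Any]], int]:
        matched = [
            job for job in jobs
            if (name_match is None or name_match in str(job.get('job_name', '')))
            and (not statuses or job['status'] in statuses)
        ]
        total = len(matched)
        if page is None or limit is None:
            return matched, total
        start = (page - 1) * limit  # pages are 1-indexed, as above
        return matched[start:start + limit], total


    jobs = [
        {'job_name': 'train-llm', 'status': 'RUNNING'},
        {'job_name': 'train-vision', 'status': 'SUCCEEDED'},
        {'job_name': 'eval', 'status': 'RUNNING'},
    ]
    print(filter_and_paginate(jobs, name_match='train', page=1, limit=1))
    # -> ([{'job_name': 'train-llm', 'status': 'RUNNING'}], 2)
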
926
1693
 
927
- def load_managed_job_queue(payload: str) -> List[Dict[str, Any]]:
1694
+
1695
+ def load_managed_job_queue(
1696
+ payload: str
1697
+ ) -> Tuple[List[Dict[str, Any]], int, ManagedJobQueueResultType, int, Dict[
1698
+ str, int]]:
928
1699
  """Load job queue from json string."""
929
- jobs = message_utils.decode_payload(payload)
1700
+ result = message_utils.decode_payload(payload)
1701
+ result_type = ManagedJobQueueResultType.DICT
1702
+ status_counts: Dict[str, int] = {}
1703
+ if isinstance(result, dict):
1704
+ jobs: List[Dict[str, Any]] = result['jobs']
1705
+ total: int = result['total']
1706
+ status_counts = result.get('status_counts', {})
1707
+ total_no_filter: int = result.get('total_no_filter', total)
1708
+ else:
1709
+ jobs = result
1710
+ total = len(jobs)
1711
+ total_no_filter = total
1712
+ result_type = ManagedJobQueueResultType.LIST
1713
+
1714
+ all_users = global_user_state.get_all_users()
1715
+ all_users_map = {user.id: user.name for user in all_users}
930
1716
  for job in jobs:
931
1717
  job['status'] = managed_job_state.ManagedJobStatus(job['status'])
932
1718
  if 'user_hash' in job and job['user_hash'] is not None:
933
1719
  # Skip jobs that do not have user_hash info.
934
1720
  # TODO(cooperc): Remove check before 0.12.0.
935
- job['user_name'] = global_user_state.get_user(job['user_hash']).name
936
- return jobs
1721
+ job['user_name'] = all_users_map.get(job['user_hash'])
1722
+ return jobs, total, result_type, total_no_filter, status_counts
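
The decoder has to accept both the new dict payload (`{'jobs': ..., 'total': ...}`) and the bare list emitted by older controllers. A standalone sketch of that shape handling, assuming the payload has already been JSON-decoded:

    from typing import Any, Dict, List, Tuple


    def split_queue_payload(
            decoded: Any) -> Tuple[List[Dict[str, Any]], int, int, Dict[str, int]]:
        """Normalize old (list) and new (dict) queue payloads to one shape."""
        if isinstance(decoded, dict):
            jobs = decoded['jobs']
            total = decoded['total']
            total_no_filter = decoded.get('total_no_filter', total)
            status_counts = decoded.get('status_counts', {})
        else:
            # Legacy controllers return a bare list of job dicts.
            jobs = decoded
            total = len(jobs)
            total_no_filter = total
            status_counts = {}
        return jobs, total, total_no_filter, status_counts


    print(split_queue_payload([{'job_id': 1, 'status': 'RUNNING'}]))
    print(split_queue_payload({'jobs': [], 'total': 0, 'status_counts': {}}))
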
937
1723
 
938
1724
 
939
1725
  def _get_job_status_from_tasks(
940
- job_tasks: List[Dict[str, Any]]
1726
+ job_tasks: Union[List[responses.ManagedJobRecord], List[Dict[str, Any]]]
941
1727
  ) -> Tuple[managed_job_state.ManagedJobStatus, int]:
942
1728
  """Get the current task status and the current task id for a job."""
943
1729
  managed_task_status = managed_job_state.ManagedJobStatus.SUCCEEDED
@@ -949,7 +1735,7 @@ def _get_job_status_from_tasks(
949
1735
  # Use the first non-succeeded status.
950
1736
  if managed_task_status != managed_job_state.ManagedJobStatus.SUCCEEDED:
951
1737
  # TODO(zhwu): we should not blindly use the first non-
952
- # succeeded as the status could be changed to SUBMITTED
1738
+ # succeeded as the status could be changed to PENDING
953
1739
  # when going from one task to the next one, which can be
954
1740
  # confusing.
955
1741
  break
@@ -957,29 +1743,40 @@ def _get_job_status_from_tasks(
957
1743
 
958
1744
 
959
1745
  @typing.overload
960
- def format_job_table(tasks: List[Dict[str, Any]],
961
- show_all: bool,
962
- show_user: bool,
963
- return_rows: Literal[False] = False,
964
- max_jobs: Optional[int] = None) -> str:
1746
+ def format_job_table(
1747
+ tasks: List[Dict[str, Any]],
1748
+ show_all: bool,
1749
+ show_user: bool,
1750
+ return_rows: Literal[False] = False,
1751
+ pool_status: Optional[List[Dict[str, Any]]] = None,
1752
+ max_jobs: Optional[int] = None,
1753
+ job_status_counts: Optional[Dict[str, int]] = None,
1754
+ ) -> str:
965
1755
  ...
966
1756
 
967
1757
 
968
1758
  @typing.overload
969
- def format_job_table(tasks: List[Dict[str, Any]],
970
- show_all: bool,
971
- show_user: bool,
972
- return_rows: Literal[True],
973
- max_jobs: Optional[int] = None) -> List[List[str]]:
1759
+ def format_job_table(
1760
+ tasks: List[Dict[str, Any]],
1761
+ show_all: bool,
1762
+ show_user: bool,
1763
+ return_rows: Literal[True],
1764
+ pool_status: Optional[List[Dict[str, Any]]] = None,
1765
+ max_jobs: Optional[int] = None,
1766
+ job_status_counts: Optional[Dict[str, int]] = None,
1767
+ ) -> List[List[str]]:
974
1768
  ...
975
1769
 
976
1770
 
977
1771
  def format_job_table(
978
- tasks: List[Dict[str, Any]],
979
- show_all: bool,
980
- show_user: bool,
981
- return_rows: bool = False,
982
- max_jobs: Optional[int] = None) -> Union[str, List[List[str]]]:
1772
+ tasks: List[Dict[str, Any]],
1773
+ show_all: bool,
1774
+ show_user: bool,
1775
+ return_rows: bool = False,
1776
+ pool_status: Optional[List[Dict[str, Any]]] = None,
1777
+ max_jobs: Optional[int] = None,
1778
+ job_status_counts: Optional[Dict[str, int]] = None,
1779
+ ) -> Union[str, List[List[str]]]:
983
1780
  """Returns managed jobs as a formatted string.
984
1781
 
985
1782
  Args:
@@ -988,13 +1785,15 @@ def format_job_table(
988
1785
  max_jobs: The maximum number of jobs to show in the table.
989
1786
  return_rows: If True, return the rows as a list of strings instead of
990
1787
  all rows concatenated into a single string.
1788
+ pool_status: List of pool status dictionaries with replica_info.
1789
+ job_status_counts: The counts of each job status.
991
1790
 
992
1791
  Returns: A formatted string of managed jobs, if not `return_rows`; otherwise
993
1792
  a list of "rows" (each of which is a list of str).
994
1793
  """
995
1794
  jobs = collections.defaultdict(list)
996
1795
  # Check if the tasks have user information from kubernetes.
997
- # This is only used for sky status --kubernetes.
1796
+ # This is only used for sky status-kubernetes.
998
1797
  tasks_have_k8s_user = any([task.get('user') for task in tasks])
999
1798
  if max_jobs and tasks_have_k8s_user:
1000
1799
  raise ValueError('max_jobs is not supported when tasks have user info.')
@@ -1004,16 +1803,41 @@ def format_job_table(
1004
1803
  return (task['user'], task['job_id'])
1005
1804
  return task['job_id']
1006
1805
 
1806
+ def _get_job_id_to_worker_map(
1807
+ pool_status: Optional[List[Dict[str, Any]]]) -> Dict[int, int]:
1808
+ """Create a mapping from job_id to worker replica_id.
1809
+
1810
+ Args:
1811
+ pool_status: List of pool status dictionaries with replica_info.
1812
+
1813
+ Returns:
1814
+ Dictionary mapping job_id to replica_id (worker ID).
1815
+ """
1816
+ job_to_worker: Dict[int, int] = {}
1817
+ if pool_status is None:
1818
+ return job_to_worker
1819
+ for pool in pool_status:
1820
+ replica_info = pool.get('replica_info', [])
1821
+ for replica in replica_info:
1822
+ used_by = replica.get('used_by')
1823
+ if used_by is not None:
1824
+ job_to_worker[used_by] = replica.get('replica_id')
1825
+ return job_to_worker
1826
+
1827
+ # Create mapping from job_id to worker replica_id
1828
+ job_to_worker = _get_job_id_to_worker_map(pool_status)
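
For instance, with an illustrative pool status payload shaped like the one `_get_job_id_to_worker_map` expects, the mapping works out as follows:

    pool_status = [{
        'replica_info': [
            {'replica_id': 1, 'used_by': 42},    # worker 1 is running job 42
            {'replica_id': 2, 'used_by': None},  # worker 2 is idle
        ],
    }]

    job_to_worker = {}
    for pool in pool_status:
        for replica in pool.get('replica_info', []):
            if replica.get('used_by') is not None:
                job_to_worker[replica['used_by']] = replica.get('replica_id')

    print(job_to_worker)  # -> {42: 1}
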
1829
+
1007
1830
  for task in tasks:
1008
1831
  # The tasks within the same job_id are already sorted
1009
1832
  # by the task_id.
1010
1833
  jobs[get_hash(task)].append(task)
1011
1834
 
1012
- status_counts: Dict[str, int] = collections.defaultdict(int)
1835
+ workspaces = set()
1013
1836
  for job_tasks in jobs.values():
1014
- managed_job_status = _get_job_status_from_tasks(job_tasks)[0]
1015
- if not managed_job_status.is_terminal():
1016
- status_counts[managed_job_status.value] += 1
1837
+ workspaces.add(job_tasks[0].get('workspace',
1838
+ constants.SKYPILOT_DEFAULT_WORKSPACE))
1839
+
1840
+ show_workspace = len(workspaces) > 1 or show_all
1017
1841
 
1018
1842
  user_cols: List[str] = []
1019
1843
  if show_user:
@@ -1024,26 +1848,43 @@ def format_job_table(
1024
1848
  columns = [
1025
1849
  'ID',
1026
1850
  'TASK',
1851
+ *(['WORKSPACE'] if show_workspace else []),
1027
1852
  'NAME',
1028
1853
  *user_cols,
1029
- 'RESOURCES',
1854
+ 'REQUESTED',
1030
1855
  'SUBMITTED',
1031
1856
  'TOT. DURATION',
1032
1857
  'JOB DURATION',
1033
1858
  '#RECOVERIES',
1034
1859
  'STATUS',
1860
+ 'POOL',
1035
1861
  ]
1036
1862
  if show_all:
1037
1863
  # TODO: move SCHED. STATE to a separate flag (e.g. --debug)
1038
- columns += ['STARTED', 'CLUSTER', 'REGION', 'SCHED. STATE', 'DETAILS']
1864
+ columns += [
1865
+ 'WORKER_CLUSTER',
1866
+ 'WORKER_JOB_ID',
1867
+ 'STARTED',
1868
+ 'INFRA',
1869
+ 'RESOURCES',
1870
+ 'SCHED. STATE',
1871
+ 'DETAILS',
1872
+ 'GIT_COMMIT',
1873
+ ]
1039
1874
  if tasks_have_k8s_user:
1040
1875
  columns.insert(0, 'USER')
1041
1876
  job_table = log_utils.create_table(columns)
1042
1877
 
1043
1878
  status_counts: Dict[str, int] = collections.defaultdict(int)
1044
- for task in tasks:
1045
- if not task['status'].is_terminal():
1046
- status_counts[task['status'].value] += 1
1879
+ if job_status_counts:
1880
+ for status_value, count in job_status_counts.items():
1881
+ status = managed_job_state.ManagedJobStatus(status_value)
1882
+ if not status.is_terminal():
1883
+ status_counts[status_value] = count
1884
+ else:
1885
+ for task in tasks:
1886
+ if not task['status'].is_terminal():
1887
+ status_counts[task['status'].value] += 1
1047
1888
 
1048
1889
  all_tasks = tasks
1049
1890
  if max_jobs is not None:
@@ -1054,7 +1895,10 @@ def format_job_table(
1054
1895
  # by the task_id.
1055
1896
  jobs[get_hash(task)].append(task)
1056
1897
 
1057
- def generate_details(failure_reason: Optional[str]) -> str:
1898
+ def generate_details(details: Optional[str],
1899
+ failure_reason: Optional[str]) -> str:
1900
+ if details is not None:
1901
+ return details
1058
1902
  if failure_reason is not None:
1059
1903
  return f'Failure: {failure_reason}'
1060
1904
  return '-'
@@ -1083,6 +1927,8 @@ def format_job_table(
1083
1927
  for job_hash, job_tasks in jobs.items():
1084
1928
  if show_all:
1085
1929
  schedule_state = job_tasks[0]['schedule_state']
1930
+ workspace = job_tasks[0].get('workspace',
1931
+ constants.SKYPILOT_DEFAULT_WORKSPACE)
1086
1932
 
1087
1933
  if len(job_tasks) > 1:
1088
1934
  # Aggregate the tasks into a new row in the table.
@@ -1120,10 +1966,20 @@ def format_job_table(
1120
1966
 
1121
1967
  user_values = get_user_column_values(job_tasks[0])
1122
1968
 
1969
+ pool = job_tasks[0].get('pool')
1970
+ if pool is None:
1971
+ pool = '-'
1972
+
1973
+ # Add worker information if job is assigned to a worker
1123
1974
  job_id = job_hash[1] if tasks_have_k8s_user else job_hash
1975
+ # job_id is now always an integer; use it to look up the worker
1976
+ if job_id in job_to_worker and pool != '-':
1977
+ pool = f'{pool} (worker={job_to_worker[job_id]})'
1978
+
1124
1979
  job_values = [
1125
1980
  job_id,
1126
1981
  '',
1982
+ *([''] if show_workspace else []),
1127
1983
  job_name,
1128
1984
  *user_values,
1129
1985
  '-',
@@ -1132,15 +1988,20 @@ def format_job_table(
1132
1988
  job_duration,
1133
1989
  recovery_cnt,
1134
1990
  status_str,
1991
+ pool,
1135
1992
  ]
1136
1993
  if show_all:
1994
+ details = job_tasks[current_task_id].get('details')
1137
1995
  failure_reason = job_tasks[current_task_id]['failure_reason']
1138
1996
  job_values.extend([
1997
+ '-',
1998
+ '-',
1139
1999
  '-',
1140
2000
  '-',
1141
2001
  '-',
1142
2002
  job_tasks[0]['schedule_state'],
1143
- generate_details(failure_reason),
2003
+ generate_details(details, failure_reason),
2004
+ job_tasks[0].get('metadata', {}).get('git_commit', '-'),
1144
2005
  ])
1145
2006
  if tasks_have_k8s_user:
1146
2007
  job_values.insert(0, job_tasks[0].get('user', '-'))
@@ -1153,9 +2014,20 @@ def format_job_table(
1153
2014
  0, task['job_duration'], absolute=True)
1154
2015
  submitted = log_utils.readable_time_duration(task['submitted_at'])
1155
2016
  user_values = get_user_column_values(task)
2017
+ task_workspace = '-' if len(job_tasks) > 1 else workspace
2018
+ pool = task.get('pool')
2019
+ if pool is None:
2020
+ pool = '-'
2021
+
2022
+ # Add worker information if task is assigned to a worker
2023
+ task_job_id = task['job_id']
2024
+ if task_job_id in job_to_worker and pool != '-':
2025
+ pool = f'{pool} (worker={job_to_worker[task_job_id]})'
2026
+
1156
2027
  values = [
1157
2028
  task['job_id'] if len(job_tasks) == 1 else ' \u21B3',
1158
2029
  task['task_id'] if len(job_tasks) > 1 else '-',
2030
+ *([task_workspace] if show_workspace else []),
1159
2031
  task['task_name'],
1160
2032
  *user_values,
1161
2033
  task['resources'],
@@ -1168,20 +2040,50 @@ def format_job_table(
1168
2040
  job_duration,
1169
2041
  task['recovery_count'],
1170
2042
  task['status'].colored_str(),
2043
+ pool,
1171
2044
  ]
1172
2045
  if show_all:
1173
2046
  # schedule_state is only set at the job level, so if we have
1174
2047
  # more than one task, only display on the aggregated row.
1175
2048
  schedule_state = (task['schedule_state']
1176
2049
  if len(job_tasks) == 1 else '-')
2050
+ infra_str = task.get('infra')
2051
+ if infra_str is None:
2052
+ cloud = task.get('cloud')
2053
+ if cloud is None:
2054
+ # Backward compatibility for old jobs controllers that do not
2055
+ # return cloud info; we parse it from the cluster
2056
+ # resources.
2057
+ # TODO(zhwu): remove this after 0.12.0
2058
+ cloud = task['cluster_resources'].split('(')[0].split(
2059
+ 'x')[-1]
2060
+ task['cluster_resources'] = task[
2061
+ 'cluster_resources'].replace(f'{cloud}(',
2062
+ '(').replace(
2063
+ 'x ', 'x')
2064
+ region = task['region']
2065
+ zone = task.get('zone')
2066
+ if cloud == '-':
2067
+ cloud = None
2068
+ if region == '-':
2069
+ region = None
2070
+ if zone == '-':
2071
+ zone = None
2072
+ infra_str = infra_utils.InfraInfo(cloud, region,
2073
+ zone).formatted_str()
1177
2074
  values.extend([
2075
+ task.get('current_cluster_name', '-'),
2076
+ task.get('job_id_on_pool_cluster', '-'),
1178
2077
  # STARTED
1179
2078
  log_utils.readable_time_duration(task['start_at']),
2079
+ infra_str,
1180
2080
  task['cluster_resources'],
1181
- task['region'],
1182
2081
  schedule_state,
1183
- generate_details(task['failure_reason']),
2082
+ generate_details(task.get('details'),
2083
+ task['failure_reason']),
1184
2084
  ])
2085
+
2086
+ values.append(task.get('metadata', {}).get('git_commit', '-'))
1185
2087
  if tasks_have_k8s_user:
1186
2088
  values.insert(0, task.get('user', '-'))
1187
2089
  job_table.add_row(values)
@@ -1204,6 +2106,59 @@ def format_job_table(
1204
2106
  return output
1205
2107
 
1206
2108
 
2109
+ def decode_managed_job_protos(
2110
+ job_protos: Iterable['managed_jobsv1_pb2.ManagedJobInfo']
2111
+ ) -> List[Dict[str, Any]]:
2112
+ """Decode job protos to dicts. Similar to load_managed_job_queue."""
2113
+ user_hash_to_user = global_user_state.get_users(
2114
+ set(job.user_hash for job in job_protos if job.user_hash))
2115
+
2116
+ jobs = []
2117
+ for job_proto in job_protos:
2118
+ job_dict = _job_proto_to_dict(job_proto)
2119
+ user_hash = job_dict.get('user_hash', None)
2120
+ if user_hash is not None:
2121
+ # Skip jobs that do not have user_hash info.
2122
+ # TODO(cooperc): Remove check before 0.12.0.
2123
+ user = user_hash_to_user.get(user_hash, None)
2124
+ job_dict['user_name'] = user.name if user is not None else None
2125
+ jobs.append(job_dict)
2126
+ return jobs
2127
+
2128
+
2129
+ def _job_proto_to_dict(
2130
+ job_proto: 'managed_jobsv1_pb2.ManagedJobInfo') -> Dict[str, Any]:
2131
+ job_dict = json_format.MessageToDict(
2132
+ job_proto,
2133
+ always_print_fields_with_no_presence=True,
2134
+ # Our API returns fields in snake_case.
2135
+ preserving_proto_field_name=True,
2136
+ use_integers_for_enums=True)
2137
+ for field in job_proto.DESCRIPTOR.fields:
2138
+ # Ensure optional fields are present with None values for
2139
+ # backwards compatibility with older clients.
2140
+ if field.has_presence and field.name not in job_dict:
2141
+ job_dict[field.name] = None
2142
+ # json_format.MessageToDict is meant for encoding to JSON,
2143
+ # and Protobuf encodes int64 as decimal strings in JSON,
2144
+ # so we need to convert them back to ints.
2145
+ # https://protobuf.dev/programming-guides/json/#field-representation
2146
+ if (field.type == descriptor.FieldDescriptor.TYPE_INT64 and
2147
+ job_dict.get(field.name) is not None):
2148
+ job_dict[field.name] = int(job_dict[field.name])
2149
+ job_dict['status'] = managed_job_state.ManagedJobStatus.from_protobuf(
2150
+ job_dict['status'])
2151
+ # For backwards compatibility, convert schedule_state to a string,
2152
+ # as we don't have the logic to handle it in our request
2153
+ # encoder/decoder, unlike status.
2154
+ schedule_state_enum = (
2155
+ managed_job_state.ManagedJobScheduleState.from_protobuf(
2156
+ job_dict['schedule_state']))
2157
+ job_dict['schedule_state'] = (schedule_state_enum.value
2158
+ if schedule_state_enum is not None else None)
2159
+ return job_dict
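
The int64-to-int conversion above is needed because `json_format.MessageToDict` follows the proto3 JSON mapping, which renders 64-bit integers as decimal strings. A quick standalone demonstration with a stock wrapper type:

    from google.protobuf import json_format
    from google.protobuf.wrappers_pb2 import Int64Value

    msg = Int64Value(value=1_099_511_627_776)  # int64 field holding 2**40
    as_dict = json_format.MessageToDict(msg)
    print(as_dict)                              # {'value': '1099511627776'}
    print(int(as_dict['value']) == msg.value)   # True after converting back
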
2160
+
2161
+
1207
2162
  class ManagedJobCodeGen:
1208
2163
  """Code generator for managed job utility functions.
1209
2164
 
@@ -1221,9 +2176,62 @@ class ManagedJobCodeGen:
1221
2176
  """)
1222
2177
 
1223
2178
  @classmethod
1224
- def get_job_table(cls) -> str:
1225
- code = textwrap.dedent("""\
1226
- job_table = utils.dump_managed_job_queue()
2179
+ def get_job_table(
2180
+ cls,
2181
+ skip_finished: bool = False,
2182
+ accessible_workspaces: Optional[List[str]] = None,
2183
+ job_ids: Optional[List[int]] = None,
2184
+ workspace_match: Optional[str] = None,
2185
+ name_match: Optional[str] = None,
2186
+ pool_match: Optional[str] = None,
2187
+ page: Optional[int] = None,
2188
+ limit: Optional[int] = None,
2189
+ user_hashes: Optional[List[Optional[str]]] = None,
2190
+ statuses: Optional[List[str]] = None,
2191
+ fields: Optional[List[str]] = None,
2192
+ ) -> str:
2193
+ code = textwrap.dedent(f"""\
2194
+ if managed_job_version < 9:
2195
+ # For backward compatibility, since filtering is not supported
2196
+ # before #6652.
2197
+ # TODO(hailong): Remove compatibility before 0.12.0
2198
+ job_table = utils.dump_managed_job_queue()
2199
+ elif managed_job_version < 10:
2200
+ job_table = utils.dump_managed_job_queue(
2201
+ skip_finished={skip_finished},
2202
+ accessible_workspaces={accessible_workspaces!r},
2203
+ job_ids={job_ids!r},
2204
+ workspace_match={workspace_match!r},
2205
+ name_match={name_match!r},
2206
+ pool_match={pool_match!r},
2207
+ page={page!r},
2208
+ limit={limit!r},
2209
+ user_hashes={user_hashes!r})
2210
+ elif managed_job_version < 12:
2211
+ job_table = utils.dump_managed_job_queue(
2212
+ skip_finished={skip_finished},
2213
+ accessible_workspaces={accessible_workspaces!r},
2214
+ job_ids={job_ids!r},
2215
+ workspace_match={workspace_match!r},
2216
+ name_match={name_match!r},
2217
+ pool_match={pool_match!r},
2218
+ page={page!r},
2219
+ limit={limit!r},
2220
+ user_hashes={user_hashes!r},
2221
+ statuses={statuses!r})
2222
+ else:
2223
+ job_table = utils.dump_managed_job_queue(
2224
+ skip_finished={skip_finished},
2225
+ accessible_workspaces={accessible_workspaces!r},
2226
+ job_ids={job_ids!r},
2227
+ workspace_match={workspace_match!r},
2228
+ name_match={name_match!r},
2229
+ pool_match={pool_match!r},
2230
+ page={page!r},
2231
+ limit={limit!r},
2232
+ user_hashes={user_hashes!r},
2233
+ statuses={statuses!r},
2234
+ fields={fields!r})
1227
2235
  print(job_table, flush=True)
1228
2236
  """)
1229
2237
  return cls._build(code)
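
The method above emits remote Python that branches on `managed_job_version`, so a new client can still drive an older controller. A reduced, illustrative sketch of the same pattern (not SkyPilot's API; the `< 9` cutoff mirrors the fallback branch above):

    import textwrap


    def build_queue_snippet(remote_version: int, name_match: str) -> str:
        """Build the Python snippet to run on a (possibly older) controller."""
        if remote_version < 9:
            # Older controllers do not accept filter arguments; dump everything
            # and let the client filter the decoded payload instead.
            return textwrap.dedent("""\
                job_table = utils.dump_managed_job_queue()
                print(job_table, flush=True)
                """)
        return textwrap.dedent(f"""\
            job_table = utils.dump_managed_job_queue(name_match={name_match!r})
            print(job_table, flush=True)
            """)


    print(build_queue_snippet(8, 'train'))
    print(build_queue_snippet(12, 'train'))
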
@@ -1232,26 +2240,77 @@ class ManagedJobCodeGen:
1232
2240
  def cancel_jobs_by_id(cls,
1233
2241
  job_ids: Optional[List[int]],
1234
2242
  all_users: bool = False) -> str:
2243
+ active_workspace = skypilot_config.get_active_workspace()
1235
2244
  code = textwrap.dedent(f"""\
1236
2245
  if managed_job_version < 2:
1237
2246
  # For backward compatibility, since all_users is not supported
1238
- # before #4787. Assume th
2247
+ # before #4787.
1239
2248
  # TODO(cooperc): Remove compatibility before 0.12.0
1240
2249
  msg = utils.cancel_jobs_by_id({job_ids})
1241
- else:
2250
+ elif managed_job_version < 4:
2251
+ # For backward compatibility, since current_workspace is not
2252
+ # supported before #5660. Don't check the workspace.
2253
+ # TODO(zhwu): Remove compatibility before 0.12.0
1242
2254
  msg = utils.cancel_jobs_by_id({job_ids}, all_users={all_users})
2255
+ else:
2256
+ msg = utils.cancel_jobs_by_id({job_ids}, all_users={all_users},
2257
+ current_workspace={active_workspace!r})
1243
2258
  print(msg, end="", flush=True)
1244
2259
  """)
1245
2260
  return cls._build(code)
1246
2261
 
1247
2262
  @classmethod
1248
2263
  def cancel_job_by_name(cls, job_name: str) -> str:
2264
+ active_workspace = skypilot_config.get_active_workspace()
1249
2265
  code = textwrap.dedent(f"""\
1250
- msg = utils.cancel_job_by_name({job_name!r})
2266
+ if managed_job_version < 4:
2267
+ # For backward compatibility, since current_workspace is not
2268
+ # supported before #5660. Don't check the workspace.
2269
+ # TODO(zhwu): Remove compatibility before 0.12.0
2270
+ msg = utils.cancel_job_by_name({job_name!r})
2271
+ else:
2272
+ msg = utils.cancel_job_by_name({job_name!r}, {active_workspace!r})
1251
2273
  print(msg, end="", flush=True)
1252
2274
  """)
1253
2275
  return cls._build(code)
1254
2276
 
2277
+ @classmethod
2278
+ def cancel_jobs_by_pool(cls, pool_name: str) -> str:
2279
+ active_workspace = skypilot_config.get_active_workspace()
2280
+ code = textwrap.dedent(f"""\
2281
+ msg = utils.cancel_jobs_by_pool({pool_name!r}, {active_workspace!r})
2282
+ print(msg, end="", flush=True)
2283
+ """)
2284
+ return cls._build(code)
2285
+
2286
+ @classmethod
2287
+ def get_version_and_job_table(cls) -> str:
2288
+ """Generate code to get controller version and raw job table."""
2289
+ code = textwrap.dedent("""\
2290
+ from sky.skylet import constants as controller_constants
2291
+
2292
+ # Get controller version
2293
+ controller_version = controller_constants.SKYLET_VERSION
2294
+ print(f"controller_version:{controller_version}", flush=True)
2295
+
2296
+ # Get and print raw job table (load_managed_job_queue can parse this directly)
2297
+ job_table = utils.dump_managed_job_queue()
2298
+ print(job_table, flush=True)
2299
+ """)
2300
+ return cls._build(code)
2301
+
2302
+ @classmethod
2303
+ def get_version(cls) -> str:
2304
+ """Generate code to get controller version."""
2305
+ code = textwrap.dedent("""\
2306
+ from sky.skylet import constants as controller_constants
2307
+
2308
+ # Get controller version
2309
+ controller_version = controller_constants.SKYLET_VERSION
2310
+ print(f"controller_version:{controller_version}", flush=True)
2311
+ """)
2312
+ return cls._build(code)
2313
+
1255
2314
  @classmethod
1256
2315
  def get_all_job_ids_by_name(cls, job_name: Optional[str]) -> str:
1257
2316
  code = textwrap.dedent(f"""\
@@ -1266,10 +2325,16 @@ class ManagedJobCodeGen:
1266
2325
  job_name: Optional[str],
1267
2326
  job_id: Optional[int],
1268
2327
  follow: bool = True,
1269
- controller: bool = False) -> str:
2328
+ controller: bool = False,
2329
+ tail: Optional[int] = None) -> str:
1270
2330
  code = textwrap.dedent(f"""\
1271
- result = utils.stream_logs(job_id={job_id!r}, job_name={job_name!r},
1272
- follow={follow}, controller={controller})
2331
+ if managed_job_version < 6:
2332
+ # Versions before 6 did not support the tail parameter
2333
+ result = utils.stream_logs(job_id={job_id!r}, job_name={job_name!r},
2334
+ follow={follow}, controller={controller})
2335
+ else:
2336
+ result = utils.stream_logs(job_id={job_id!r}, job_name={job_name!r},
2337
+ follow={follow}, controller={controller}, tail={tail!r})
1273
2338
  if managed_job_version < 3:
1274
2339
  # Versions 2 and older did not return a retcode, so we just print
1275
2340
  # the result.
@@ -1283,18 +2348,44 @@ class ManagedJobCodeGen:
1283
2348
  return cls._build(code)
1284
2349
 
1285
2350
  @classmethod
1286
- def set_pending(cls, job_id: int, managed_job_dag: 'dag_lib.Dag') -> str:
2351
+ def set_pending(cls,
2352
+ job_id: int,
2353
+ managed_job_dag: 'dag_lib.Dag',
2354
+ workspace: str,
2355
+ entrypoint: str,
2356
+ user_hash: Optional[str] = None) -> str:
1287
2357
  dag_name = managed_job_dag.name
2358
+ pool = managed_job_dag.pool
1288
2359
  # Add the managed job to queue table.
1289
2360
  code = textwrap.dedent(f"""\
1290
- managed_job_state.set_job_info({job_id}, {dag_name!r})
2361
+ set_job_info_kwargs = {{'workspace': {workspace!r}}}
2362
+ if managed_job_version < 4:
2363
+ set_job_info_kwargs = {{}}
2364
+ if managed_job_version >= 5:
2365
+ set_job_info_kwargs['entrypoint'] = {entrypoint!r}
2366
+ if managed_job_version >= 8:
2367
+ from sky.serve import serve_state
2368
+ pool_hash = None
2369
+ if {pool!r} != None:
2370
+ pool_hash = serve_state.get_service_hash({pool!r})
2371
+ set_job_info_kwargs['pool'] = {pool!r}
2372
+ set_job_info_kwargs['pool_hash'] = pool_hash
2373
+ if managed_job_version >= 11:
2374
+ set_job_info_kwargs['user_hash'] = {user_hash!r}
2375
+ managed_job_state.set_job_info(
2376
+ {job_id}, {dag_name!r}, **set_job_info_kwargs)
1291
2377
  """)
1292
2378
  for task_id, task in enumerate(managed_job_dag.tasks):
1293
2379
  resources_str = backend_utils.get_task_resources_str(
1294
2380
  task, is_managed_job=True)
1295
2381
  code += textwrap.dedent(f"""\
1296
- managed_job_state.set_pending({job_id}, {task_id},
1297
- {task.name!r}, {resources_str!r})
2382
+ if managed_job_version < 7:
2383
+ managed_job_state.set_pending({job_id}, {task_id},
2384
+ {task.name!r}, {resources_str!r})
2385
+ else:
2386
+ managed_job_state.set_pending({job_id}, {task_id},
2387
+ {task.name!r}, {resources_str!r},
2388
+ {task.metadata_json!r})
1298
2389
  """)
1299
2390
  return cls._build(code)
1300
2391