skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397) hide show
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -2,14 +2,15 @@
2
2
  import copy
3
3
  import dataclasses
4
4
  import enum
5
- import inspect
6
5
  import json
7
6
  import math
8
7
  import os
9
8
  import pathlib
9
+ import random
10
10
  import re
11
11
  import shlex
12
12
  import signal
13
+ import socket
13
14
  import subprocess
14
15
  import sys
15
16
  import tempfile
@@ -17,8 +18,8 @@ import textwrap
17
18
  import threading
18
19
  import time
19
20
  import typing
20
- from typing import (Any, Callable, Dict, Iterable, List, Optional, Set, Tuple,
21
- Union)
21
+ from typing import (Any, Callable, Dict, Iterable, Iterator, List, Optional,
22
+ Set, Tuple, Union)
22
23
 
23
24
  import colorama
24
25
  import psutil
@@ -39,6 +40,7 @@ from sky import skypilot_config
39
40
  from sky import task as task_lib
40
41
  from sky.adaptors import common as adaptors_common
41
42
  from sky.backends import backend_utils
43
+ from sky.backends import task_codegen
42
44
  from sky.backends import wheel_utils
43
45
  from sky.clouds import cloud as sky_cloud
44
46
  from sky.clouds.utils import gcp_utils
@@ -48,14 +50,15 @@ from sky.provision import common as provision_common
48
50
  from sky.provision import instance_setup
49
51
  from sky.provision import metadata_utils
50
52
  from sky.provision import provisioner
53
+ from sky.provision.kubernetes import config as config_lib
51
54
  from sky.provision.kubernetes import utils as kubernetes_utils
55
+ from sky.serve import constants as serve_constants
52
56
  from sky.server.requests import requests as requests_lib
53
57
  from sky.skylet import autostop_lib
54
58
  from sky.skylet import constants
55
59
  from sky.skylet import job_lib
56
60
  from sky.skylet import log_lib
57
61
  from sky.usage import usage_lib
58
- from sky.utils import accelerator_registry
59
62
  from sky.utils import annotations
60
63
  from sky.utils import cluster_utils
61
64
  from sky.utils import command_runner
@@ -85,13 +88,34 @@ if typing.TYPE_CHECKING:
85
88
  from sky import dag
86
89
  from sky.schemas.generated import autostopv1_pb2
87
90
  from sky.schemas.generated import autostopv1_pb2_grpc
91
+ from sky.schemas.generated import jobsv1_pb2
92
+ from sky.schemas.generated import jobsv1_pb2_grpc
93
+ from sky.schemas.generated import managed_jobsv1_pb2
94
+ from sky.schemas.generated import managed_jobsv1_pb2_grpc
95
+ from sky.schemas.generated import servev1_pb2
96
+ from sky.schemas.generated import servev1_pb2_grpc
88
97
  else:
89
98
  # To avoid requiring grpcio to be installed on the client side.
90
- grpc = adaptors_common.LazyImport('grpc')
99
+ grpc = adaptors_common.LazyImport(
100
+ 'grpc',
101
+ # https://github.com/grpc/grpc/issues/37642 to avoid spam in console
102
+ set_loggers=lambda: os.environ.update({'GRPC_VERBOSITY': 'NONE'})
103
+ if not env_options.Options.SHOW_DEBUG_INFO.get() else None)
91
104
  autostopv1_pb2 = adaptors_common.LazyImport(
92
105
  'sky.schemas.generated.autostopv1_pb2')
93
106
  autostopv1_pb2_grpc = adaptors_common.LazyImport(
94
107
  'sky.schemas.generated.autostopv1_pb2_grpc')
108
+ jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
109
+ jobsv1_pb2_grpc = adaptors_common.LazyImport(
110
+ 'sky.schemas.generated.jobsv1_pb2_grpc')
111
+ servev1_pb2 = adaptors_common.LazyImport(
112
+ 'sky.schemas.generated.servev1_pb2')
113
+ servev1_pb2_grpc = adaptors_common.LazyImport(
114
+ 'sky.schemas.generated.servev1_pb2_grpc')
115
+ managed_jobsv1_pb2 = adaptors_common.LazyImport(
116
+ 'sky.schemas.generated.managed_jobsv1_pb2')
117
+ managed_jobsv1_pb2_grpc = adaptors_common.LazyImport(
118
+ 'sky.schemas.generated.managed_jobsv1_pb2_grpc')
95
119
 
96
120
  Path = str
97
121
 
@@ -113,6 +137,7 @@ _NODES_LAUNCHING_PROGRESS_TIMEOUT = {
113
137
  clouds.OCI: 300,
114
138
  clouds.Paperspace: 600,
115
139
  clouds.Kubernetes: 300,
140
+ clouds.Shadeform: 300,
116
141
  clouds.Vsphere: 240,
117
142
  }
118
143
 
@@ -179,6 +204,12 @@ _RAY_UP_WITH_MONKEY_PATCHED_HASH_LAUNCH_CONF_PATH = (
179
204
  # We use 100KB as a threshold to be safe for other arguments that
180
205
  # might be added during ssh.
181
206
  _MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
207
+ _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT = [
208
+ ('too long', 255),
209
+ ('request-uri too large', 1),
210
+ ('request header fields too large', 1),
211
+ ('400 bad request', 1), # CloudFlare 400 error
212
+ ]
182
213
 
183
214
  _RESOURCES_UNAVAILABLE_LOG = (
184
215
  'Reasons for provision failures (for details, please check the log above):')
@@ -199,6 +230,61 @@ def _is_command_length_over_limit(command: str) -> bool:
199
230
  return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
200
231
 
201
232
 
233
+ def _is_message_too_long(returncode: int,
234
+ output: Optional[str] = None,
235
+ file_path: Optional[str] = None) -> bool:
236
+ """Check if the message sent to the remote is too long.
237
+
238
+ We use inline script to run the setup or run command, i.e. the script will
239
+ be part of the message sent to the remote cluster. There is a chance that
240
+ the command is too long, when people has very long run or setup commands, or
241
+ there is a cloudflare proxy in front of the remote blocking the long
242
+ message. Several common causes are:
243
+ - SSH returning: `too long` in the error message.
244
+ - Cloudflare proxy returning: `414 Request-URI Too Large` or
245
+ `431 Request Header Fields Too Large` error.
246
+
247
+ We use a general length limit check before but it could be inaccurate on
248
+ some systems, e.g. cloudflare proxy, so this is necessary.
249
+
250
+ Args:
251
+ returncode: The return code of the setup command.
252
+ output: The output of the setup command.
253
+ file_path: The path to the setup log file.
254
+ """
255
+ assert (output is None) != (file_path is None), (
256
+ 'Either output or file_path must be provided.', output, file_path)
257
+ to_check = []
258
+ for (match_str,
259
+ desired_rc) in _EXCEPTION_MSG_AND_RETURNCODE_FOR_DUMP_INLINE_SCRIPT:
260
+ if desired_rc == returncode:
261
+ to_check.append(match_str)
262
+ if not to_check:
263
+ return False
264
+
265
+ def _check_output_for_match_str(output: str) -> bool:
266
+ for match_str in to_check:
267
+ if match_str.lower() in output.lower():
268
+ return True
269
+ return False
270
+
271
+ if file_path is not None:
272
+ try:
273
+ with open(os.path.expanduser(file_path), 'r',
274
+ encoding='utf-8') as f:
275
+ content = f.read()
276
+ return _check_output_for_match_str(content)
277
+ except Exception as e: # pylint: disable=broad-except
278
+ # We don't crash the setup if we cannot read the log file.
279
+ # Instead, we should retry the setup with dumping the script
280
+ # to a file to be safe.
281
+ logger.debug(f'Failed to read setup log file {file_path}: {e}')
282
+ return True
283
+ else:
284
+ assert output is not None, (output, file_path)
285
+ return _check_output_for_match_str(output)
286
+
287
+
202
288
  def _get_cluster_config_template(cloud):
203
289
  cloud_to_template = {
204
290
  clouds.AWS: 'aws-ray.yml.j2',
@@ -210,15 +296,18 @@ def _get_cluster_config_template(cloud):
210
296
  clouds.SCP: 'scp-ray.yml.j2',
211
297
  clouds.OCI: 'oci-ray.yml.j2',
212
298
  clouds.Paperspace: 'paperspace-ray.yml.j2',
299
+ clouds.PrimeIntellect: 'primeintellect-ray.yml.j2',
213
300
  clouds.DO: 'do-ray.yml.j2',
214
301
  clouds.RunPod: 'runpod-ray.yml.j2',
215
302
  clouds.Kubernetes: 'kubernetes-ray.yml.j2',
216
303
  clouds.SSH: 'kubernetes-ray.yml.j2',
304
+ clouds.Shadeform: 'shadeform-ray.yml.j2',
217
305
  clouds.Vsphere: 'vsphere-ray.yml.j2',
218
306
  clouds.Vast: 'vast-ray.yml.j2',
219
307
  clouds.Fluidstack: 'fluidstack-ray.yml.j2',
220
308
  clouds.Nebius: 'nebius-ray.yml.j2',
221
- clouds.Hyperbolic: 'hyperbolic-ray.yml.j2'
309
+ clouds.Hyperbolic: 'hyperbolic-ray.yml.j2',
310
+ clouds.Seeweb: 'seeweb-ray.yml.j2'
222
311
  }
223
312
  return cloud_to_template[type(cloud)]
224
313
 
@@ -248,511 +337,6 @@ def write_ray_up_script_with_patched_launch_hash_fn(
248
337
  return f.name
249
338
 
250
339
 
251
- class RayCodeGen:
252
- """Code generator of a Ray program that executes a sky.Task.
253
-
254
- Usage:
255
-
256
- >> codegen = RayCodegen()
257
- >> codegen.add_prologue()
258
-
259
- >> codegen.add_ray_task(...)
260
- >> codegen.add_ray_task(...)
261
-
262
- >> codegen.add_epilogue()
263
- >> code = codegen.build()
264
- """
265
-
266
- def __init__(self):
267
- # Code generated so far, to be joined via '\n'.
268
- self._code = []
269
- # Guard method calling order.
270
- self._has_prologue = False
271
- self._has_epilogue = False
272
-
273
- # For n nodes gang scheduling.
274
- self._has_gang_scheduling = False
275
- self._num_nodes = 0
276
-
277
- self._has_register_run_fn = False
278
-
279
- # job_id
280
- # Job ID is used to identify the job (also this generated code).
281
- # It is a int automatically generated by the DB on the cluster
282
- # and monotonically increasing starting from 1.
283
- # To generate the job ID, we use the following logic:
284
- # code = job_lib.JobLibCodeGen.add_job(username,
285
- # run_timestamp)
286
- # job_id = get_output(run_on_cluster(code))
287
- self.job_id = None
288
-
289
- def add_prologue(self, job_id: int) -> None:
290
- assert not self._has_prologue, 'add_prologue() called twice?'
291
- self._has_prologue = True
292
- self.job_id = job_id
293
- # Should use 'auto' or 'ray://<internal_head_ip>:10001' rather than
294
- # 'ray://localhost:10001', or 'ray://127.0.0.1:10001', for public cloud.
295
- # Otherwise, ray will fail to get the placement group because of a bug
296
- # in ray job.
297
- ray_address = 'auto'
298
- self._code = [
299
- textwrap.dedent(f"""\
300
- import functools
301
- import getpass
302
- import hashlib
303
- import io
304
- import os
305
- import pathlib
306
- import selectors
307
- import shlex
308
- import subprocess
309
- import sys
310
- import tempfile
311
- import textwrap
312
- import time
313
- from typing import Dict, List, Optional, Tuple, Union
314
-
315
- # Set the environment variables to avoid deduplicating logs and
316
- # scheduler events. This should be set in driver code, since we are
317
- # not using `ray job submit` anymore, and the environment variables
318
- # from the ray cluster is not inherited.
319
- os.environ['RAY_DEDUP_LOGS'] = '0'
320
- os.environ['RAY_SCHEDULER_EVENTS'] = '0'
321
-
322
- import ray
323
- import ray.util as ray_util
324
-
325
- from sky.skylet import autostop_lib
326
- from sky.skylet import constants
327
- from sky.skylet import job_lib
328
- from sky.utils import log_utils
329
- from sky.utils import subprocess_utils
330
-
331
- SKY_REMOTE_WORKDIR = {constants.SKY_REMOTE_WORKDIR!r}
332
-
333
- kwargs = dict()
334
- # Only set the `_temp_dir` to SkyPilot's ray cluster directory when
335
- # the directory exists for backward compatibility for the VM
336
- # launched before #1790.
337
- if os.path.exists({constants.SKY_REMOTE_RAY_TEMPDIR!r}):
338
- kwargs['_temp_dir'] = {constants.SKY_REMOTE_RAY_TEMPDIR!r}
339
- ray.init(
340
- address={ray_address!r},
341
- namespace='__sky__{job_id}__',
342
- log_to_driver=True,
343
- **kwargs
344
- )
345
- def get_or_fail(futures, pg) -> List[int]:
346
- \"\"\"Wait for tasks, if any fails, cancel all unready.\"\"\"
347
- if not futures:
348
- return []
349
- returncodes = [1] * len(futures)
350
- # Wait for 1 task to be ready.
351
- ready = []
352
- # Keep invoking ray.wait if ready is empty. This is because
353
- # ray.wait with timeout=None will only wait for 10**6 seconds,
354
- # which will cause tasks running for more than 12 days to return
355
- # before becoming ready.
356
- # (Such tasks are common in serving jobs.)
357
- # Reference: https://github.com/ray-project/ray/blob/ray-2.9.3/python/ray/_private/worker.py#L2845-L2846
358
- while not ready:
359
- ready, unready = ray.wait(futures)
360
- idx = futures.index(ready[0])
361
- returncodes[idx] = ray.get(ready[0])
362
- while unready:
363
- if returncodes[idx] != 0:
364
- for task in unready:
365
- # ray.cancel without force fails to kill tasks.
366
- # We use force=True to kill unready tasks.
367
- ray.cancel(task, force=True)
368
- # Use SIGKILL=128+9 to indicate the task is forcely
369
- # killed.
370
- idx = futures.index(task)
371
- returncodes[idx] = 137
372
- break
373
- ready, unready = ray.wait(unready)
374
- idx = futures.index(ready[0])
375
- returncodes[idx] = ray.get(ready[0])
376
- # Remove the placement group after all tasks are done, so that
377
- # the next job can be scheduled on the released resources
378
- # immediately.
379
- ray_util.remove_placement_group(pg)
380
- sys.stdout.flush()
381
- return returncodes
382
-
383
- run_fn = None
384
- futures = []
385
- """),
386
- # FIXME: This is a hack to make sure that the functions can be found
387
- # by ray.remote. This should be removed once we have a better way to
388
- # specify dependencies for ray.
389
- inspect.getsource(log_lib._ProcessingArgs), # pylint: disable=protected-access
390
- inspect.getsource(log_lib._get_context), # pylint: disable=protected-access
391
- inspect.getsource(log_lib._handle_io_stream), # pylint: disable=protected-access
392
- inspect.getsource(log_lib.process_subprocess_stream),
393
- inspect.getsource(log_lib.run_with_log),
394
- inspect.getsource(log_lib.make_task_bash_script),
395
- inspect.getsource(log_lib.add_ray_env_vars),
396
- inspect.getsource(log_lib.run_bash_command_with_log),
397
- 'run_bash_command_with_log = ray.remote(run_bash_command_with_log)',
398
- ]
399
- # Currently, the codegen program is/can only be submitted to the head
400
- # node, due to using job_lib for updating job statuses, and using
401
- # autostop_lib here.
402
- self._code.append(
403
- # Use hasattr to handle backward compatibility.
404
- # TODO(zongheng): remove in ~1-2 minor releases (currently 0.2.x).
405
- textwrap.dedent("""\
406
- if hasattr(autostop_lib, 'set_last_active_time_to_now'):
407
- autostop_lib.set_last_active_time_to_now()
408
- """))
409
- self._code += [
410
- f'job_lib.set_status({job_id!r}, job_lib.JobStatus.PENDING)',
411
- ]
412
-
413
- def add_gang_scheduling_placement_group_and_setup(
414
- self,
415
- num_nodes: int,
416
- resources_dict: Dict[str, float],
417
- stable_cluster_internal_ips: List[str],
418
- env_vars: Dict[str, str],
419
- setup_cmd: Optional[str] = None,
420
- setup_log_path: Optional[str] = None,
421
- ) -> None:
422
- """Create the gang scheduling placement group for a Task.
423
-
424
- cluster_ips_sorted is used to ensure that the SKY_NODE_RANK environment
425
- variable is assigned in a deterministic order whenever a new task is
426
- added.
427
- """
428
- assert self._has_prologue, (
429
- 'Call add_prologue() before '
430
- 'add_gang_scheduling_placement_group_and_setup().')
431
- self._has_gang_scheduling = True
432
- self._num_nodes = num_nodes
433
-
434
- bundles = [copy.copy(resources_dict) for _ in range(num_nodes)]
435
- # Set CPU to avoid ray hanging the resources allocation
436
- # for remote functions, since the task will request 1 CPU
437
- # by default.
438
- task_cpu_demand = resources_dict.pop('CPU')
439
-
440
- if resources_dict:
441
- assert len(resources_dict) == 1, (
442
- 'There can only be one type of accelerator per instance. '
443
- f'Found: {resources_dict}.')
444
- acc_name, acc_count = list(resources_dict.items())[0]
445
- gpu_dict = {'GPU': acc_count}
446
- # gpu_dict should be empty when the accelerator is not GPU.
447
- # TODO(zongheng,zhanghao): an alternative is to start the remote
448
- # cluster with custom resource 'GPU': <n> even if the accelerator(s)
449
- # are not GPU. We opt for the current solution for now.
450
- if accelerator_registry.is_schedulable_non_gpu_accelerator(
451
- acc_name):
452
- gpu_dict = {}
453
- for bundle in bundles:
454
- bundle.update({
455
- # Set the GPU to avoid ray hanging the resources allocation
456
- **gpu_dict,
457
- })
458
-
459
- streaming_message = (
460
- f'{ux_utils.INDENT_LAST_SYMBOL}Job started. Streaming logs... '
461
- f'{colorama.Style.DIM}(Ctrl-C to exit log streaming; job will not '
462
- f'be killed){colorama.Style.RESET_ALL}')
463
- self._code += [
464
- textwrap.dedent(f"""\
465
- pg = ray_util.placement_group({json.dumps(bundles)}, 'STRICT_SPREAD')
466
- plural = 's' if {num_nodes} > 1 else ''
467
- node_str = f'{num_nodes} node{{plural}}'
468
- message = ('{ux_utils.INDENT_SYMBOL}{colorama.Style.DIM}'
469
- 'Waiting for task resources on '
470
- f'{{node_str}}.{colorama.Style.RESET_ALL}')
471
- print(message, flush=True)
472
- # FIXME: This will print the error message from autoscaler if
473
- # it is waiting for other task to finish. We should hide the
474
- # error message.
475
- ray.get(pg.ready())
476
- print({streaming_message!r}, flush=True)
477
- """)
478
- ]
479
-
480
- job_id = self.job_id
481
- if setup_cmd is not None:
482
- setup_envs = env_vars.copy()
483
- setup_envs[constants.SKYPILOT_NUM_NODES] = str(num_nodes)
484
- self._code += [
485
- textwrap.dedent(f"""\
486
- setup_cmd = {setup_cmd!r}
487
- _SETUP_CPUS = 0.0001
488
- # The setup command will be run as a ray task with num_cpus=_SETUP_CPUS as the
489
- # requirement; this means Ray will set CUDA_VISIBLE_DEVICES to an empty string.
490
- # We unset it so that user setup command may properly use this env var.
491
- setup_cmd = 'unset CUDA_VISIBLE_DEVICES; ' + setup_cmd
492
- job_lib.set_status({job_id!r}, job_lib.JobStatus.SETTING_UP)
493
-
494
- # The schedule_step should be called after the job status is set to non-PENDING,
495
- # otherwise, the scheduler will think the current job is not submitted yet, and
496
- # skip the scheduling step.
497
- job_lib.scheduler.schedule_step()
498
-
499
- total_num_nodes = len(ray.nodes())
500
- setup_bundles = [{{"CPU": _SETUP_CPUS}} for _ in range(total_num_nodes)]
501
- setup_pg = ray.util.placement_group(setup_bundles, strategy='STRICT_SPREAD')
502
- setup_workers = [run_bash_command_with_log \\
503
- .options(
504
- name='setup',
505
- num_cpus=_SETUP_CPUS,
506
- scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
507
- placement_group=setup_pg,
508
- placement_group_bundle_index=i)
509
- ) \\
510
- .remote(
511
- setup_cmd,
512
- os.path.expanduser({setup_log_path!r}),
513
- env_vars={setup_envs!r},
514
- stream_logs=True,
515
- with_ray=True,
516
- ) for i in range(total_num_nodes)]
517
- setup_returncodes = get_or_fail(setup_workers, setup_pg)
518
- if sum(setup_returncodes) != 0:
519
- job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED_SETUP)
520
- # This waits for all streaming logs to finish.
521
- time.sleep(1)
522
- print('ERROR: {colorama.Fore.RED}Job {self.job_id}\\'s setup failed with '
523
- 'return code list:{colorama.Style.RESET_ALL}',
524
- setup_returncodes,
525
- flush=True)
526
- # Need this to set the job status in ray job to be FAILED.
527
- sys.exit(1)
528
- """)
529
- ]
530
-
531
- self._code.append(f'job_lib.set_job_started({self.job_id!r})')
532
- if setup_cmd is None:
533
- # Need to call schedule_step() to make sure the scheduler
534
- # schedule the next pending job.
535
- self._code.append('job_lib.scheduler.schedule_step()')
536
-
537
- # Export IP and node rank to the environment variables.
538
- self._code += [
539
- textwrap.dedent(f"""\
540
- @ray.remote
541
- def check_ip():
542
- return ray.util.get_node_ip_address()
543
- gang_scheduling_id_to_ip = ray.get([
544
- check_ip.options(
545
- num_cpus={task_cpu_demand},
546
- scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(
547
- placement_group=pg,
548
- placement_group_bundle_index=i
549
- )).remote()
550
- for i in range(pg.bundle_count)
551
- ])
552
-
553
- cluster_ips_to_node_id = {{ip: i for i, ip in enumerate({stable_cluster_internal_ips!r})}}
554
- job_ip_rank_list = sorted(gang_scheduling_id_to_ip, key=cluster_ips_to_node_id.get)
555
- job_ip_rank_map = {{ip: i for i, ip in enumerate(job_ip_rank_list)}}
556
- job_ip_list_str = '\\n'.join(job_ip_rank_list)
557
- """),
558
- ]
559
-
560
- def register_run_fn(self, run_fn: str, run_fn_name: str) -> None:
561
- """Register the run function to be run on the remote cluster.
562
-
563
- Args:
564
- run_fn: The run function to be run on the remote cluster.
565
- """
566
- assert self._has_gang_scheduling, (
567
- 'Call add_gang_scheduling_placement_group_and_setup() '
568
- 'before register_run_fn().')
569
- assert not self._has_register_run_fn, (
570
- 'register_run_fn() called twice?')
571
- self._has_register_run_fn = True
572
-
573
- self._code += [
574
- run_fn,
575
- f'run_fn = {run_fn_name}',
576
- ]
577
-
578
- def add_ray_task(self,
579
- bash_script: Optional[str],
580
- task_name: Optional[str],
581
- ray_resources_dict: Dict[str, float],
582
- log_dir: str,
583
- env_vars: Optional[Dict[str, str]] = None,
584
- gang_scheduling_id: int = 0) -> None:
585
- """Generates code for a ray remote task that runs a bash command."""
586
- assert self._has_gang_scheduling, (
587
- 'Call add_gang_scheduling_placement_group_and_setup() before '
588
- 'add_ray_task().')
589
- assert (not self._has_register_run_fn or
590
- bash_script is None), ('bash_script should '
591
- 'be None when run_fn is registered.')
592
- task_cpu_demand = ray_resources_dict.pop('CPU')
593
- # Build remote_task.options(...)
594
- # resources=...
595
- # num_gpus=...
596
- options = []
597
- options.append(f'num_cpus={task_cpu_demand}')
598
-
599
- num_gpus = 0.0
600
- if ray_resources_dict:
601
- assert len(ray_resources_dict) == 1, (
602
- 'There can only be one type of accelerator per instance. '
603
- f'Found: {ray_resources_dict}.')
604
- num_gpus = list(ray_resources_dict.values())[0]
605
- options.append(f'resources={json.dumps(ray_resources_dict)}')
606
-
607
- resources_key = list(ray_resources_dict.keys())[0]
608
- if not accelerator_registry.is_schedulable_non_gpu_accelerator(
609
- resources_key):
610
- # `num_gpus` should be empty when the accelerator is not GPU.
611
- # FIXME: use a set of GPU types, instead of 'tpu' in the key.
612
-
613
- # Passing this ensures that the Ray remote task gets
614
- # CUDA_VISIBLE_DEVICES set correctly. If not passed, that flag
615
- # would be force-set to empty by Ray.
616
- options.append(f'num_gpus={num_gpus}')
617
- options.append(
618
- 'scheduling_strategy=ray.util.scheduling_strategies.PlacementGroupSchedulingStrategy(' # pylint: disable=line-too-long
619
- 'placement_group=pg, '
620
- f'placement_group_bundle_index={gang_scheduling_id})')
621
-
622
- sky_env_vars_dict_str = [
623
- textwrap.dedent(f"""\
624
- sky_env_vars_dict = {{}}
625
- sky_env_vars_dict['{constants.SKYPILOT_NODE_IPS}'] = job_ip_list_str
626
- sky_env_vars_dict['{constants.SKYPILOT_NUM_NODES}'] = len(job_ip_rank_list)
627
- """)
628
- ]
629
-
630
- if env_vars is not None:
631
- sky_env_vars_dict_str.extend(f'sky_env_vars_dict[{k!r}] = {v!r}'
632
- for k, v in env_vars.items())
633
- sky_env_vars_dict_str = '\n'.join(sky_env_vars_dict_str)
634
-
635
- options_str = ', '.join(options)
636
- logger.debug('Added Task with options: '
637
- f'{options_str}')
638
- # Script to block completion of a job until all storage mounted with
639
- # CACHED_MOUNT mode is uploaded to remote.
640
- rclone_flush_script = textwrap.dedent(f"""\
641
-
642
- # Only waits if cached mount is enabled (RCLONE_MOUNT_CACHED_LOG_DIR is not empty)
643
- # findmnt alone is not enough, as some clouds (e.g. AWS on ARM64) uses
644
- # rclone for normal mounts as well.
645
- if [ $(findmnt -t fuse.rclone --noheading | wc -l) -gt 0 ] && \
646
- [ -d {constants.RCLONE_MOUNT_CACHED_LOG_DIR} ] && \
647
- [ "$(ls -A {constants.RCLONE_MOUNT_CACHED_LOG_DIR})" ]; then
648
- flushed=0
649
- # extra second on top of --vfs-cache-poll-interval to
650
- # avoid race condition between rclone log line creation and this check.
651
- sleep 1
652
- while [ $flushed -eq 0 ]; do
653
- # sleep for the same interval as --vfs-cache-poll-interval
654
- sleep {constants.RCLONE_CACHE_REFRESH_INTERVAL}
655
- flushed=1
656
- for file in {constants.RCLONE_MOUNT_CACHED_LOG_DIR}/*; do
657
- exitcode=0
658
- tac $file | grep "vfs cache: cleaned:" -m 1 | grep "in use 0, to upload 0, uploading 0" -q || exitcode=$?
659
- if [ $exitcode -ne 0 ]; then
660
- echo "skypilot: cached mount is still uploading to remote"
661
- flushed=0
662
- break
663
- fi
664
- done
665
- done
666
- echo "skypilot: cached mount uploaded complete"
667
- fi""")
668
- self._code += [
669
- sky_env_vars_dict_str,
670
- textwrap.dedent(f"""\
671
- script = {bash_script!r}
672
- rclone_flush_script = {rclone_flush_script!r}
673
- if run_fn is not None:
674
- script = run_fn({gang_scheduling_id}, gang_scheduling_id_to_ip)
675
-
676
- if script is not None:
677
- script += rclone_flush_script
678
- sky_env_vars_dict['{constants.SKYPILOT_NUM_GPUS_PER_NODE}'] = {int(math.ceil(num_gpus))!r}
679
-
680
- ip = gang_scheduling_id_to_ip[{gang_scheduling_id!r}]
681
- rank = job_ip_rank_map[ip]
682
-
683
- if len(cluster_ips_to_node_id) == 1: # Single-node task on single-node cluter
684
- name_str = '{task_name},' if {task_name!r} != None else 'task,'
685
- log_path = os.path.expanduser(os.path.join({log_dir!r}, 'run.log'))
686
- else: # Single-node or multi-node task on multi-node cluster
687
- idx_in_cluster = cluster_ips_to_node_id[ip]
688
- if cluster_ips_to_node_id[ip] == 0:
689
- node_name = 'head'
690
- else:
691
- node_name = f'worker{{idx_in_cluster}}'
692
- name_str = f'{{node_name}}, rank={{rank}},'
693
- log_path = os.path.expanduser(os.path.join({log_dir!r}, f'{{rank}}-{{node_name}}.log'))
694
- sky_env_vars_dict['{constants.SKYPILOT_NODE_RANK}'] = rank
695
-
696
- sky_env_vars_dict['SKYPILOT_INTERNAL_JOB_ID'] = {self.job_id}
697
-
698
- futures.append(run_bash_command_with_log \\
699
- .options(name=name_str, {options_str}) \\
700
- .remote(
701
- script,
702
- log_path,
703
- env_vars=sky_env_vars_dict,
704
- stream_logs=True,
705
- with_ray=True,
706
- ))""")
707
- ]
708
-
709
- def add_epilogue(self) -> None:
710
- """Generates code that waits for all tasks, then exits."""
711
- assert self._has_prologue, 'Call add_prologue() before add_epilogue().'
712
- assert not self._has_epilogue, 'add_epilogue() called twice?'
713
- self._has_epilogue = True
714
-
715
- self._code += [
716
- textwrap.dedent(f"""\
717
- returncodes = get_or_fail(futures, pg)
718
- if sum(returncodes) != 0:
719
- job_lib.set_status({self.job_id!r}, job_lib.JobStatus.FAILED)
720
- # Schedule the next pending job immediately to make the job
721
- # scheduling more efficient.
722
- job_lib.scheduler.schedule_step()
723
- # This waits for all streaming logs to finish.
724
- time.sleep(0.5)
725
- reason = ''
726
- # 139 is the return code of SIGSEGV, i.e. Segmentation Fault.
727
- if any(r == 139 for r in returncodes):
728
- reason = '(likely due to Segmentation Fault)'
729
- if any(r == 137 for r in returncodes):
730
- # Find the first non-137 return code
731
- non_137 = next(r for r in returncodes if r != 137)
732
- reason = f'(A Worker failed with return code {{non_137}}, SkyPilot cleaned up the processes on other nodes with return code 137)'
733
- print('ERROR: {colorama.Fore.RED}Job {self.job_id} failed with '
734
- 'return code list:{colorama.Style.RESET_ALL}',
735
- returncodes,
736
- reason,
737
- flush=True)
738
- # Need this to set the job status in ray job to be FAILED.
739
- sys.exit(1)
740
- else:
741
- job_lib.set_status({self.job_id!r}, job_lib.JobStatus.SUCCEEDED)
742
- # Schedule the next pending job immediately to make the job
743
- # scheduling more efficient.
744
- job_lib.scheduler.schedule_step()
745
- # This waits for all streaming logs to finish.
746
- time.sleep(0.5)
747
- """)
748
- ]
749
-
750
- def build(self) -> str:
751
- """Returns the entire generated program."""
752
- assert self._has_epilogue, 'Call add_epilogue() before build().'
753
- return '\n'.join(self._code)
754
-
755
-
756
340
  class GangSchedulingStatus(enum.Enum):
757
341
  """Enum for gang scheduling status."""
758
342
  CLUSTER_READY = 0
@@ -1340,6 +924,34 @@ class RetryingVmProvisioner(object):
1340
924
  zones = [clouds.Zone(name=to_provision.zone)]
1341
925
  yield zones
1342
926
 
927
+ def _insufficient_resources_msg(
928
+ self,
929
+ to_provision: resources_lib.Resources,
930
+ requested_resources: Set[resources_lib.Resources],
931
+ insufficient_resources: Optional[List[str]],
932
+ ) -> str:
933
+ insufficent_resource_msg = ('' if insufficient_resources is None else
934
+ f' ({", ".join(insufficient_resources)})')
935
+ message = f'Failed to acquire resources{insufficent_resource_msg} '
936
+ if to_provision.zone is not None:
937
+ message += (f'in {to_provision.zone} for {requested_resources}. ')
938
+ elif to_provision.region is not None and to_provision.cloud is not None:
939
+ # For public clouds, provision.region is always set.
940
+ if clouds.SSH().is_same_cloud(to_provision.cloud):
941
+ message += (
942
+ f'in SSH Node Pool ({to_provision.region.lstrip("ssh-")}) '
943
+ f'for {requested_resources}. The SSH Node Pool may not '
944
+ 'have enough resources.')
945
+ elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
946
+ message += (f'in context {to_provision.region} for '
947
+ f'{requested_resources}. ')
948
+ else:
949
+ message += (f'in all zones in {to_provision.region} for '
950
+ f'{requested_resources}. ')
951
+ else:
952
+ message += (f'{to_provision.cloud} for {requested_resources}. ')
953
+ return message
954
+
1343
955
  def _retry_zones(
1344
956
  self,
1345
957
  to_provision: resources_lib.Resources,
@@ -1418,6 +1030,7 @@ class RetryingVmProvisioner(object):
1418
1030
  f'To request quotas, check the instruction: '
1419
1031
  f'https://docs.skypilot.co/en/latest/cloud-setup/quota.html.')
1420
1032
 
1033
+ insufficient_resources = None
1421
1034
  for zones in self._yield_zones(to_provision, num_nodes, cluster_name,
1422
1035
  prev_cluster_status,
1423
1036
  prev_cluster_ever_up):
@@ -1630,6 +1243,24 @@ class RetryingVmProvisioner(object):
1630
1243
  # No teardown happens for this error.
1631
1244
  with ux_utils.print_exception_no_traceback():
1632
1245
  raise
1246
+ except config_lib.KubernetesError as e:
1247
+ if e.insufficent_resources:
1248
+ insufficient_resources = e.insufficent_resources
1249
+ # NOTE: We try to cleanup the cluster even if the previous
1250
+ # cluster does not exist. Also we are fast at
1251
+ # cleaning up clusters now if there is no existing node.
1252
+ CloudVmRayBackend().post_teardown_cleanup(
1253
+ handle,
1254
+ terminate=not prev_cluster_ever_up,
1255
+ remove_from_db=False,
1256
+ failover=True,
1257
+ )
1258
+ # TODO(suquark): other clouds may have different zone
1259
+ # blocking strategy. See '_update_blocklist_on_error'
1260
+ # for details.
1261
+ FailoverCloudErrorHandlerV2.update_blocklist_on_error(
1262
+ self._blocked_resources, to_provision, region, zones, e)
1263
+ continue
1633
1264
  except Exception as e: # pylint: disable=broad-except
1634
1265
  # NOTE: We try to cleanup the cluster even if the previous
1635
1266
  # cluster does not exist. Also we are fast at
@@ -1760,26 +1391,9 @@ class RetryingVmProvisioner(object):
1760
1391
  terminate=terminate_or_stop,
1761
1392
  remove_from_db=False)
1762
1393
 
1763
- if to_provision.zone is not None:
1764
- message = (
1765
- f'Failed to acquire resources in {to_provision.zone} for '
1766
- f'{requested_resources}. ')
1767
- elif to_provision.region is not None:
1768
- # For public clouds, provision.region is always set.
1769
- if clouds.SSH().is_same_cloud(to_provision.cloud):
1770
- message = ('Failed to acquire resources in SSH Node Pool '
1771
- f'({to_provision.region.lstrip("ssh-")}) for '
1772
- f'{requested_resources}. The SSH Node Pool may not '
1773
- 'have enough resources.')
1774
- elif clouds.Kubernetes().is_same_cloud(to_provision.cloud):
1775
- message = ('Failed to acquire resources in context '
1776
- f'{to_provision.region} for {requested_resources}. ')
1777
- else:
1778
- message = ('Failed to acquire resources in all zones in '
1779
- f'{to_provision.region} for {requested_resources}. ')
1780
- else:
1781
- message = (f'Failed to acquire resources in {to_provision.cloud} '
1782
- f'for {requested_resources}. ')
1394
+ message = self._insufficient_resources_msg(to_provision,
1395
+ requested_resources,
1396
+ insufficient_resources)
1783
1397
  # Do not failover to other locations if the cluster was ever up, since
1784
1398
  # the user can have some data on the cluster.
1785
1399
  raise exceptions.ResourcesUnavailableError(
@@ -2175,8 +1789,6 @@ class RetryingVmProvisioner(object):
2175
1789
  # terminated by _retry_zones().
2176
1790
  assert (prev_cluster_status == status_lib.ClusterStatus.INIT
2177
1791
  ), prev_cluster_status
2178
- assert global_user_state.get_handle_from_cluster_name(
2179
- cluster_name) is None, cluster_name
2180
1792
  logger.info(
2181
1793
  ux_utils.retry_message(
2182
1794
  f'Retrying provisioning with requested resources: '
@@ -2215,9 +1827,8 @@ class RetryingVmProvisioner(object):
2215
1827
  for (resource, exception) in resource_exceptions.items():
2216
1828
  table.add_row([
2217
1829
  resource.infra.formatted_str(),
2218
- resources_utils.format_resource(resource,
2219
- simplify=True),
2220
- exception
1830
+ resources_utils.format_resource(
1831
+ resource, simplified_only=True)[0], exception
2221
1832
  ])
2222
1833
  # Set the max width of REASON column to 80 to avoid the table
2223
1834
  # being wrapped in a unreadable way.
@@ -2239,6 +1850,18 @@ class SSHTunnelInfo:
2239
1850
  pid: int
2240
1851
 
2241
1852
 
1853
+ def _is_tunnel_healthy(tunnel: SSHTunnelInfo) -> bool:
1854
+ try:
1855
+ with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
1856
+ s.settimeout(0.5)
1857
+ s.connect(('localhost', tunnel.port))
1858
+ return True
1859
+ except socket.error as e:
1860
+ logger.warning(f'Failed to connect to tunnel on port {tunnel.port}: '
1861
+ f'{common_utils.format_exception(e)}')
1862
+ return False
1863
+
1864
+
2242
1865
  class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2243
1866
  """A pickle-able handle to a cluster created by CloudVmRayBackend.
2244
1867
 
@@ -2261,8 +1884,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2261
1884
  - (optional) Skylet SSH tunnel info.
2262
1885
  """
2263
1886
  # Bump if any fields get added/removed/changed, and add backward
2264
- # compaitibility logic in __setstate__.
2265
- _VERSION = 11
1887
+ # compatibility logic in __setstate__ and/or __getstate__.
1888
+ _VERSION = 12
2266
1889
 
2267
1890
  def __init__(
2268
1891
  self,
@@ -2296,7 +1919,6 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2296
1919
  self.launched_resources = launched_resources
2297
1920
  self.docker_user: Optional[str] = None
2298
1921
  self.is_grpc_enabled = True
2299
- self.skylet_ssh_tunnel: Optional[SSHTunnelInfo] = None
2300
1922
 
2301
1923
  def __repr__(self):
2302
1924
  return (f'ResourceHandle('
@@ -2313,12 +1935,14 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  f'{self.launched_resources}, '
  f'\n\tdocker_user={self.docker_user},'
  f'\n\tssh_user={self.ssh_user},'
- f'\n\tis_grpc_enabled={self.is_grpc_enabled},'
- f'\n\tskylet_ssh_tunnel={self.skylet_ssh_tunnel}')
+ f'\n\tis_grpc_enabled={self.is_grpc_enabled},')
 
  def get_cluster_name(self):
  return self.cluster_name
 
+ def get_cluster_name_on_cloud(self):
+ return self.cluster_name_on_cloud
+
  def _use_internal_ips(self):
  """Returns whether to use internal IPs for SSH connections."""
  # Directly load the `use_internal_ips` flag from the cluster yaml
@@ -2345,7 +1969,8 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  def _update_cluster_info(self):
  # When a cluster is on a cloud that does not support the new
  # provisioner, we should skip updating cluster_info.
- if (self.launched_resources.cloud.PROVISIONER_VERSION >=
+ if (self.launched_resources.cloud is not None and
+ self.launched_resources.cloud.PROVISIONER_VERSION >=
  clouds.ProvisionerVersion.SKYPILOT):
  provider_name = str(self.launched_resources.cloud).lower()
  config = {}
@@ -2643,64 +2268,199 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
2643
2268
  cluster_config_file)
2644
2269
  self.docker_user = docker_user
2645
2270
 
2271
+ def _get_skylet_ssh_tunnel(self) -> Optional[SSHTunnelInfo]:
2272
+ metadata = global_user_state.get_cluster_skylet_ssh_tunnel_metadata(
2273
+ self.cluster_name)
2274
+ if metadata is None:
2275
+ return None
2276
+ return SSHTunnelInfo(port=metadata[0], pid=metadata[1])
2277
+
2278
+ def _set_skylet_ssh_tunnel(self, tunnel: Optional[SSHTunnelInfo]) -> None:
2279
+ global_user_state.set_cluster_skylet_ssh_tunnel_metadata(
2280
+ self.cluster_name,
2281
+ (tunnel.port, tunnel.pid) if tunnel is not None else None)
2282
+
2283
+ def close_skylet_ssh_tunnel(self) -> None:
2284
+ """Terminate the SSH tunnel process and clear its metadata."""
2285
+ tunnel = self._get_skylet_ssh_tunnel()
2286
+ if tunnel is None:
2287
+ return
2288
+ logger.debug('Closing Skylet SSH tunnel for cluster %r on port %d',
2289
+ self.cluster_name, tunnel.port)
2290
+ try:
2291
+ self._terminate_ssh_tunnel_process(tunnel)
2292
+ finally:
2293
+ self._set_skylet_ssh_tunnel(None)
2294
+
2646
2295
  def get_grpc_channel(self) -> 'grpc.Channel':
2647
- if self.skylet_ssh_tunnel is None:
2648
- self.open_and_update_skylet_tunnel()
2649
- assert self.skylet_ssh_tunnel is not None
2650
- return grpc.insecure_channel(f'localhost:{self.skylet_ssh_tunnel.port}')
2296
+ grpc_options = [
2297
+ # The task YAMLs can be large, so the default
2298
+ # max_receive_message_length of 4MB might not be enough.
2299
+ ('grpc.max_receive_message_length', -1),
2300
+ ]
2301
+ # It's fine to not grab the lock here, as we're only reading,
2302
+ # and writes are very rare.
2303
+ # It's acceptable to read while another process is opening a tunnel,
2304
+ # because it will only happen on:
2305
+ # 1. A new cluster who has no tunnel yet, or
2306
+ # 2. A cluster with an unhealthy tunnel
2307
+ # For (2), for processes that read the "stale" tunnel, it will fail
2308
+ # and on the next retry, it will call get_grpc_channel again
2309
+ # and get the new tunnel.
2310
+ tunnel = self._get_skylet_ssh_tunnel()
2311
+ if tunnel is not None:
2312
+ if _is_tunnel_healthy(tunnel):
2313
+ return grpc.insecure_channel(f'localhost:{tunnel.port}',
2314
+ options=grpc_options)
2315
+ logger.debug('Failed to connect to SSH tunnel for cluster '
2316
+ f'{self.cluster_name!r} on port {tunnel.port}')
2317
+
2318
+ lock_id = backend_utils.cluster_tunnel_lock_id(self.cluster_name)
2319
+ remaining_timeout = backend_utils.CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS
2320
+ start_time = time.perf_counter()
2321
+ attempt = 1
2322
+
2323
+ def _get_remaining_timeout() -> float:
2324
+ return max(0.0,
2325
+ remaining_timeout - (time.perf_counter() - start_time))
2326
+
2327
+ while remaining_timeout > 0:
2328
+ logger.debug(
2329
+ 'Attempting to acquire exclusive lock for %s (attempt %d)',
2330
+ lock_id, attempt)
2331
+ exclusive_lock = locks.get_lock(lock_id, remaining_timeout)
2332
+ try:
2333
+ with exclusive_lock.acquire(blocking=False):
2334
+ wait_elapsed = time.perf_counter() - start_time
2335
+ logger.debug(f'Acquired exclusive lock for {lock_id} after '
2336
+ f'{wait_elapsed:.2f}s')
2337
+ try:
2338
+ tunnel = self._open_and_update_skylet_tunnel()
2339
+ return grpc.insecure_channel(f'localhost:{tunnel.port}',
2340
+ options=grpc_options)
2341
+ except Exception as e: # pylint: disable=broad-except
2342
+ # Failed to open tunnel, release the lock and retry.
2343
+ logger.warning(f'Failed to open tunnel for cluster '
2344
+ f'{self.cluster_name!r}: '
2345
+ f'{common_utils.format_exception(e)}')
2346
+ remaining_timeout = _get_remaining_timeout()
2347
+ attempt += 1
2348
+ continue
2349
+ except locks.LockTimeout:
2350
+ pass
2651
2351
 
2652
- def _cleanup_ssh_tunnel(self, tunnel_info: SSHTunnelInfo) -> None:
2653
- """Clean up an SSH tunnel by terminating the process."""
2352
+ remaining_timeout = _get_remaining_timeout()
2353
+ logger.debug(f'Could not acquire exclusive lock for {lock_id}, '
2354
+ f'waiting on shared lock (attempt {attempt})')
2355
+ try:
2356
+ # Use shared lock so that concurrent readers can
2357
+ # proceed in parallel.
2358
+ shared_lock = locks.get_lock(lock_id,
2359
+ remaining_timeout,
2360
+ shared_lock=True)
2361
+ # Wait for the exclusive lock to be released.
2362
+ shared_lock.acquire(blocking=True)
2363
+ # We only need the lock for signalling that the new tunnel has
2364
+ # been opened, not for checking the tunnel health.
2365
+ # Same reasoning as why we don't need to grab the lock in
2366
+ # the fast path at the start of this function.
2367
+ shared_lock.release()
2368
+ wait_elapsed = time.perf_counter() - start_time
2369
+ logger.debug(f'Acquired shared lock for {lock_id} after '
2370
+ f'{wait_elapsed:.2f}s')
2371
+ except locks.LockTimeout as e:
2372
+ raise RuntimeError(
2373
+ f'Failed to get gRPC channel for cluster '
2374
+ f'{self.cluster_name!r} due to a timeout when waiting '
2375
+ 'for the SSH tunnel to be opened. Please try again or '
2376
+ f'manually remove the lock at {lock_id}. '
2377
+ f'{common_utils.format_exception(e)}') from e
2378
+
2379
+ # Add small jitter before probing to smoothen the effects
2380
+ # of many readers waking up simultaneously.
2381
+ jitter = random.uniform(0.01, 0.05)
2382
+ time.sleep(jitter)
2383
+
2384
+ # Re-read the tunnel metadata and verify it's healthy.
2385
+ tunnel = self._get_skylet_ssh_tunnel()
2386
+ if tunnel is not None:
2387
+ if _is_tunnel_healthy(tunnel):
2388
+ return grpc.insecure_channel(f'localhost:{tunnel.port}',
2389
+ options=grpc_options)
2390
+ logger.debug('Failed to connect to SSH tunnel for cluster '
2391
+ f'{self.cluster_name!r} on port {tunnel.port}')
2392
+ # Tunnel is still unhealthy or missing, try again with updated
2393
+ # timeout. This could happen in the case where the thread who
2394
+ # held the exclusive lock to open the tunnel crashed.
2395
+ remaining_timeout = _get_remaining_timeout()
2396
+ attempt += 1
2397
+ raise RuntimeError('Timeout waiting for gRPC channel for cluster '
2398
+ f'{self.cluster_name!r} to be ready.')
2399
+
2400
+ def _terminate_ssh_tunnel_process(self, tunnel_info: SSHTunnelInfo) -> None:
2401
+ """Terminate the SSH tunnel process."""
2654
2402
  try:
2655
2403
  proc = psutil.Process(tunnel_info.pid)
2656
2404
  if proc.is_running() and proc.status() != psutil.STATUS_ZOMBIE:
2657
2405
  logger.debug(
2658
2406
  f'Terminating SSH tunnel process {tunnel_info.pid}')
2659
- proc.terminate()
2660
- try:
2661
- proc.wait(timeout=3)
2662
- except psutil.TimeoutExpired:
2663
- proc.kill()
2664
- proc.wait(timeout=1)
2407
+ subprocess_utils.kill_children_processes(proc.pid)
2665
2408
  except psutil.NoSuchProcess:
2666
2409
  pass
2667
2410
  except Exception as e: # pylint: disable=broad-except
2668
2411
  logger.warning(
2669
2412
  f'Failed to cleanup SSH tunnel process {tunnel_info.pid}: {e}')
2670
2413
 
2671
- def open_and_update_skylet_tunnel(self) -> None:
2414
+ def _open_and_update_skylet_tunnel(self) -> SSHTunnelInfo:
2672
2415
  """Opens an SSH tunnel to the Skylet on the head node,
2673
2416
  updates the cluster handle, and persists it to the database."""
2674
- local_port = common_utils.find_free_port(10000)
2675
- runners = self.get_command_runners()
2676
- head_runner = runners[0]
2677
- if isinstance(head_runner, command_runner.SSHCommandRunner):
2678
- # Disabling ControlMaster makes things easier to reason about
2679
- # with respect to resource management/ownership,
2680
- # as killing the process will close the tunnel too.
2681
- head_runner.disable_control_master = True
2682
-
2683
- cmd = head_runner.port_forward_command([(local_port,
2684
- constants.SKYLET_GRPC_PORT)])
2685
- ssh_tunnel_proc = subprocess.Popen(cmd)
2686
- tunnel_info = SSHTunnelInfo(port=local_port, pid=ssh_tunnel_proc.pid)
2417
+ max_attempts = 3
2418
+ # There could be a race condition here, as multiple processes may
2419
+ # attempt to open the same port at the same time.
2420
+ for attempt in range(max_attempts):
2421
+ runners = self.get_command_runners()
2422
+ head_runner = runners[0]
2423
+ local_port = random.randint(10000, 65535)
2424
+ try:
2425
+ ssh_tunnel_proc = backend_utils.open_ssh_tunnel(
2426
+ head_runner, (local_port, constants.SKYLET_GRPC_PORT))
2427
+ except exceptions.CommandError as e:
2428
+ # Don't retry if the error is due to timeout,
2429
+ # connection refused, Kubernetes pods not found,
2430
+ # or an in-progress termination.
2431
+ if (e.detailed_reason is not None and
2432
+ (backend_utils.SSH_CONNECTION_ERROR_PATTERN.search(
2433
+ e.detailed_reason) or
2434
+ backend_utils.K8S_PODS_NOT_FOUND_PATTERN.search(
2435
+ e.detailed_reason) or attempt == max_attempts - 1)):
2436
+ raise e
2437
+ logger.warning(
2438
+ f'Failed to open SSH tunnel on port {local_port} '
2439
+ f'({attempt + 1}/{max_attempts}). '
2440
+ f'{e.error_msg}\n{e.detailed_reason}')
2441
+ continue
2442
+ tunnel_info = SSHTunnelInfo(port=local_port,
2443
+ pid=ssh_tunnel_proc.pid)
2444
+ break
2445
+
2687
2446
  try:
2688
2447
  grpc.channel_ready_future(
2689
2448
  grpc.insecure_channel(f'localhost:{tunnel_info.port}')).result(
2690
2449
  timeout=constants.SKYLET_GRPC_TIMEOUT_SECONDS)
2691
2450
  # Clean up existing tunnel before setting up the new one.
2692
- if self.skylet_ssh_tunnel is not None:
2693
- self._cleanup_ssh_tunnel(self.skylet_ssh_tunnel)
2694
- self.skylet_ssh_tunnel = tunnel_info
2695
- global_user_state.update_cluster_handle(self.cluster_name, self)
2451
+ old_tunnel = self._get_skylet_ssh_tunnel()
2452
+ if old_tunnel is not None:
2453
+ self._terminate_ssh_tunnel_process(old_tunnel)
2454
+ self._set_skylet_ssh_tunnel(tunnel_info)
2455
+ return tunnel_info
2696
2456
  except grpc.FutureTimeoutError as e:
2697
- self._cleanup_ssh_tunnel(tunnel_info)
2457
+ self._terminate_ssh_tunnel_process(tunnel_info)
2698
2458
  logger.warning(
2699
2459
  f'Skylet gRPC channel for cluster {self.cluster_name} not '
2700
2460
  f'ready after {constants.SKYLET_GRPC_TIMEOUT_SECONDS}s')
2701
2461
  raise e
2702
2462
  except Exception as e:
2703
- self._cleanup_ssh_tunnel(tunnel_info)
2463
+ self._terminate_ssh_tunnel_process(tunnel_info)
2704
2464
  raise e
2705
2465
 
2706
2466
  @property
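The `get_grpc_channel` logic added in the hunk above coordinates many readers with a single tunnel-opener: probe the recorded tunnel without any lock, take an exclusive lock to (re)open it, and otherwise wait on the lock, add jitter, and re-probe until a deadline. Below is a minimal in-process sketch of that coordination scheme using a plain `threading.Lock`; the real code uses SkyPilot's `locks` module (with shared-lock support and timeouts) and persists tunnel metadata in the cluster database, so every name here is a stand-in.

```python
import random
import threading
import time
from typing import Optional

# Stand-ins for the per-cluster lock and the tunnel metadata that the real
# code persists in the global user state.
_tunnel_lock = threading.Lock()
_tunnel_port: Optional[int] = None


def _probe(port: Optional[int]) -> bool:
    # Placeholder health check; the real code does a TCP connect to the port.
    return port is not None


def _open_tunnel() -> int:
    # Placeholder for spawning the `ssh -L` process and waiting for readiness.
    time.sleep(0.1)
    return 12345


def get_channel_port(timeout: float = 10.0) -> int:
    """Fast-path read, exclusive open, or wait-and-re-probe."""
    global _tunnel_port
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        # Fast path: reading the recorded tunnel needs no lock.
        if _probe(_tunnel_port):
            return _tunnel_port
        # Try to become the single writer that (re)opens the tunnel.
        if _tunnel_lock.acquire(blocking=False):
            try:
                _tunnel_port = _open_tunnel()
                return _tunnel_port
            finally:
                _tunnel_lock.release()
        # Someone else is opening it: wait for them, add a little jitter so
        # all waiters do not re-probe at the same instant, then loop.
        with _tunnel_lock:
            pass
        time.sleep(random.uniform(0.01, 0.05))
    raise TimeoutError('Timed out waiting for the tunnel to come up.')


print(get_channel_port())
```

The real implementation uses a shared lock for the waiters so that concurrent readers can proceed in parallel once the writer finishes; a plain mutex is used here only to keep the sketch small.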
@@ -2713,6 +2473,12 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  def cluster_yaml(self, value: Optional[str]):
  self._cluster_yaml = value
 
+ @property
+ def instance_ids(self):
+ if self.cached_cluster_info is not None:
+ return self.cached_cluster_info.instance_ids()
+ return None
+
  @property
  def ssh_user(self):
  if self.cached_cluster_info is not None:
@@ -2752,6 +2518,13 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  """Returns whether this handle has gRPC enabled and gRPC flag is set."""
  return env_options.Options.ENABLE_GRPC.get() and self.is_grpc_enabled
 
+ def __getstate__(self):
+ state = self.__dict__.copy()
+ # For backwards compatibility. Refer to
+ # https://github.com/skypilot-org/skypilot/pull/7133
+ state.setdefault('skylet_ssh_tunnel', None)
+ return state
+
  def __setstate__(self, state):
  self._version = self._VERSION
 
@@ -2809,6 +2582,10 @@ class CloudVmRayResourceHandle(backends.backend.ResourceHandle):
  state['is_grpc_enabled'] = False
  state['skylet_ssh_tunnel'] = None
 
+ if version >= 12:
+ # DEPRECATED in favor of skylet_ssh_tunnel_metadata column in the DB
+ state.pop('skylet_ssh_tunnel', None)
+
  self.__dict__.update(state)
 
  # Because the update_cluster_ips and update_ssh_ports
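The two hunks above rely on the versioned-pickle pattern: `_VERSION` is bumped when fields change, `__getstate__` keeps deprecated fields so older readers can still unpickle the handle, and `__setstate__` upgrades or drops fields coming from older payloads. A generic sketch of the pattern with made-up class and field names, assuming nothing about the real handle:

```python
import pickle


class Handle:
    """Toy pickle-versioned handle; names are illustrative only."""
    # Bump when fields change; __setstate__ upgrades older payloads.
    _VERSION = 2

    def __init__(self, name: str):
        self._version = self._VERSION
        self.name = name
        self.is_grpc_enabled = True  # field introduced in version 2

    def __getstate__(self):
        state = self.__dict__.copy()
        # Keep a field that older readers still expect to find when they
        # unpickle this object (forward compatibility).
        state.setdefault('legacy_field', None)
        return state

    def __setstate__(self, state):
        version = state.get('_version', 1)
        if version < 2:
            # Field did not exist in version 1 pickles; supply a default.
            state['is_grpc_enabled'] = False
        # No longer stored on the live object.
        state.pop('legacy_field', None)
        state['_version'] = self._VERSION
        self.__dict__.update(state)


restored = pickle.loads(pickle.dumps(Handle('my-cluster')))
print(restored.is_grpc_enabled, restored._version)  # True 2
```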
@@ -2886,21 +2663,180 @@ class SkyletClient:
2886
2663
 
2887
2664
  def __init__(self, channel: 'grpc.Channel'):
2888
2665
  self._autostop_stub = autostopv1_pb2_grpc.AutostopServiceStub(channel)
2666
+ self._jobs_stub = jobsv1_pb2_grpc.JobsServiceStub(channel)
2667
+ self._serve_stub = servev1_pb2_grpc.ServeServiceStub(channel)
2668
+ self._managed_jobs_stub = (
2669
+ managed_jobsv1_pb2_grpc.ManagedJobsServiceStub(channel))
2889
2670
 
2890
2671
  def set_autostop(
2891
2672
  self,
2892
2673
  request: 'autostopv1_pb2.SetAutostopRequest',
2893
- timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2674
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2894
2675
  ) -> 'autostopv1_pb2.SetAutostopResponse':
2895
2676
  return self._autostop_stub.SetAutostop(request, timeout=timeout)
2896
2677
 
2897
2678
  def is_autostopping(
2898
2679
  self,
2899
2680
  request: 'autostopv1_pb2.IsAutostoppingRequest',
2900
- timeout: float = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2681
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2901
2682
  ) -> 'autostopv1_pb2.IsAutostoppingResponse':
2902
2683
  return self._autostop_stub.IsAutostopping(request, timeout=timeout)
2903
2684
 
2685
+ def add_job(
2686
+ self,
2687
+ request: 'jobsv1_pb2.AddJobRequest',
2688
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2689
+ ) -> 'jobsv1_pb2.AddJobResponse':
2690
+ return self._jobs_stub.AddJob(request, timeout=timeout)
2691
+
2692
+ def queue_job(
2693
+ self,
2694
+ request: 'jobsv1_pb2.QueueJobRequest',
2695
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2696
+ ) -> 'jobsv1_pb2.QueueJobResponse':
2697
+ return self._jobs_stub.QueueJob(request, timeout=timeout)
2698
+
2699
+ def update_status(
2700
+ self,
2701
+ request: 'jobsv1_pb2.UpdateStatusRequest',
2702
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2703
+ ) -> 'jobsv1_pb2.UpdateStatusResponse':
2704
+ return self._jobs_stub.UpdateStatus(request, timeout=timeout)
2705
+
2706
+ def get_job_queue(
2707
+ self,
2708
+ request: 'jobsv1_pb2.GetJobQueueRequest',
2709
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2710
+ ) -> 'jobsv1_pb2.GetJobQueueResponse':
2711
+ return self._jobs_stub.GetJobQueue(request, timeout=timeout)
2712
+
2713
+ def cancel_jobs(
2714
+ self,
2715
+ request: 'jobsv1_pb2.CancelJobsRequest',
2716
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2717
+ ) -> 'jobsv1_pb2.CancelJobsResponse':
2718
+ return self._jobs_stub.CancelJobs(request, timeout=timeout)
2719
+
2720
+ def fail_all_in_progress_jobs(
2721
+ self,
2722
+ request: 'jobsv1_pb2.FailAllInProgressJobsRequest',
2723
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2724
+ ) -> 'jobsv1_pb2.FailAllInProgressJobsResponse':
2725
+ return self._jobs_stub.FailAllInProgressJobs(request, timeout=timeout)
2726
+
2727
+ def get_job_status(
2728
+ self,
2729
+ request: 'jobsv1_pb2.GetJobStatusRequest',
2730
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2731
+ ) -> 'jobsv1_pb2.GetJobStatusResponse':
2732
+ return self._jobs_stub.GetJobStatus(request, timeout=timeout)
2733
+
2734
+ def get_job_submitted_timestamp(
2735
+ self,
2736
+ request: 'jobsv1_pb2.GetJobSubmittedTimestampRequest',
2737
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2738
+ ) -> 'jobsv1_pb2.GetJobSubmittedTimestampResponse':
2739
+ return self._jobs_stub.GetJobSubmittedTimestamp(request,
2740
+ timeout=timeout)
2741
+
2742
+ def get_job_ended_timestamp(
2743
+ self,
2744
+ request: 'jobsv1_pb2.GetJobEndedTimestampRequest',
2745
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2746
+ ) -> 'jobsv1_pb2.GetJobEndedTimestampResponse':
2747
+ return self._jobs_stub.GetJobEndedTimestamp(request, timeout=timeout)
2748
+
2749
+ def get_log_dirs_for_jobs(
2750
+ self,
2751
+ request: 'jobsv1_pb2.GetLogDirsForJobsRequest',
2752
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2753
+ ) -> 'jobsv1_pb2.GetLogDirsForJobsResponse':
2754
+ return self._jobs_stub.GetLogDirsForJobs(request, timeout=timeout)
2755
+
2756
+ def tail_logs(
2757
+ self,
2758
+ request: 'jobsv1_pb2.TailLogsRequest',
2759
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2760
+ ) -> Iterator['jobsv1_pb2.TailLogsResponse']:
2761
+ return self._jobs_stub.TailLogs(request, timeout=timeout)
2762
+
2763
+ def get_service_status(
2764
+ self,
2765
+ request: 'servev1_pb2.GetServiceStatusRequest',
2766
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2767
+ ) -> 'servev1_pb2.GetServiceStatusResponse':
2768
+ return self._serve_stub.GetServiceStatus(request, timeout=timeout)
2769
+
2770
+ def add_serve_version(
2771
+ self,
2772
+ request: 'servev1_pb2.AddVersionRequest',
2773
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2774
+ ) -> 'servev1_pb2.AddVersionResponse':
2775
+ return self._serve_stub.AddVersion(request, timeout=timeout)
2776
+
2777
+ def terminate_services(
2778
+ self,
2779
+ request: 'servev1_pb2.TerminateServicesRequest',
2780
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2781
+ ) -> 'servev1_pb2.TerminateServicesResponse':
2782
+ return self._serve_stub.TerminateServices(request, timeout=timeout)
2783
+
2784
+ def terminate_replica(
2785
+ self,
2786
+ request: 'servev1_pb2.TerminateReplicaRequest',
2787
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2788
+ ) -> 'servev1_pb2.TerminateReplicaResponse':
2789
+ return self._serve_stub.TerminateReplica(request, timeout=timeout)
2790
+
2791
+ def wait_service_registration(
2792
+ self,
2793
+ request: 'servev1_pb2.WaitServiceRegistrationRequest',
2794
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2795
+ ) -> 'servev1_pb2.WaitServiceRegistrationResponse':
2796
+ # set timeout to at least 10 seconds more than service register
2797
+ # constant to make sure that timeouts will not occur.
2798
+ if timeout is not None:
2799
+ timeout = max(timeout,
2800
+ serve_constants.SERVICE_REGISTER_TIMEOUT_SECONDS + 10)
2801
+ return self._serve_stub.WaitServiceRegistration(request,
2802
+ timeout=timeout)
2803
+
2804
+ def update_service(
2805
+ self,
2806
+ request: 'servev1_pb2.UpdateServiceRequest',
2807
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2808
+ ) -> 'servev1_pb2.UpdateServiceResponse':
2809
+ return self._serve_stub.UpdateService(request, timeout=timeout)
2810
+
2811
+ def get_managed_job_controller_version(
2812
+ self,
2813
+ request: 'managed_jobsv1_pb2.GetVersionRequest',
2814
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2815
+ ) -> 'managed_jobsv1_pb2.GetVersionResponse':
2816
+ return self._managed_jobs_stub.GetVersion(request, timeout=timeout)
2817
+
2818
+ def get_managed_job_table(
2819
+ self,
2820
+ request: 'managed_jobsv1_pb2.GetJobTableRequest',
2821
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2822
+ ) -> 'managed_jobsv1_pb2.GetJobTableResponse':
2823
+ return self._managed_jobs_stub.GetJobTable(request, timeout=timeout)
2824
+
2825
+ def get_all_managed_job_ids_by_name(
2826
+ self,
2827
+ request: 'managed_jobsv1_pb2.GetAllJobIdsByNameRequest',
2828
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2829
+ ) -> 'managed_jobsv1_pb2.GetAllJobIdsByNameResponse':
2830
+ return self._managed_jobs_stub.GetAllJobIdsByName(request,
2831
+ timeout=timeout)
2832
+
2833
+ def cancel_managed_jobs(
2834
+ self,
2835
+ request: 'managed_jobsv1_pb2.CancelJobsRequest',
2836
+ timeout: Optional[float] = constants.SKYLET_GRPC_TIMEOUT_SECONDS
2837
+ ) -> 'managed_jobsv1_pb2.CancelJobsResponse':
2838
+ return self._managed_jobs_stub.CancelJobs(request, timeout=timeout)
2839
+
2904
2840
 
2905
2841
  @registry.BACKEND_REGISTRY.type_register(name='cloudvmray')
2906
2842
  class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
@@ -2931,6 +2867,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2931
2867
  self._requested_features = set()
2932
2868
  self._dump_final_script = False
2933
2869
  self._is_managed = False
2870
+ # Optional planner (via register_info): used under the per-cluster lock
2871
+ # to produce a fresh concrete plan when neither a reusable snapshot nor
2872
+ # a caller plan is available.
2873
+ self._planner = None
2934
2874
 
2935
2875
  # Command for running the setup script. It is only set when the
2936
2876
  # setup needs to be run outside the self._setup() and as part of
@@ -2948,6 +2888,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
2948
2888
  self._requested_features)
2949
2889
  self._dump_final_script = kwargs.pop('dump_final_script', False)
2950
2890
  self._is_managed = kwargs.pop('is_managed', False)
2891
+ # Optional planner callback for a fresh plan under lock when no
2892
+ # reusable snapshot/caller plan exists. Keeps optimizer in upper layer.
2893
+ self._planner = kwargs.pop('planner', self._planner)
2951
2894
  assert not kwargs, f'Unexpected kwargs: {kwargs}'
2952
2895
 
2953
2896
  def check_resources_fit_cluster(
@@ -2974,9 +2917,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  # Usage Collection:
  usage_lib.messages.usage.update_cluster_resources(
  handle.launched_nodes, launched_resources)
- record = global_user_state.get_cluster_from_name(cluster_name)
- if record is not None:
- usage_lib.messages.usage.update_cluster_status(record['status'])
+ status = global_user_state.get_status_from_cluster_name(cluster_name)
+ if status is not None:
+ usage_lib.messages.usage.update_cluster_status(status)
 
  assert launched_resources.region is not None, handle
 
@@ -3115,7 +3058,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3115
3058
  colorama.Style.RESET_ALL +
3116
3059
  colorama.Style.DIM +
3117
3060
  'Check concurrent requests: ' +
3118
- 'sky api status '))
3061
+ 'sky api status -v | grep '
3062
+ f'{cluster_name}'))
3119
3063
 
3120
3064
  def _locked_provision(
3121
3065
  self,
@@ -3172,8 +3116,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3172
3116
  try:
3173
3117
  retry_provisioner = RetryingVmProvisioner(
3174
3118
  self.log_dir,
3175
- self._dag,
3176
- self._optimize_target,
3119
+ self._dag, # type: ignore[arg-type]
3120
+ self._optimize_target, # type: ignore[arg-type]
3177
3121
  self._requested_features,
3178
3122
  local_wheel_path,
3179
3123
  wheel_hash,
@@ -3204,9 +3148,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3204
3148
  gap_seconds = _RETRY_UNTIL_UP_INIT_GAP_SECONDS
3205
3149
  retry_message = ux_utils.retry_message(
3206
3150
  f'Retry after {gap_seconds:.0f}s ')
3207
- hint_message = (f'\n{retry_message} '
3208
- f'{ux_utils.log_path_hint(log_path)}'
3209
- f'{colorama.Style.RESET_ALL}')
3151
+ hint_message = (
3152
+ f'\n{retry_message} '
3153
+ f'{ux_utils.provision_hint(cluster_name)}'
3154
+ f'{colorama.Style.RESET_ALL}')
3210
3155
 
3211
3156
  # Add cluster event for retry.
3212
3157
  global_user_state.add_cluster_event(
@@ -3235,7 +3180,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3235
3180
  logger.error(
3236
3181
  ux_utils.error_message(
3237
3182
  'Failed to provision resources. '
3238
- f'{ux_utils.log_path_hint(log_path)}'))
3183
+ f'{ux_utils.provision_hint(cluster_name)}'))
3239
3184
  error_message += (
3240
3185
  '\nTo keep retrying until the cluster is up, use '
3241
3186
  'the `--retry-until-up` flag.')
@@ -3244,8 +3189,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3244
3189
  error_message + '\n' + str(e),
3245
3190
  failover_history=e.failover_history) from None
3246
3191
  if dryrun:
3247
- record = global_user_state.get_cluster_from_name(cluster_name)
3248
- return record['handle'] if record is not None else None, False
3192
+ handle = global_user_state.get_handle_from_cluster_name(
3193
+ cluster_name)
3194
+ return handle if handle is not None else None, False
3249
3195
 
3250
3196
  if config_dict['provisioning_skipped']:
3251
3197
  # Skip further provisioning.
@@ -3253,10 +3199,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3253
3199
  # ('handle', 'provision_record', 'resources_vars')
3254
3200
  # We need to return the handle - but it should be the existing
3255
3201
  # handle for the cluster.
3256
- record = global_user_state.get_cluster_from_name(cluster_name)
3257
- assert record is not None and record['handle'] is not None, (
3258
- cluster_name, record)
3259
- return record['handle'], True
3202
+ handle = global_user_state.get_handle_from_cluster_name(
3203
+ cluster_name)
3204
+ assert handle is not None, (cluster_name, handle)
3205
+ return handle, True
3260
3206
 
3261
3207
  if 'provision_record' in config_dict:
3262
3208
  # New provisioner is used here.
@@ -3279,7 +3225,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3279
3225
  global_user_state.ClusterEventType.STATUS_CHANGE)
3280
3226
 
3281
3227
  cluster_info = provisioner.post_provision_runtime_setup(
3282
- repr(handle.launched_resources.cloud),
3228
+ handle.launched_resources,
3283
3229
  resources_utils.ClusterName(handle.cluster_name,
3284
3230
  handle.cluster_name_on_cloud),
3285
3231
  handle.cluster_yaml,
@@ -3293,6 +3239,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3293
3239
  # manually or by the cloud provider.
3294
3240
  # Optimize the case where the cluster's IPs can be retrieved
3295
3241
  # from cluster_info.
3242
+ handle.cached_cluster_info = cluster_info
3296
3243
  handle.docker_user = cluster_info.docker_user
3297
3244
  handle.update_cluster_ips(max_attempts=_FETCH_IP_MAX_ATTEMPTS,
3298
3245
  cluster_info=cluster_info)
@@ -3304,7 +3251,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3304
3251
 
3305
3252
  self._update_after_cluster_provisioned(
3306
3253
  handle, to_provision_config.prev_handle, task,
3307
- prev_cluster_status, lock_id, config_hash)
3254
+ prev_cluster_status, config_hash)
3308
3255
  return handle, False
3309
3256
 
3310
3257
  cluster_config_file = config_dict['ray']
@@ -3376,7 +3323,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3376
3323
 
3377
3324
  self._update_after_cluster_provisioned(
3378
3325
  handle, to_provision_config.prev_handle, task,
3379
- prev_cluster_status, lock_id, config_hash)
3326
+ prev_cluster_status, config_hash)
3380
3327
  return handle, False
3381
3328
 
3382
3329
  def _open_ports(self, handle: CloudVmRayResourceHandle) -> None:
@@ -3394,7 +3341,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3394
3341
  prev_handle: Optional[CloudVmRayResourceHandle],
3395
3342
  task: task_lib.Task,
3396
3343
  prev_cluster_status: Optional[status_lib.ClusterStatus],
3397
- lock_id: str, config_hash: str) -> None:
3344
+ config_hash: str) -> None:
3398
3345
  usage_lib.messages.usage.update_cluster_resources(
3399
3346
  handle.launched_nodes, handle.launched_resources)
3400
3347
  usage_lib.messages.usage.update_final_cluster_status(
@@ -3406,16 +3353,26 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3406
3353
  # update_status will query the ray job status for all INIT /
3407
3354
  # PENDING / RUNNING jobs for the real status, since we do not
3408
3355
  # know the actual previous status of the cluster.
3409
- cmd = job_lib.JobLibCodeGen.update_status()
3410
3356
  logger.debug('Update job queue on remote cluster.')
3411
3357
  with rich_utils.safe_status(
3412
3358
  ux_utils.spinner_message('Preparing SkyPilot runtime')):
3413
- returncode, _, stderr = self.run_on_head(handle,
3414
- cmd,
3415
- require_outputs=True)
3416
- subprocess_utils.handle_returncode(returncode, cmd,
3417
- 'Failed to update job status.',
3418
- stderr)
3359
+ use_legacy = not handle.is_grpc_enabled_with_flag
3360
+
3361
+ if not use_legacy:
3362
+ try:
3363
+ request = jobsv1_pb2.UpdateStatusRequest()
3364
+ backend_utils.invoke_skylet_with_retries(
3365
+ lambda: SkyletClient(handle.get_grpc_channel()
3366
+ ).update_status(request))
3367
+ except exceptions.SkyletMethodNotImplementedError:
3368
+ use_legacy = True
3369
+
3370
+ if use_legacy:
3371
+ cmd = job_lib.JobLibCodeGen.update_status()
3372
+ returncode, _, stderr = self.run_on_head(
3373
+ handle, cmd, require_outputs=True)
3374
+ subprocess_utils.handle_returncode(
3375
+ returncode, cmd, 'Failed to update job status.', stderr)
3419
3376
  if prev_cluster_status == status_lib.ClusterStatus.STOPPED:
3420
3377
  # Safely set all the previous jobs to FAILED since the cluster
3421
3378
  # is restarted
@@ -3423,14 +3380,25 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3423
3380
  # 1. A job finishes RUNNING, but right before it update itself
3424
3381
  # to SUCCEEDED, the cluster is STOPPED by `sky stop`.
3425
3382
  # 2. On next `sky start`, it gets reset to FAILED.
3426
- cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
3427
- returncode, stdout, stderr = self.run_on_head(handle,
3428
- cmd,
3429
- require_outputs=True)
3430
- subprocess_utils.handle_returncode(
3431
- returncode, cmd,
3432
- 'Failed to set previously in-progress jobs to FAILED',
3433
- stdout + stderr)
3383
+ use_legacy = not handle.is_grpc_enabled_with_flag
3384
+
3385
+ if not use_legacy:
3386
+ try:
3387
+ fail_request = jobsv1_pb2.FailAllInProgressJobsRequest()
3388
+ backend_utils.invoke_skylet_with_retries(
3389
+ lambda: SkyletClient(handle.get_grpc_channel(
3390
+ )).fail_all_in_progress_jobs(fail_request))
3391
+ except exceptions.SkyletMethodNotImplementedError:
3392
+ use_legacy = True
3393
+
3394
+ if use_legacy:
3395
+ cmd = job_lib.JobLibCodeGen.fail_all_jobs_in_progress()
3396
+ returncode, stdout, stderr = self.run_on_head(
3397
+ handle, cmd, require_outputs=True)
3398
+ subprocess_utils.handle_returncode(
3399
+ returncode, cmd,
3400
+ 'Failed to set previously in-progress jobs to FAILED',
3401
+ stdout + stderr)
3434
3402
 
3435
3403
  prev_ports = None
3436
3404
  if prev_handle is not None:
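The two hunks above apply the compatibility pattern that recurs throughout this file: prefer the Skylet gRPC call, and fall back to the legacy generated-code-over-SSH path when the remote daemon predates the RPC (signalled by `exceptions.SkyletMethodNotImplementedError`). A schematic sketch of that control flow; the exception class and both calls below are stand-ins, not real SkyPilot APIs:

```python
class NotImplementedOnRemoteError(Exception):
    """Stand-in for exceptions.SkyletMethodNotImplementedError."""


def grpc_update_status() -> None:
    # Pretend the remote Skylet predates this RPC.
    raise NotImplementedOnRemoteError()


def legacy_update_status_over_ssh() -> None:
    print('fell back to generated code executed over SSH')


def update_status(grpc_enabled: bool) -> None:
    use_legacy = not grpc_enabled
    if not use_legacy:
        try:
            grpc_update_status()
            return
        except NotImplementedOnRemoteError:
            # Older remote runtime: quietly downgrade to the legacy path.
            use_legacy = True
    if use_legacy:
        legacy_update_status_over_ssh()


update_status(grpc_enabled=True)
```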
@@ -3485,8 +3453,6 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3485
3453
  handle.cached_external_ssh_ports, handle.docker_user,
3486
3454
  handle.ssh_user)
3487
3455
 
3488
- locks.get_lock(lock_id).force_unlock()
3489
-
3490
3456
  def _sync_workdir(self, handle: CloudVmRayResourceHandle,
3491
3457
  workdir: Union[Path, Dict[str, Any]],
3492
3458
  envs_and_secrets: Dict[str, str]) -> None:
@@ -3618,6 +3584,16 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
  self._set_storage_mounts_metadata(handle.cluster_name,
  storage_mounts)
 
+ def _get_num_gpus(self, task: task_lib.Task) -> int:
+ if task.resources is not None:
+ for resource in task.resources:
+ if (resource.accelerators is not None and
+ isinstance(resource.accelerators, dict)):
+ if len(resource.accelerators) > 0:
+ return math.ceil(
+ list(resource.accelerators.values())[0])
+ return 0
+
  def _setup(self, handle: CloudVmRayResourceHandle, task: task_lib.Task,
  detach_setup: bool) -> None:
  start = time.time()
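`_get_num_gpus`, added just above, reports the per-node accelerator count of the first task resource that specifies one, rounded up for fractional requests. A dict-based illustration of the same lookup (the real code iterates `sky.Resources` objects, not plain dicts):

```python
import math
from typing import Dict, Iterable, Optional


def num_gpus_per_node(resources: Iterable[Optional[Dict[str, float]]]) -> int:
    """Rounded-up accelerator count of the first entry that has one."""
    for accelerators in resources:
        if accelerators:
            # e.g. {'A100': 0.5} -> 1; {'H100': 8} -> 8
            return math.ceil(next(iter(accelerators.values())))
    return 0


print(num_gpus_per_node([None, {'A100': 0.5}]))  # -> 1
```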
@@ -3630,13 +3606,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3630
3606
  remote_setup_file_name = f'/tmp/sky_setup_{self.run_timestamp}'
3631
3607
  # Need this `-i` option to make sure `source ~/.bashrc` work
3632
3608
  setup_cmd = f'/bin/bash -i {remote_setup_file_name} 2>&1'
3609
+ unset_ray_env_vars = ' && '.join(
3610
+ [f'unset {var}' for var in task_codegen.UNSET_RAY_ENV_VARS])
3611
+ setup_cmd = f'{unset_ray_env_vars}; {setup_cmd}'
3633
3612
  runners = handle.get_command_runners(avoid_ssh_control=True)
3634
3613
 
3635
3614
  def _setup_node(node_id: int) -> None:
3636
- setup_envs = task.envs_and_secrets
3615
+ setup_envs = task_lib.get_plaintext_envs_and_secrets(
3616
+ task.envs_and_secrets)
3637
3617
  setup_envs.update(self._skypilot_predefined_env_vars(handle))
3638
3618
  setup_envs['SKYPILOT_SETUP_NODE_IPS'] = '\n'.join(internal_ips)
3639
3619
  setup_envs['SKYPILOT_SETUP_NODE_RANK'] = str(node_id)
3620
+ setup_envs[constants.SKYPILOT_SETUP_NUM_GPUS_PER_NODE] = (str(
3621
+ self._get_num_gpus(task)))
3622
+
3640
3623
  runner = runners[node_id]
3641
3624
  setup_script = log_lib.make_task_bash_script(setup,
3642
3625
  env_vars=setup_envs)
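The hunk above prefixes the setup invocation with a chain of `unset` commands so Ray-related variables inherited from the driver environment do not leak into user setup scripts. A small sketch of how such a command string is assembled; the variable list is an assumed example, the real one lives in `task_codegen.UNSET_RAY_ENV_VARS`:

```python
from typing import List

# Assumed example values; not the authoritative list.
UNSET_RAY_ENV_VARS: List[str] = ['RAY_ADDRESS', 'RAY_JOB_ID']


def build_setup_cmd(remote_setup_file: str) -> str:
    """Prefix the setup invocation so inherited Ray variables do not leak in."""
    unset_prefix = ' && '.join(f'unset {var}' for var in UNSET_RAY_ENV_VARS)
    # `-i` keeps interactive init files (e.g. ~/.bashrc) in effect.
    return f'{unset_prefix}; /bin/bash -i {remote_setup_file} 2>&1'


print(build_setup_cmd('/tmp/sky_setup_20240101'))
# unset RAY_ADDRESS && unset RAY_JOB_ID; /bin/bash -i /tmp/sky_setup_20240101 2>&1
```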
@@ -3693,29 +3676,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3693
3676
 
3694
3677
  returncode = _run_setup(f'{create_script_code} && {setup_cmd}',)
3695
3678
 
3696
- def _load_setup_log_and_match(match_str: str) -> bool:
3697
- try:
3698
- with open(os.path.expanduser(setup_log_path),
3699
- 'r',
3700
- encoding='utf-8') as f:
3701
- return match_str.lower() in f.read().lower()
3702
- except Exception as e: # pylint: disable=broad-except
3703
- # We don't crash the setup if we cannot read the log file.
3704
- # Instead, we should retry the setup with dumping the script
3705
- # to a file to be safe.
3706
- logger.debug(
3707
- f'Failed to read setup log file {setup_log_path}: {e}')
3708
- return True
3709
-
3710
- if ((returncode == 255 and _load_setup_log_and_match('too long')) or
3711
- (returncode == 1 and
3712
- _load_setup_log_and_match('request-uri too large'))):
3713
- # If the setup script is too long, we retry it with dumping
3714
- # the script to a file and running it with SSH. We use a
3715
- # general length limit check before but it could be
3716
- # inaccurate on some systems.
3717
- # When there is a cloudflare proxy in front of the remote, it
3718
- # could cause `414 Request-URI Too Large` error.
3679
+ if _is_message_too_long(returncode, file_path=setup_log_path):
3680
+ # If the setup script is too long, we need to retry it
3681
+ # with dumping the script to a file and running it the script
3682
+ # on remote cluster instead.
3719
3683
  logger.debug('Failed to run setup command inline due to '
3720
3684
  'command length limit. Dumping setup script to '
3721
3685
  'file and running it with SSH.')
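The inline log matching removed above is folded into a `_is_message_too_long` helper whose definition is not part of this hunk. A plausible reconstruction from the deleted logic, shown only to make the retry condition concrete; the signature and behavior are inferred, not taken from the package:

```python
import os
from typing import Optional


def is_message_too_long(returncode: int,
                        output: Optional[str] = None,
                        file_path: Optional[str] = None) -> bool:
    """Best-effort guess that a command failed because it was too long.

    Exit code 255 with 'too long' typically means the shell/SSH rejected an
    oversized argv; exit code 1 with 'Request-URI Too Large' shows up when a
    proxy (e.g. Cloudflare) sits in front of the remote endpoint.
    """
    if output is None:
        if file_path is None:
            return False
        try:
            with open(os.path.expanduser(file_path), 'r',
                      encoding='utf-8') as f:
                output = f.read()
        except OSError:
            # Cannot read the log: err on the side of retrying via a file.
            return True
    text = output.lower()
    return ((returncode == 255 and 'too long' in text) or
            (returncode == 1 and 'request-uri too large' in text))


print(is_message_too_long(255, output='Argument list too long'))  # True
```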
@@ -3779,119 +3743,180 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3779
3743
  logger.info(
3780
3744
  ux_utils.finishing_message('Setup completed.', setup_log_path))
3781
3745
 
3746
+ def _download_file(self, handle: CloudVmRayResourceHandle,
3747
+ local_file_path: str, remote_file_path: str) -> None:
3748
+ """Syncs file from remote to local."""
3749
+ runners = handle.get_command_runners()
3750
+ head_runner = runners[0]
3751
+ head_runner.rsync(
3752
+ source=local_file_path,
3753
+ target=remote_file_path,
3754
+ up=False,
3755
+ stream_logs=False,
3756
+ )
3757
+
3782
3758
  def _exec_code_on_head(
3783
3759
  self,
3784
3760
  handle: CloudVmRayResourceHandle,
3785
3761
  codegen: str,
3786
3762
  job_id: int,
3787
- detach_run: bool = False,
3788
3763
  managed_job_dag: Optional['dag.Dag'] = None,
3764
+ managed_job_user_id: Optional[str] = None,
3789
3765
  remote_log_dir: Optional[str] = None,
3790
3766
  ) -> None:
3791
3767
  """Executes generated code on the head node."""
3792
- script_path = os.path.join(SKY_REMOTE_APP_DIR, f'sky_job_{job_id}')
3768
+ use_legacy = not handle.is_grpc_enabled_with_flag
3769
+ file_name = f'sky_job_{job_id}'
3770
+ script_path = os.path.join(SKY_REMOTE_APP_DIR, file_name)
3793
3771
  if remote_log_dir is None:
3794
3772
  remote_log_dir = self.log_dir
3795
3773
  remote_log_path = os.path.join(remote_log_dir, 'run.log')
3796
3774
 
3797
- cd = f'cd {SKY_REMOTE_WORKDIR}'
3775
+ def _dump_code_to_file(codegen: str,
3776
+ target_dir: str = SKY_REMOTE_APP_DIR) -> None:
3777
+ runners = handle.get_command_runners()
3778
+ head_runner = runners[0]
3779
+ with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
3780
+ fp.write(codegen)
3781
+ fp.flush()
3782
+ script_path = os.path.join(target_dir, file_name)
3783
+ # We choose to sync code + exec, because the alternative of
3784
+ # 'ray submit' may not work as it may use system python
3785
+ # (python2) to execute the script. Happens for AWS.
3786
+ head_runner.rsync(source=fp.name,
3787
+ target=script_path,
3788
+ up=True,
3789
+ stream_logs=False)
3798
3790
 
3791
+ cd = f'cd {SKY_REMOTE_WORKDIR}'
3799
3792
  mkdir_code = (f'{cd} && mkdir -p {remote_log_dir} && '
3800
3793
  f'touch {remote_log_path}')
3801
3794
  encoded_script = shlex.quote(codegen)
3802
3795
  create_script_code = f'{{ echo {encoded_script} > {script_path}; }}'
3803
3796
  job_submit_cmd = (
3804
- # JOB_CMD_IDENTIFIER is used for identifying the process retrieved
3805
- # with pid is the same driver process.
3797
+ # JOB_CMD_IDENTIFIER is used for identifying the process
3798
+ # retrieved with pid is the same driver process.
3806
3799
  f'{job_lib.JOB_CMD_IDENTIFIER.format(job_id)} && '
3807
3800
  f'{cd} && {constants.SKY_PYTHON_CMD} -u {script_path}'
3808
3801
  # Do not use &>, which is not POSIX and may not work.
3809
3802
  # Note that the order of ">filename 2>&1" matters.
3810
3803
  f'> {remote_log_path} 2>&1')
3811
-
3812
3804
  code = job_lib.JobLibCodeGen.queue_job(job_id, job_submit_cmd)
3813
3805
  job_submit_cmd = ' && '.join([mkdir_code, create_script_code, code])
3814
3806
 
3815
- def _dump_code_to_file(codegen: str,
3816
- target_dir: str = SKY_REMOTE_APP_DIR) -> None:
3817
- runners = handle.get_command_runners()
3818
- head_runner = runners[0]
3819
- with tempfile.NamedTemporaryFile('w', prefix='sky_app_') as fp:
3820
- fp.write(codegen)
3821
- fp.flush()
3822
- script_path = os.path.join(target_dir, f'sky_job_{job_id}')
3823
- # We choose to sync code + exec, because the alternative of 'ray
3824
- # submit' may not work as it may use system python (python2) to
3825
- # execute the script. Happens for AWS.
3826
- head_runner.rsync(source=fp.name,
3827
- target=script_path,
3828
- up=True,
3829
- stream_logs=False)
3830
-
3831
3807
  # Should also be ealier than _is_command_length_over_limit
3832
3808
  # Same reason as in _setup
3833
3809
  if self._dump_final_script:
3834
3810
  _dump_code_to_file(job_submit_cmd,
3835
3811
  constants.PERSISTENT_RUN_SCRIPT_DIR)
3836
3812
 
3837
- if _is_command_length_over_limit(job_submit_cmd):
3838
- _dump_code_to_file(codegen)
3839
- job_submit_cmd = f'{mkdir_code} && {code}'
3840
-
3841
- def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
3842
- if managed_job_dag is not None:
3843
- # Add the managed job to job queue database.
3844
- managed_job_codegen = managed_jobs.ManagedJobCodeGen()
3845
- managed_job_code = managed_job_codegen.set_pending(
3846
- job_id,
3847
- managed_job_dag,
3848
- skypilot_config.get_active_workspace(
3849
- force_user_workspace=True),
3850
- entrypoint=common_utils.get_current_command())
3851
- # Set the managed job to PENDING state to make sure that this
3852
- # managed job appears in the `sky jobs queue`, even if it needs
3853
- # to wait to be submitted.
3854
- # We cannot set the managed job to PENDING state in the job
3855
- # template (jobs-controller.yaml.j2), as it may need to wait for
3856
- # the run commands to be scheduled on the job controller in
3857
- # high-load cases.
3858
- job_submit_cmd += ' && ' + managed_job_code
3859
- return job_submit_cmd
3860
-
3861
- job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
3813
+ if not use_legacy:
3814
+ try:
3815
+ managed_job_info: Optional[jobsv1_pb2.ManagedJobInfo] = None
3816
+ if managed_job_dag is not None:
3817
+ workspace = skypilot_config.get_active_workspace(
3818
+ force_user_workspace=True)
3819
+ entrypoint = common_utils.get_current_command()
3820
+
3821
+ managed_job_tasks: List[jobsv1_pb2.ManagedJobTask] = []
3822
+ for task_id, task in enumerate(managed_job_dag.tasks):
3823
+ resources_str = backend_utils.get_task_resources_str(
3824
+ task, is_managed_job=True)
3825
+ managed_job_tasks.append(
3826
+ jobsv1_pb2.ManagedJobTask(
3827
+ task_id=task_id,
3828
+ name=task.name,
3829
+ resources_str=resources_str,
3830
+ metadata_json=task.metadata_json))
3831
+
3832
+ managed_job_info = jobsv1_pb2.ManagedJobInfo(
3833
+ name=managed_job_dag.name,
3834
+ pool=managed_job_dag.pool,
3835
+ workspace=workspace,
3836
+ entrypoint=entrypoint,
3837
+ tasks=managed_job_tasks,
3838
+ user_id=managed_job_user_id)
3839
+
3840
+ if _is_command_length_over_limit(codegen):
3841
+ _dump_code_to_file(codegen)
3842
+ queue_job_request = jobsv1_pb2.QueueJobRequest(
3843
+ job_id=job_id,
3844
+ # codegen not set - server assumes script uploaded
3845
+ remote_log_dir=remote_log_dir,
3846
+ managed_job=managed_job_info,
3847
+ script_path=script_path)
3848
+ else:
3849
+ queue_job_request = jobsv1_pb2.QueueJobRequest(
3850
+ job_id=job_id,
3851
+ codegen=codegen,
3852
+ remote_log_dir=remote_log_dir,
3853
+ managed_job=managed_job_info,
3854
+ script_path=script_path)
3855
+
3856
+ backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
3857
+ handle.get_grpc_channel()).queue_job(queue_job_request))
3858
+ except exceptions.SkyletMethodNotImplementedError:
3859
+ use_legacy = True
3860
+
3861
+ if use_legacy:
3862
+ if _is_command_length_over_limit(job_submit_cmd):
3863
+ _dump_code_to_file(codegen)
3864
+ job_submit_cmd = f'{mkdir_code} && {code}'
3865
+
3866
+ def _maybe_add_managed_job_code(job_submit_cmd: str) -> str:
3867
+ if managed_job_dag is not None:
3868
+ # Add the managed job to job queue database.
3869
+ managed_job_codegen = managed_jobs.ManagedJobCodeGen()
3870
+ managed_job_code = managed_job_codegen.set_pending(
3871
+ job_id,
3872
+ managed_job_dag,
3873
+ skypilot_config.get_active_workspace(
3874
+ force_user_workspace=True),
3875
+ entrypoint=common_utils.get_current_command(),
3876
+ user_hash=managed_job_user_id)
3877
+ # Set the managed job to PENDING state to make sure that
3878
+ # this managed job appears in the `sky jobs queue`, even
3879
+ # if it needs to wait to be submitted.
3880
+ # We cannot set the managed job to PENDING state in the
3881
+ # job template (jobs-controller.yaml.j2), as it may need
3882
+ # to wait for the run commands to be scheduled on the job
3883
+ # controller in high-load cases.
3884
+ job_submit_cmd += ' && ' + managed_job_code
3885
+ return job_submit_cmd
3862
3886
 
3863
- returncode, stdout, stderr = self.run_on_head(handle,
3864
- job_submit_cmd,
3865
- stream_logs=False,
3866
- require_outputs=True)
3867
- # Happens when someone calls `sky exec` but remote is outdated for
3868
- # running a job. Necessitating calling `sky launch`.
3869
- backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3870
- handle.cluster_name)
3871
- output = stdout + stderr
3872
- if ((returncode == 255 and 'too long' in output.lower()) or
3873
- (returncode == 1 and 'request-uri too large' in output.lower())):
3874
- # If the generated script is too long, we retry it with dumping
3875
- # the script to a file and running it with SSH. We use a general
3876
- # length limit check before but it could be inaccurate on some
3877
- # systems.
3878
- # When there is a cloudflare proxy in front of the remote, it could
3879
- # cause `414 Request-URI Too Large` error.
3880
- logger.debug('Failed to submit job due to command length limit. '
3881
- 'Dumping job to file and running it with SSH. '
3882
- f'Output: {output}')
3883
- _dump_code_to_file(codegen)
3884
- job_submit_cmd = f'{mkdir_code} && {code}'
3885
3887
  job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
3888
+
3886
3889
  returncode, stdout, stderr = self.run_on_head(handle,
3887
3890
  job_submit_cmd,
3888
3891
  stream_logs=False,
3889
3892
  require_outputs=True)
3893
+ # Happens when someone calls `sky exec` but remote is outdated for
3894
+ # running a job. Necessitating calling `sky launch`.
3895
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3896
+ handle.cluster_name)
3897
+ output = stdout + stderr
3898
+ if _is_message_too_long(returncode, output=output):
3899
+ # If the job submit script is too long, we need to retry it
3900
+ # with dumping the script to a file and running it the script
3901
+ # on remote cluster instead.
3902
+ logger.debug(
3903
+ 'Failed to submit job due to command length limit. '
3904
+ 'Dumping job to file and running it with SSH. '
3905
+ f'Output: {output}')
3906
+ _dump_code_to_file(codegen)
3907
+ job_submit_cmd = f'{mkdir_code} && {code}'
3908
+ job_submit_cmd = _maybe_add_managed_job_code(job_submit_cmd)
3909
+ returncode, stdout, stderr = self.run_on_head(
3910
+ handle,
3911
+ job_submit_cmd,
3912
+ stream_logs=False,
3913
+ require_outputs=True)
3890
3914
 
3891
- subprocess_utils.handle_returncode(returncode,
3892
- job_submit_cmd,
3893
- f'Failed to submit job {job_id}.',
3894
- stderr=stdout + stderr)
3915
+ subprocess_utils.handle_returncode(
3916
+ returncode,
3917
+ job_submit_cmd,
3918
+ f'Failed to submit job {job_id}.',
3919
+ stderr=stdout + stderr)
3895
3920
 
3896
3921
  controller = controller_utils.Controllers.from_name(handle.cluster_name)
3897
3922
  if controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER:
@@ -3900,61 +3925,74 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
3900
3925
  logger.info(
3901
3926
  ux_utils.starting_message(f'Job submitted, ID: {job_id}'))
3902
3927
  rich_utils.stop_safe_status()
3903
- if not detach_run:
3904
- if (handle.cluster_name == controller_utils.Controllers.
3905
- JOBS_CONTROLLER.value.cluster_name):
3906
- self.tail_managed_job_logs(handle, job_id)
3907
- else:
3908
- # Sky logs. Not using subprocess.run since it will make the
3909
- # ssh keep connected after ctrl-c.
3910
- self.tail_logs(handle, job_id)
3911
3928
 
3912
3929
  def _add_job(self, handle: CloudVmRayResourceHandle,
3913
3930
  job_name: Optional[str], resources_str: str,
3914
3931
  metadata: str) -> Tuple[int, str]:
3915
- code = job_lib.JobLibCodeGen.add_job(
3916
- job_name=job_name,
3917
- username=common_utils.get_user_hash(),
3918
- run_timestamp=self.run_timestamp,
3919
- resources_str=resources_str,
3920
- metadata=metadata)
3921
- returncode, result_str, stderr = self.run_on_head(handle,
3922
- code,
3923
- stream_logs=False,
3924
- require_outputs=True,
3925
- separate_stderr=True)
3926
- # Happens when someone calls `sky exec` but remote is outdated for
3927
- # adding a job. Necessitating calling `sky launch`.
3928
- backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3929
- handle.cluster_name)
3930
- # TODO(zhwu): this sometimes will unexpectedly fail, we can add
3931
- # retry for this, after we figure out the reason.
3932
- subprocess_utils.handle_returncode(returncode, code,
3933
- 'Failed to fetch job id.', stderr)
3934
- try:
3935
- job_id_match = _JOB_ID_PATTERN.search(result_str)
3936
- if job_id_match is not None:
3937
- job_id = int(job_id_match.group(1))
3938
- else:
3939
- # For backward compatibility.
3940
- job_id = int(result_str)
3941
- log_dir_match = _LOG_DIR_PATTERN.search(result_str)
3942
- if log_dir_match is not None:
3943
- log_dir = log_dir_match.group(1).strip()
3944
- else:
3945
- # For backward compatibility, use the same log dir as local.
3946
- log_dir = self.log_dir
3947
- except ValueError as e:
3948
- logger.error(stderr)
3949
- raise ValueError(f'Failed to parse job id: {result_str}; '
3950
- f'Returncode: {returncode}') from e
3932
+ use_legacy = not handle.is_grpc_enabled_with_flag
3933
+
3934
+ if not use_legacy:
3935
+ try:
3936
+ request = jobsv1_pb2.AddJobRequest(
3937
+ job_name=job_name,
3938
+ username=common_utils.get_user_hash(),
3939
+ run_timestamp=self.run_timestamp,
3940
+ resources_str=resources_str,
3941
+ metadata=metadata)
3942
+ response = backend_utils.invoke_skylet_with_retries(
3943
+ lambda: SkyletClient(handle.get_grpc_channel()).add_job(
3944
+ request))
3945
+ job_id = response.job_id
3946
+ log_dir = response.log_dir
3947
+ return job_id, log_dir
3948
+ except exceptions.SkyletMethodNotImplementedError:
3949
+ use_legacy = True
3950
+
3951
+ if use_legacy:
3952
+ code = job_lib.JobLibCodeGen.add_job(
3953
+ job_name=job_name,
3954
+ username=common_utils.get_user_hash(),
3955
+ run_timestamp=self.run_timestamp,
3956
+ resources_str=resources_str,
3957
+ metadata=metadata)
3958
+ returncode, result_str, stderr = self.run_on_head(
3959
+ handle,
3960
+ code,
3961
+ stream_logs=False,
3962
+ require_outputs=True,
3963
+ separate_stderr=True)
3964
+ # Happens when someone calls `sky exec` but remote is outdated for
3965
+ # adding a job. Necessitating calling `sky launch`.
3966
+ backend_utils.check_stale_runtime_on_remote(returncode, stderr,
3967
+ handle.cluster_name)
3968
+ # TODO(zhwu): this sometimes will unexpectedly fail, we can add
3969
+ # retry for this, after we figure out the reason.
3970
+ subprocess_utils.handle_returncode(returncode, code,
3971
+ 'Failed to fetch job id.',
3972
+ stderr)
3973
+ try:
3974
+ job_id_match = _JOB_ID_PATTERN.search(result_str)
3975
+ if job_id_match is not None:
3976
+ job_id = int(job_id_match.group(1))
3977
+ else:
3978
+ # For backward compatibility.
3979
+ job_id = int(result_str)
3980
+ log_dir_match = _LOG_DIR_PATTERN.search(result_str)
3981
+ if log_dir_match is not None:
3982
+ log_dir = log_dir_match.group(1).strip()
3983
+ else:
3984
+ # For backward compatibility, use the same log dir as local.
3985
+ log_dir = self.log_dir
3986
+ except ValueError as e:
3987
+ logger.error(stderr)
3988
+ raise ValueError(f'Failed to parse job id: {result_str}; '
3989
+ f'Returncode: {returncode}') from e
3951
3990
  return job_id, log_dir
3952
3991
 
3953
3992
  def _execute(
3954
3993
  self,
3955
3994
  handle: CloudVmRayResourceHandle,
3956
3995
  task: task_lib.Task,
3957
- detach_run: bool,
3958
3996
  dryrun: bool = False,
3959
3997
  ) -> Optional[int]:
3960
3998
  """Executes the task on the cluster.
@@ -4006,12 +4044,10 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4006
4044
  num_actual_nodes = task.num_nodes * handle.num_ips_per_node
4007
4045
  # Case: task_lib.Task(run, num_nodes=N) or TPU VM Pods
4008
4046
  if num_actual_nodes > 1:
4009
- self._execute_task_n_nodes(handle, task_copy, job_id, detach_run,
4010
- log_dir)
4047
+ self._execute_task_n_nodes(handle, task_copy, job_id, log_dir)
4011
4048
  else:
4012
4049
  # Case: task_lib.Task(run, num_nodes=1)
4013
- self._execute_task_one_node(handle, task_copy, job_id, detach_run,
4014
- log_dir)
4050
+ self._execute_task_one_node(handle, task_copy, job_id, log_dir)
4015
4051
 
4016
4052
  return job_id
4017
4053
 
@@ -4054,7 +4090,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4054
4090
  is_identity_mismatch_and_purge = False
4055
4091
  try:
4056
4092
  backend_utils.check_owner_identity(cluster_name)
4057
- except exceptions.ClusterOwnerIdentityMismatchError as e:
4093
+ except (exceptions.ClusterOwnerIdentityMismatchError,
4094
+ exceptions.CloudUserIdentityError) as e:
4058
4095
  if purge:
4059
4096
  logger.error(e)
4060
4097
  verbed = 'terminated' if terminate else 'stopped'
@@ -4068,15 +4105,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4068
4105
  else:
4069
4106
  raise
4070
4107
  lock_id = backend_utils.cluster_status_lock_id(cluster_name)
4071
- lock = locks.get_lock(lock_id)
4108
+ lock = locks.get_lock(lock_id, timeout=1)
4072
4109
  # Retry in case new cluster operation comes in and holds the lock
4073
4110
  # right after the lock is removed.
4074
4111
  n_attempts = 2
4075
4112
  while True:
4076
4113
  n_attempts -= 1
4077
- # In case other running cluster operations are still holding the
4078
- # lock.
4079
- lock.force_unlock()
4080
4114
  # We have to kill the cluster requests, because `down` and `stop`
4081
4115
  # should be higher priority than the cluster requests, and we should
4082
4116
  # release the lock from other requests.
@@ -4094,6 +4128,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4094
4128
  'Failed to kill other launch requests for the '
4095
4129
  f'cluster {handle.cluster_name}: '
4096
4130
  f'{common_utils.format_exception(e, use_bracket=True)}')
4131
+ # In case other running cluster operations are still holding the
4132
+ # lock.
4133
+ lock.force_unlock()
4097
4134
  try:
4098
4135
  with lock:
4099
4136
  self.teardown_no_lock(
@@ -4126,6 +4163,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4126
4163
  job_ids: Optional[List[int]] = None,
4127
4164
  stream_logs: bool = True
4128
4165
  ) -> Dict[Optional[int], Optional[job_lib.JobStatus]]:
4166
+ if handle.is_grpc_enabled_with_flag:
4167
+ try:
4168
+ request = jobsv1_pb2.GetJobStatusRequest(job_ids=job_ids)
4169
+ response = backend_utils.invoke_skylet_with_retries(
4170
+ lambda: SkyletClient(handle.get_grpc_channel()
4171
+ ).get_job_status(request))
4172
+ statuses: Dict[Optional[int], Optional[job_lib.JobStatus]] = {
4173
+ job_id: job_lib.JobStatus.from_protobuf(proto_status)
4174
+ for job_id, proto_status in response.job_statuses.items()
4175
+ }
4176
+ return statuses
4177
+ except exceptions.SkyletMethodNotImplementedError:
4178
+ pass
4179
+
4129
4180
  code = job_lib.JobLibCodeGen.get_job_status(job_ids)
4130
4181
  returncode, stdout, stderr = self.run_on_head(handle,
4131
4182
  code,
@@ -4146,16 +4197,32 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
4146
4197
 
4147
4198
  See `skylet.job_lib.cancel_jobs_encoded_results` for more details.
4148
4199
  """
4149
- code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all, user_hash)
4150
- returncode, stdout, _ = self.run_on_head(handle,
4151
- code,
4152
- stream_logs=False,
4153
- require_outputs=True)
4154
- subprocess_utils.handle_returncode(
4155
- returncode, code,
4156
- f'Failed to cancel jobs on cluster {handle.cluster_name}.', stdout)
4157
-
4158
- cancelled_ids = message_utils.decode_payload(stdout)
4200
+ use_legacy = not handle.is_grpc_enabled_with_flag
4201
+
4202
+ if not use_legacy:
4203
+ try:
4204
+ request = jobsv1_pb2.CancelJobsRequest(job_ids=jobs,
4205
+ cancel_all=cancel_all,
4206
+ user_hash=user_hash)
4207
+ response = backend_utils.invoke_skylet_with_retries(
4208
+ lambda: SkyletClient(handle.get_grpc_channel()).cancel_jobs(
4209
+ request))
4210
+ cancelled_ids = response.cancelled_job_ids
4211
+ except exceptions.SkyletMethodNotImplementedError:
4212
+ use_legacy = True
4213
+
4214
+ if use_legacy:
4215
+ code = job_lib.JobLibCodeGen.cancel_jobs(jobs, cancel_all,
4216
+ user_hash)
4217
+ returncode, stdout, _ = self.run_on_head(handle,
4218
+ code,
4219
+ stream_logs=False,
4220
+ require_outputs=True)
4221
+ subprocess_utils.handle_returncode(
4222
+ returncode, code,
4223
+ f'Failed to cancel jobs on cluster {handle.cluster_name}.',
4224
+ stdout)
4225
+ cancelled_ids = message_utils.decode_payload(stdout)
4159
4226
  if cancelled_ids:
4160
4227
  logger.info(
4161
4228
  f'Cancelled job ID(s): {", ".join(map(str, cancelled_ids))}')
@@ -4172,20 +4239,48 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Returns:
             A dictionary mapping job_id to log path.
         """
-        code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
-        returncode, job_to_dir, stderr = self.run_on_head(handle,
+        job_to_dir: Dict[str, str] = {}
+        use_legacy = not handle.is_grpc_enabled_with_flag
+
+        if not use_legacy:
+            try:
+                int_job_ids = []
+                if job_ids:
+                    for str_job_id in job_ids:
+                        if str_job_id.isdigit():
+                            int_job_ids.append(int(str_job_id))
+                request = jobsv1_pb2.GetLogDirsForJobsRequest(
+                    job_ids=int_job_ids)
+                response = backend_utils.invoke_skylet_with_retries(
+                    lambda: SkyletClient(handle.get_grpc_channel()
+                                        ).get_log_dirs_for_jobs(request))
+                job_log_dirs = response.job_log_dirs
+                if not job_log_dirs:
+                    logger.info(f'{colorama.Fore.YELLOW}'
+                                'No matching log directories found'
+                                f'{colorama.Style.RESET_ALL}')
+                    return {}
+                for job_id, log_dir in job_log_dirs.items():
+                    # Convert to string for backwards compatibility
+                    job_to_dir[str(job_id)] = log_dir
+            except exceptions.SkyletMethodNotImplementedError:
+                use_legacy = True
+
+        if use_legacy:
+            code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(job_ids)
+            returncode, stdout, stderr = self.run_on_head(handle,
                                                           code,
                                                           stream_logs=False,
                                                           require_outputs=True,
                                                           separate_stderr=True)
-        subprocess_utils.handle_returncode(returncode, code,
-                                           'Failed to sync logs.', stderr)
-        job_to_dir: Dict[str, str] = message_utils.decode_payload(job_to_dir)
-        if not job_to_dir:
-            logger.info(f'{colorama.Fore.YELLOW}'
-                        'No matching log directories found'
-                        f'{colorama.Style.RESET_ALL}')
-            return {}
+            subprocess_utils.handle_returncode(returncode, code,
+                                               'Failed to sync logs.', stderr)
+            job_to_dir = message_utils.decode_payload(stdout)
+            if not job_to_dir:
+                logger.info(f'{colorama.Fore.YELLOW}'
+                            'No matching log directories found'
+                            f'{colorama.Style.RESET_ALL}')
+                return {}
 
         job_ids = list(job_to_dir.keys())
         dirs = list(job_to_dir.values())
@@ -4195,9 +4290,23 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             (dir if constants.SKY_LOGS_DIRECTORY in dir else os.path.join(
                 constants.SKY_LOGS_DIRECTORY, dir)) for dir in dirs
         ]
-        local_log_dirs = [(dir.replace(constants.SKY_LOGS_DIRECTORY, local_dir)
-                           if constants.SKY_LOGS_DIRECTORY in dir else
-                           os.path.join(local_dir, dir)) for dir in dirs]
+        # Include cluster name in local log directory path to avoid conflicts
+        # when the same job_id exists on different clusters
+        cluster_name = handle.cluster_name
+        local_log_dirs = []
+        for remote_log_dir in dirs:
+            if constants.SKY_LOGS_DIRECTORY in remote_log_dir:
+                # Extract the job-specific directory name from the full path
+                # e.g., ~/sky_logs/1-job_name -> 1-job_name
+                job_dir = remote_log_dir.replace(constants.SKY_LOGS_DIRECTORY,
+                                                 '').lstrip('/')
+                local_log_dir = os.path.join(local_dir, cluster_name, job_dir)
+            else:
+                # remote_log_dir is already just the job directory name (e.g.,
+                # "1-job_name")
+                local_log_dir = os.path.join(local_dir, cluster_name,
+                                             remote_log_dir)
+            local_log_dirs.append(local_log_dir)
 
         runners = handle.get_command_runners()
 
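The rewritten mapping above prefixes the local path with the cluster name so that two clusters with the same job id no longer collide when logs are synced down. A small standalone illustration of that mapping, with `~/sky_logs` hard-coded as an assumed remote logs root rather than the real `constants.SKY_LOGS_DIRECTORY`:

```python
import os

SKY_LOGS_DIRECTORY = '~/sky_logs'  # assumed remote logs root


def to_local_log_dir(remote_log_dir: str, local_dir: str,
                     cluster_name: str) -> str:
    """Map a remote log dir to <local_dir>/<cluster_name>/<job_dir>."""
    if SKY_LOGS_DIRECTORY in remote_log_dir:
        job_dir = remote_log_dir.replace(SKY_LOGS_DIRECTORY, '').lstrip('/')
    else:
        job_dir = remote_log_dir
    return os.path.join(local_dir, cluster_name, job_dir)


print(to_local_log_dir('~/sky_logs/1-train', '/tmp/logs', 'my-cluster'))
# /tmp/logs/my-cluster/1-train
print(to_local_log_dir('1-train', '/tmp/logs', 'other-cluster'))
# /tmp/logs/other-cluster/1-train
```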
@@ -4261,6 +4370,28 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             The exit code of the tail command. Returns code 100 if the job has
             failed. See exceptions.JobExitCode for possible return codes.
         """
+        if handle.is_grpc_enabled_with_flag:
+            last_exit_code = 0
+            try:
+                request = jobsv1_pb2.TailLogsRequest(
+                    job_id=job_id,
+                    managed_job_id=managed_job_id,
+                    follow=follow,
+                    tail=tail)
+                for resp in backend_utils.invoke_skylet_streaming_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel()
+                                            ).tail_logs(request, timeout=None)):
+                    if resp.log_line:
+                        print(resp.log_line, end='', flush=True)
+                    last_exit_code = resp.exit_code
+                return last_exit_code
+            except exceptions.SkyletMethodNotImplementedError:
+                pass
+            except grpc.RpcError as e:
+                if e.code() == grpc.StatusCode.CANCELLED:
+                    return last_exit_code
+                raise e
+
         code = job_lib.JobLibCodeGen.tail_logs(job_id,
                                                managed_job_id=managed_job_id,
                                                follow=follow,
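The new tail path consumes a server-streaming RPC: each response may carry a log line plus the latest exit code, and a client-side cancellation (e.g., Ctrl-C while following) should return the last exit code seen rather than propagate an error. The generator below stands in for the gRPC stream; `LogChunk` and `StreamCancelled` are hypothetical names, not the real protobuf or grpc types.

```python
from dataclasses import dataclass
from typing import Iterable, Iterator


@dataclass
class LogChunk:
    log_line: str
    exit_code: int


class StreamCancelled(Exception):
    """Stands in for grpc.RpcError with StatusCode.CANCELLED."""


def tail_stream(chunks: Iterable[LogChunk]) -> int:
    """Print streamed log lines; return the last exit code observed."""
    last_exit_code = 0
    try:
        for chunk in chunks:
            if chunk.log_line:
                print(chunk.log_line, end='', flush=True)
            last_exit_code = chunk.exit_code
    except StreamCancelled:
        # Treat client-side cancellation as a normal end of the tail.
        pass
    return last_exit_code


def fake_stream() -> Iterator[LogChunk]:
    yield LogChunk('hello\n', 0)
    yield LogChunk('done\n', 0)


print(tail_stream(fake_stream()))  # 0
```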
@@ -4298,6 +4429,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                   tail: Optional[int] = None) -> int:
         # if job_name is not None, job_id should be None
         assert job_name is None or job_id is None, (job_name, job_id)
+        # TODO(kevin): Migrate stream_logs to gRPC
         code = managed_jobs.ManagedJobCodeGen.stream_logs(
             job_name, job_id, follow, controller, tail)
 
@@ -4343,20 +4475,37 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         assert job_name is None or job_id is None, (job_name, job_id)
 
         if job_id is None:
-            # generate code to get the job_id
+            # get the job_id
             # if job_name is None, get all job_ids
             # TODO: Only get the latest job_id, since that's the only one we use
-            code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
-                job_name=job_name)
-            returncode, job_ids, stderr = self.run_on_head(handle,
-                                                           code,
-                                                           stream_logs=False,
-                                                           require_outputs=True,
-                                                           separate_stderr=True)
-            subprocess_utils.handle_returncode(returncode, code,
-                                               'Failed to sync down logs.',
-                                               stderr)
-            job_ids = message_utils.decode_payload(job_ids)
+
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            logger.info(f'handle.is_grpc_enabled_with_flag: '
+                        f'{handle.is_grpc_enabled_with_flag}')
+            if not use_legacy:
+                try:
+                    request = managed_jobsv1_pb2.GetAllJobIdsByNameRequest(
+                        job_name=job_name)
+                    response = backend_utils.invoke_skylet_with_retries(
+                        lambda: SkyletClient(handle.get_grpc_channel(
+                        )).get_all_managed_job_ids_by_name(request))
+                    job_ids = list(response.job_ids)
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = managed_jobs.ManagedJobCodeGen.get_all_job_ids_by_name(
+                    job_name=job_name)
+                returncode, job_ids_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
+                                                   'Failed to sync down logs.',
+                                                   stderr)
+                job_ids = message_utils.decode_payload(job_ids_payload)
             if not job_ids:
                 logger.info(f'{colorama.Fore.YELLOW}'
                             'No matching job found'
@@ -4384,18 +4533,39 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         else:
             # get the run_timestamp
             # the function takes in [job_id]
-            code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs([str(job_id)])
-            returncode, run_timestamps_payload, stderr = self.run_on_head(
-                handle,
-                code,
-                stream_logs=False,
-                require_outputs=True,
-                separate_stderr=True)
-            subprocess_utils.handle_returncode(returncode, code,
-                                               'Failed to sync logs.', stderr)
-            # returns with a dict of {job_id: run_timestamp}
-            run_timestamps = message_utils.decode_payload(
-                run_timestamps_payload)
+            use_legacy = not handle.is_grpc_enabled_with_flag
+            if not use_legacy:
+                try:
+                    log_dirs_request = jobsv1_pb2.GetLogDirsForJobsRequest(
+                        job_ids=[job_id])
+                    log_dirs_response = (
+                        backend_utils.invoke_skylet_with_retries(
+                            lambda: SkyletClient(handle.get_grpc_channel(
+                            )).get_log_dirs_for_jobs(log_dirs_request)))
+                    job_log_dirs = log_dirs_response.job_log_dirs
+                    # Convert back to the expected format
+                    # {job_id: run_timestamp}
+                    run_timestamps = {}
+                    for jid, log_dir in job_log_dirs.items():
+                        run_timestamps[int(jid)] = log_dir
+                except exceptions.SkyletMethodNotImplementedError:
+                    use_legacy = True
+
+            if use_legacy:
+                code = job_lib.JobLibCodeGen.get_log_dirs_for_jobs(
+                    [str(job_id)])
+                returncode, run_timestamps_payload, stderr = self.run_on_head(
+                    handle,
+                    code,
+                    stream_logs=False,
+                    require_outputs=True,
+                    separate_stderr=True)
+                subprocess_utils.handle_returncode(returncode, code,
+                                                   'Failed to sync logs.',
+                                                   stderr)
+                # returns with a dict of {job_id: run_timestamp}
+                run_timestamps = message_utils.decode_payload(
+                    run_timestamps_payload)
             if not run_timestamps:
                 logger.info(f'{colorama.Fore.YELLOW}'
                             'No matching log directories found'
@@ -4462,11 +4632,12 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                     exist_ok=True)
             log_file = os.path.join(local_log_dir, 'run.log')
 
-            code = managed_jobs.ManagedJobCodeGen.stream_logs(job_name=None,
-                                                              job_id=job_id,
-                                                              follow=False,
-                                                              controller=False)
-
+            # TODO(kevin): Migrate stream_logs to gRPC
+            code = managed_jobs.ManagedJobCodeGen.stream_logs(
+                job_name=None,
+                job_id=int(job_id),
+                follow=False,
+                controller=False)
             # With the stdin=subprocess.DEVNULL, the ctrl-c will not
             # kill the process, so we need to handle it manually here.
             if threading.current_thread() is threading.main_thread():
@@ -4507,6 +4678,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         Raises:
             RuntimeError: If the cluster fails to be terminated/stopped.
         """
+        try:
+            handle.close_skylet_ssh_tunnel()
+        except Exception as e:  # pylint: disable=broad-except
+            # Not critical to the cluster teardown, just log a warning.
+            logger.warning(
+                'Failed to close Skylet SSH tunnel for cluster '
+                f'{handle.cluster_name}: '
+                f'{common_utils.format_exception(e, use_bracket=True)}')
+
         exclude_request_to_kill = 'sky.down' if terminate else 'sky.stop'
         # We have to kill the cluster requests again within the lock, because
         # any pending requests on the same cluster should be cancelled after
@@ -4543,7 +4723,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                     # observed in AWS. See also
                     # _LAUNCH_DOUBLE_CHECK_WINDOW in backend_utils.py.
                     force_refresh_statuses={status_lib.ClusterStatus.INIT},
-                    acquire_per_cluster_status_lock=False))
+                    cluster_lock_already_held=True,
+                    retry_if_missing=False))
             cluster_status_fetched = True
         except exceptions.ClusterStatusFetchingError:
             logger.warning(
@@ -4551,10 +4732,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 f'{handle.cluster_name!r}. Assuming the cluster is still '
                 'up.')
         if not cluster_status_fetched:
-            record = global_user_state.get_cluster_from_name(
+            status = global_user_state.get_status_from_cluster_name(
                 handle.cluster_name)
-            prev_cluster_status = record[
-                'status'] if record is not None else None
+            prev_cluster_status = status if status is not None else None
         if prev_cluster_status is None:
             # When the cluster is not in the cluster table, we guarantee that
             # all related resources / cache / config are cleaned up, i.e. it
@@ -4786,7 +4966,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                             config['provider'])
                 ports_cleaned_up = True
             except exceptions.NotSupportedError:
-                pass
+                ports_cleaned_up = True
             except exceptions.PortDoesNotExistError:
                 logger.debug('Ports do not exist. Skipping cleanup.')
             except Exception as e:  # pylint: disable=broad-except
@@ -4811,7 +4991,7 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                                             failover)
                 custom_multi_network_cleaned_up = True
             except exceptions.NotSupportedError:
-                pass
+                custom_multi_network_cleaned_up = True
             except Exception as e:  # pylint: disable=broad-except
                 if purge:
                     msg = common_utils.format_exception(e, use_bracket=True)
@@ -4913,7 +5093,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         cluster_yaml_path = handle.cluster_yaml
         handle.cluster_yaml = None
         global_user_state.update_cluster_handle(handle.cluster_name, handle)
-        global_user_state.remove_cluster_yaml(handle.cluster_name)
+        # Removing the cluster YAML can cause some unexpected stability issues.
+        # See #5011.
+        # global_user_state.remove_cluster_yaml(handle.cluster_name)
         common_utils.remove_file_if_exists(cluster_yaml_path)
 
     def set_autostop(self,
@@ -4974,9 +5156,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
                 autostopv1_pb2.AUTOSTOP_WAIT_FOR_UNSPECIFIED,
                 down=down,
             )
-            backend_utils.invoke_skylet_with_retries(
-                handle, lambda: SkyletClient(handle.get_grpc_channel()).
-                set_autostop(request))
+            backend_utils.invoke_skylet_with_retries(lambda: SkyletClient(
+                handle.get_grpc_channel()).set_autostop(request))
         else:
             code = autostop_lib.AutostopCodeGen.set_autostop(
                 idle_minutes_to_autostop, self.NAME, wait_for, down)
@@ -5015,8 +5196,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         try:
             request = autostopv1_pb2.IsAutostoppingRequest()
             response = backend_utils.invoke_skylet_with_retries(
-                handle, lambda: SkyletClient(handle.get_grpc_channel()).
-                is_autostopping(request))
+                lambda: SkyletClient(handle.get_grpc_channel()
+                                    ).is_autostopping(request))
             return response.is_autostopping
         except Exception as e:  # pylint: disable=broad-except
             # The cluster may have been terminated, causing the gRPC call
@@ -5128,7 +5309,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             exceptions.InvalidClusterNameError: If the cluster name is invalid.
             # TODO(zhwu): complete the list of exceptions.
         """
-        record = global_user_state.get_cluster_from_name(cluster_name)
+        record = global_user_state.get_cluster_from_name(
+            cluster_name, include_user_info=False, summary_response=True)
         if record is None:
             handle_before_refresh = None
             status_before_refresh = None
@@ -5148,7 +5330,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             record = backend_utils.refresh_cluster_record(
                 cluster_name,
                 force_refresh_statuses={status_lib.ClusterStatus.INIT},
-                acquire_per_cluster_status_lock=False,
+                cluster_lock_already_held=True,
+                include_user_info=False,
+                summary_response=True,
             )
             if record is not None:
                 prev_cluster_status = record['status']
@@ -5264,33 +5448,41 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         common_utils.check_cluster_name_is_valid(cluster_name)
 
         if to_provision is None:
-            # The cluster is recently terminated either by autostop or manually
-            # terminated on the cloud. We should use the previously terminated
-            # resources to provision the cluster.
-            #
-            # FIXME(zongheng): this assert can be hit by using two terminals.
-            # First, create a 'dbg' cluster. Then:
-            #   Terminal 1: sky down dbg -y
-            #   Terminal 2: sky launch -c dbg -- echo
-            # Run it in order. Terminal 2 will show this error after terminal 1
-            # succeeds in downing the cluster and releasing the lock.
-            assert isinstance(
-                handle_before_refresh, CloudVmRayResourceHandle), (
-                    f'Trying to launch cluster {cluster_name!r} recently '
-                    'terminated on the cloud, but the handle is not a '
-                    f'CloudVmRayResourceHandle ({handle_before_refresh}).')
-            status_before_refresh_str = None
-            if status_before_refresh is not None:
-                status_before_refresh_str = status_before_refresh.value
-
-            logger.info(
-                f'The cluster {cluster_name!r} (status: '
-                f'{status_before_refresh_str}) was not found on the cloud: it '
-                'may be autodowned, manually terminated, or its launch never '
-                'succeeded. Provisioning a new cluster by using the same '
-                'resources as its original launch.')
-            to_provision = handle_before_refresh.launched_resources
-            self.check_resources_fit_cluster(handle_before_refresh, task)
+            # Recently terminated after refresh. OPTIMIZE usually ran outside
+            # the lock, so that decision may be stale by now. Under the lock,
+            # ensure we always have a concrete plan via the following order:
+            #   1) Reuse last placement snapshot (if available);
+            #   2) Else, call injected planner for a fresh plan.
+            # If we still have a pre-refresh handle snapshot with a concrete
+            # placement, prefer reusing it.
+            if (isinstance(handle_before_refresh, CloudVmRayResourceHandle) and
+                    handle_before_refresh.launched_resources is not None):
+                to_provision = handle_before_refresh.launched_resources
+                # Ensure the requested task fits the previous placement.
+                self.check_resources_fit_cluster(handle_before_refresh, task)
+                # Mirror the original message for reuse path.
+                status_before_refresh_str = None
+                if status_before_refresh is not None:
+                    status_before_refresh_str = status_before_refresh.value
+                logger.info(
+                    f'The cluster {cluster_name!r} (status: '
+                    f'{status_before_refresh_str}) was not found on the cloud: '
+                    'it may be autodowned, manually terminated, or its launch '
+                    'never succeeded. Provisioning a new cluster by using the '
+                    'same resources as its original launch.')
+            elif self._planner is not None:
+                to_provision = self._planner(task)
+                logger.info(
+                    'Previous placement snapshot missing; computing a fresh '
+                    'plan for provisioning.')
+            else:
+                # Without a snapshot or planner, we cannot proceed safely.
+                # Surface a user-friendly error without a long traceback.
+                with ux_utils.print_exception_no_traceback():
+                    raise RuntimeError(
+                        'No concrete launch plan available after recent cloud '
+                        f'termination of cluster {cluster_name!r}. Ensure the '
+                        'OPTIMIZE stage runs or provide concrete resources.')
 
         return RetryingVmProvisioner.ToProvisionConfig(
             cluster_name,
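The replacement above turns a hard assert into an ordered fallback: reuse the pre-refresh placement if the handle still carries one, otherwise ask an injected planner for a fresh plan, otherwise fail with a clear error. A compact sketch of that decision order; the string "resources", the `planner` callable, and the flattened arguments are simplified stand-ins for the real handle and task objects.

```python
from typing import Callable, Optional


def choose_launch_plan(snapshot_resources: Optional[str],
                       planner: Optional[Callable[[], str]]) -> str:
    """Pick a concrete plan: snapshot first, then planner, else error."""
    if snapshot_resources is not None:
        # Reuse the placement recorded before the refresh.
        return snapshot_resources
    if planner is not None:
        # No snapshot: compute a fresh plan under the lock.
        return planner()
    raise RuntimeError('No concrete launch plan available; run OPTIMIZE or '
                       'provide concrete resources.')


print(choose_launch_plan('2x A100 on gcp', None))      # reuse snapshot
print(choose_launch_plan(None, lambda: 'fresh plan'))  # fall back to planner
```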
@@ -5639,7 +5831,8 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
     def _get_task_env_vars(self, task: task_lib.Task, job_id: int,
                            handle: CloudVmRayResourceHandle) -> Dict[str, str]:
         """Returns the environment variables for the task."""
-        env_vars = task.envs_and_secrets
+        env_vars = task_lib.get_plaintext_envs_and_secrets(
+            task.envs_and_secrets)
         # If it is a managed job, the TASK_ID_ENV_VAR will have been already set
         # by the controller.
         if constants.TASK_ID_ENV_VAR not in env_vars:
@@ -5651,9 +5844,15 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         env_vars.update(self._skypilot_predefined_env_vars(handle))
         return env_vars
 
+    def _get_managed_job_user_id(self, task: task_lib.Task) -> Optional[str]:
+        """Returns the user id for the managed job."""
+        if task.managed_job_dag is not None:
+            return task.envs[constants.USER_ID_ENV_VAR]
+        return None
+
     def _execute_task_one_node(self, handle: CloudVmRayResourceHandle,
                                task: task_lib.Task, job_id: int,
-                               detach_run: bool, remote_log_dir: str) -> None:
+                               remote_log_dir: str) -> None:
         # Launch the command as a Ray task.
         log_dir = os.path.join(remote_log_dir, 'tasks')
 
@@ -5663,9 +5862,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
 
         task_env_vars = self._get_task_env_vars(task, job_id, handle)
 
-        codegen = RayCodeGen()
+        codegen = task_codegen.RayCodeGen()
         codegen.add_prologue(job_id)
-        codegen.add_gang_scheduling_placement_group_and_setup(
+        codegen.add_setup(
             1,
             resources_dict,
             stable_cluster_internal_ips=internal_ips,
@@ -5674,31 +5873,27 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             setup_log_path=os.path.join(log_dir, 'setup.log'),
         )
 
-        if callable(task.run):
-            run_fn_code = textwrap.dedent(inspect.getsource(task.run))
-            run_fn_name = task.run.__name__
-            codegen.register_run_fn(run_fn_code, run_fn_name)
-
-        command_for_node = task.run if isinstance(task.run, str) else None
-        codegen.add_ray_task(
-            bash_script=command_for_node,
+        codegen.add_task(
+            1,
+            bash_script=task.run,
             env_vars=task_env_vars,
             task_name=task.name,
-            ray_resources_dict=backend_utils.get_task_demands_dict(task),
+            resources_dict=backend_utils.get_task_demands_dict(task),
             log_dir=log_dir)
 
         codegen.add_epilogue()
 
-        self._exec_code_on_head(handle,
-                                codegen.build(),
-                                job_id,
-                                detach_run=detach_run,
-                                managed_job_dag=task.managed_job_dag,
-                                remote_log_dir=remote_log_dir)
+        self._exec_code_on_head(
+            handle,
+            codegen.build(),
+            job_id,
+            managed_job_dag=task.managed_job_dag,
+            managed_job_user_id=self._get_managed_job_user_id(task),
+            remote_log_dir=remote_log_dir)
 
     def _execute_task_n_nodes(self, handle: CloudVmRayResourceHandle,
                               task: task_lib.Task, job_id: int,
-                              detach_run: bool, remote_log_dir: str) -> None:
+                              remote_log_dir: str) -> None:
         # Strategy:
         #   ray.init(...)
         #   for node:
@@ -5712,9 +5907,9 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
         num_actual_nodes = task.num_nodes * handle.num_ips_per_node
         task_env_vars = self._get_task_env_vars(task, job_id, handle)
 
-        codegen = RayCodeGen()
+        codegen = task_codegen.RayCodeGen()
         codegen.add_prologue(job_id)
-        codegen.add_gang_scheduling_placement_group_and_setup(
+        codegen.add_setup(
             num_actual_nodes,
             resources_dict,
             stable_cluster_internal_ips=internal_ips,
@@ -5723,31 +5918,20 @@ class CloudVmRayBackend(backends.Backend['CloudVmRayResourceHandle']):
             setup_log_path=os.path.join(log_dir, 'setup.log'),
         )
 
-        if callable(task.run):
-            run_fn_code = textwrap.dedent(inspect.getsource(task.run))
-            run_fn_name = task.run.__name__
-            codegen.register_run_fn(run_fn_code, run_fn_name)
-
-        # TODO(zhwu): The resources limitation for multi-node ray.tune and
-        # horovod should be considered.
-        for i in range(num_actual_nodes):
-            command_for_node = task.run if isinstance(task.run, str) else None
-
-            # Ray's per-node resources, to constrain scheduling each command to
-            # the corresponding node, represented by private IPs.
-            codegen.add_ray_task(
-                bash_script=command_for_node,
-                env_vars=task_env_vars,
-                task_name=task.name,
-                ray_resources_dict=backend_utils.get_task_demands_dict(task),
-                log_dir=log_dir,
-                gang_scheduling_id=i)
+        codegen.add_task(
+            num_actual_nodes,
+            bash_script=task.run,
+            env_vars=task_env_vars,
+            task_name=task.name,
+            resources_dict=backend_utils.get_task_demands_dict(task),
+            log_dir=log_dir)
 
         codegen.add_epilogue()
         # TODO(zhanghao): Add help info for downloading logs.
-        self._exec_code_on_head(handle,
-                                codegen.build(),
-                                job_id,
-                                detach_run=detach_run,
-                                managed_job_dag=task.managed_job_dag,
-                                remote_log_dir=remote_log_dir)
+        self._exec_code_on_head(
+            handle,
+            codegen.build(),
+            job_id,
+            managed_job_dag=task.managed_job_dag,
+            managed_job_user_id=self._get_managed_job_user_id(task),
+            remote_log_dir=remote_log_dir)
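Both execution paths now build the job script through a single `add_setup` / `add_task` pair instead of registering run functions and emitting one Ray task per node. The toy builder below only illustrates that consolidated call shape; the method names echo the diff, but the class, bodies, and arguments are invented for illustration and do not reflect the real `task_codegen.RayCodeGen` implementation.

```python
from typing import Dict, List, Optional


class ToyCodeGen:
    """Minimal stand-in for a prologue/setup/task/epilogue code generator."""

    def __init__(self) -> None:
        self._sections: List[str] = []

    def add_prologue(self, job_id: int) -> None:
        self._sections.append(f'# prologue for job {job_id}')

    def add_setup(self, num_nodes: int, setup_cmd: Optional[str]) -> None:
        if setup_cmd:
            self._sections.append(
                f'# setup on {num_nodes} node(s): {setup_cmd}')

    def add_task(self, num_nodes: int, bash_script: str,
                 env_vars: Dict[str, str]) -> None:
        exports = ' '.join(f'{k}={v}' for k, v in env_vars.items())
        self._sections.append(
            f'# run on {num_nodes} node(s): {exports} {bash_script}')

    def add_epilogue(self) -> None:
        self._sections.append('# epilogue')

    def build(self) -> str:
        return '\n'.join(self._sections)


codegen = ToyCodeGen()
codegen.add_prologue(job_id=1)
codegen.add_setup(2, 'pip install -r requirements.txt')
codegen.add_task(2, 'python train.py', {'RANK': '0'})
codegen.add_epilogue()
print(codegen.build())
```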