skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/recovery_strategy.py
CHANGED
@@ -5,23 +5,31 @@ In the YAML file, the user can specify the strategy to use for managed jobs.
 resources:
     job_recovery: EAGER_NEXT_REGION
 """
-import
+import asyncio
+import logging
+import os
 import traceback
 import typing
-from typing import Optional
+from typing import Optional, Set
 
-import sky
 from sky import backends
+from sky import dag as dag_lib
 from sky import exceptions
-from sky import execution
 from sky import global_user_state
 from sky import sky_logging
+from sky import skypilot_config
 from sky.backends import backend_utils
+from sky.client import sdk
 from sky.jobs import scheduler
+from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
+from sky.serve import serve_utils
+from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
+from sky.utils import context_utils
+from sky.utils import env_options
 from sky.utils import registry
 from sky.utils import status_lib
 from sky.utils import ux_utils
@@ -39,7 +47,14 @@ MAX_JOB_CHECKING_RETRY = 10
 # Minutes to job cluster autodown. This should be significantly larger than
 # managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS, to avoid tearing down the
 # cluster before its status can be updated by the job controller.
-_AUTODOWN_MINUTES =
+_AUTODOWN_MINUTES = 10
+
+ENV_VARS_TO_CLEAR = [
+    skypilot_config.ENV_VAR_SKYPILOT_CONFIG,
+    constants.USER_ID_ENV_VAR,
+    constants.USER_ENV_VAR,
+    env_options.Options.SHOW_DEBUG_INFO.env_key,
+]
 
 
 class StrategyExecutor:
@@ -47,29 +62,65 @@ class StrategyExecutor:
 
     RETRY_INIT_GAP_SECONDS = 60
 
-    def __init__(
-
-
+    def __init__(
+        self,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        max_restarts_on_errors: int,
+        job_id: int,
+        task_id: int,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> None:
         """Initialize the strategy executor.
 
         Args:
             cluster_name: The name of the cluster.
             backend: The backend to use. Only CloudVMRayBackend is supported.
            task: The task to execute.
+            max_restarts_on_errors: Maximum number of restarts on errors.
+            job_id: The ID of the job.
+            task_id: The ID of the task.
+            starting: Set of job IDs that are currently starting.
+            starting_lock: Lock to synchronize starting jobs.
+            starting_signal: Condition to signal when a job can start.
         """
         assert isinstance(backend, backends.CloudVmRayBackend), (
             'Only CloudVMRayBackend is supported.')
-        self.dag =
+        self.dag = dag_lib.Dag()
         self.dag.add(task)
+        # For jobs submitted to a pool, the cluster name might change after each
+        # recovery. Initially this is set to an empty string to indicate that no
+        # cluster is assigned yet, and in `_launch`, it will be set to one of
+        # the cluster names in the pool.
         self.cluster_name = cluster_name
         self.backend = backend
         self.max_restarts_on_errors = max_restarts_on_errors
         self.job_id = job_id
+        self.task_id = task_id
+        self.pool = pool
         self.restart_cnt_on_failure = 0
+        self.job_id_on_pool_cluster: Optional[int] = None
+        self.starting = starting
+        self.starting_lock = starting_lock
+        self.starting_signal = starting_signal
 
     @classmethod
-    def make(
-
+    def make(
+        cls,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        job_id: int,
+        task_id: int,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
         resource_list = list(task.resources)
@@ -86,8 +137,11 @@ class StrategyExecutor:
             # original task.resources
             task.set_resources(type(task.resources)(new_resources_list))
         if isinstance(job_recovery, dict):
-
+            name = job_recovery.pop(
                 'strategy', registry.JOBS_RECOVERY_STRATEGY_REGISTRY.default)
+            assert name is None or isinstance(name, str), (
+                name, 'The job recovery strategy name must be a string or None')
+            job_recovery_name: Optional[str] = name
             max_restarts_on_errors = job_recovery.pop('max_restarts_on_errors',
                                                       0)
         else:
@@ -97,9 +151,11 @@ class StrategyExecutor:
                                  from_str(job_recovery_name))
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
-                                     max_restarts_on_errors, job_id
+                                     max_restarts_on_errors, job_id, task_id,
+                                     pool, starting, starting_lock,
+                                     starting_signal)
 
-    def launch(self) -> float:
+    async def launch(self) -> float:
         """Launch the cluster for the first time.
 
         It can fail if resource is not available. Need to check the cluster
@@ -111,11 +167,11 @@ class StrategyExecutor:
         Raises: Please refer to the docstring of self._launch().
         """
 
-        job_submit_at = self._launch(max_retry=None)
+        job_submit_at = await self._launch(max_retry=None)
         assert job_submit_at is not None
         return job_submit_at
 
-    def recover(self) -> float:
+    async def recover(self) -> float:
         """Relaunch the cluster after failure and wait until job starts.
 
         When recover() is called the cluster should be in STOPPED status (i.e.
@@ -125,12 +181,12 @@ class StrategyExecutor:
         """
         raise NotImplementedError
 
-    def
-
-
-        handle =
-            self.cluster_name)
-        if handle is None:
+    async def _try_cancel_jobs(self):
+        if self.cluster_name is None:
+            return
+        handle = await context_utils.to_thread(
+            global_user_state.get_handle_from_cluster_name, self.cluster_name)
+        if handle is None or self.pool is not None:
             return
         try:
             usage_lib.messages.usage.set_internal()
@@ -153,9 +209,26 @@ class StrategyExecutor:
             # should be functional with the `_try_cancel_if_cluster_is_init`
             # flag, i.e. it sends the cancel signal to the head node, which will
             # then kill the user process on remaining worker nodes.
-
-
-
+            # Only cancel the corresponding job for pool.
+            if self.pool is None:
+                request_id = await context_utils.to_thread(
+                    sdk.cancel,
+                    cluster_name=self.cluster_name,
+                    all=True,
+                    _try_cancel_if_cluster_is_init=True,
+                )
+            else:
+                request_id = await context_utils.to_thread(
+                    sdk.cancel,
+                    cluster_name=self.cluster_name,
+                    job_ids=[self.job_id_on_pool_cluster],
+                    _try_cancel_if_cluster_is_init=True,
+                )
+            logger.debug(f'sdk.cancel request ID: {request_id}')
+            await context_utils.to_thread(
+                sdk.get,
+                request_id,
+            )
         except Exception as e:  # pylint: disable=broad-except
             logger.info('Failed to cancel the job on the cluster. The cluster '
                         'might be already down or the head node is preempted.'
@@ -163,25 +236,26 @@ class StrategyExecutor:
                         f'{common_utils.format_exception(e)}\n'
                         'Terminating the cluster explicitly to ensure no '
                         'remaining job process interferes with recovery.')
-
+            await context_utils.to_thread(self._cleanup_cluster)
 
-    def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
+    async def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
         """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
 
         Returns:
             The timestamp of when the job is submitted, or None if failed to
             submit.
         """
+        assert self.cluster_name is not None
         status = None
         job_checking_retry_cnt = 0
         while job_checking_retry_cnt < MAX_JOB_CHECKING_RETRY:
             # Avoid the infinite loop, if any bug happens.
             job_checking_retry_cnt += 1
             try:
-                cluster_status, _ = (
-                    backend_utils.refresh_cluster_status_handle
-
-
+                cluster_status, _ = (await context_utils.to_thread(
+                    backend_utils.refresh_cluster_status_handle,
+                    self.cluster_name,
+                    force_refresh_statuses=set(status_lib.ClusterStatus)))
             except Exception as e:  # pylint: disable=broad-except
                 # If any unexpected error happens, retry the job checking
                 # loop.
@@ -201,8 +275,10 @@ class StrategyExecutor:
                 break
 
             try:
-                status = managed_job_utils.get_job_status(
-                    self.backend,
+                status = await managed_job_utils.get_job_status(
+                    self.backend,
+                    self.cluster_name,
+                    job_id=self.job_id_on_pool_cluster)
             except Exception as e:  # pylint: disable=broad-except
                 # If any unexpected error happens, retry the job checking
                 # loop.
@@ -217,8 +293,12 @@ class StrategyExecutor:
             # Check the job status until it is not in initialized status
             if status is not None and status > job_lib.JobStatus.INIT:
                 try:
-                    job_submitted_at =
-
+                    job_submitted_at = await context_utils.to_thread(
+                        managed_job_utils.get_job_timestamp,
+                        self.backend,
+                        self.cluster_name,
+                        self.job_id_on_pool_cluster,
+                        get_end_time=False)
                     return job_submitted_at
                 except Exception as e:  # pylint: disable=broad-except
                     # If we failed to get the job timestamp, we will retry
@@ -227,12 +307,20 @@ class StrategyExecutor:
                         'the job start timestamp. Retrying.')
                     continue
             # Wait for the job to be started
-
+            await asyncio.sleep(
+                managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
         return None
 
-    def
-
-
+    def _cleanup_cluster(self) -> None:
+        if self.cluster_name is None:
+            return
+        if self.pool is None:
+            managed_job_utils.terminate_cluster(self.cluster_name)
+
+    async def _launch(self,
+                      max_retry: Optional[int] = 3,
+                      raise_on_failure: bool = True,
+                      recovery: bool = False) -> Optional[float]:
         """Implementation of launch().
 
         The function will wait until the job starts running, but will leave the
@@ -272,98 +360,234 @@ class StrategyExecutor:
         backoff = common_utils.Backoff(self.RETRY_INIT_GAP_SECONDS)
         while True:
             retry_cnt += 1
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            try:
+                async with scheduler.scheduled_launch(
+                        self.job_id,
+                        self.starting,
+                        self.starting_lock,
+                        self.starting_signal,
+                ):
+                    # The job state may have been PENDING during backoff -
+                    # update to STARTING or RECOVERING.
+                    # On the first attempt (when retry_cnt is 1), we should
+                    # already be in STARTING or RECOVERING.
+                    if retry_cnt > 1:
+                        await state.set_restarting_async(
+                            self.job_id, self.task_id, recovery)
+                    try:
+                        usage_lib.messages.usage.set_internal()
+                        if self.pool is None:
+                            assert self.cluster_name is not None
+
+                            # sdk.launch will implicitly start the API server,
+                            # but then the API server will inherit the current
+                            # env vars/user, which we may not want.
+                            # Instead, clear env vars here and call api_start
+                            # explicitly.
+                            vars_to_restore = {}
+                            try:
+                                for env_var in ENV_VARS_TO_CLEAR:
+                                    vars_to_restore[env_var] = os.environ.pop(
+                                        env_var, None)
+                                    logger.debug('Cleared env var: '
+                                                 f'{env_var}')
+                                logger.debug('Env vars for api_start: '
+                                             f'{os.environ}')
+                                await context_utils.to_thread(sdk.api_start)
+                                logger.info('API server started.')
+                            finally:
+                                for env_var, value in vars_to_restore.items():
+                                    if value is not None:
+                                        logger.debug('Restored env var: '
+                                                     f'{env_var}: {value}')
+                                        os.environ[env_var] = value
+
+                            request_id = None
+                            try:
+                                request_id = await context_utils.to_thread(
+                                    sdk.launch,
+                                    self.dag,
+                                    cluster_name=self.cluster_name,
+                                    # We expect to tear down the cluster as soon
+                                    # as the job is finished. However, in case
+                                    # the controller dies, we may end up with a
+                                    # resource leak.
+                                    # Ideally, we should autodown to be safe,
+                                    # but it's fine to disable it for now, as
+                                    # Nebius doesn't support autodown yet.
+                                    # TODO(kevin): set down=True once Nebius
+                                    # supports autodown.
+                                    # idle_minutes_to_autostop=(
+                                    #     _AUTODOWN_MINUTES),
+                                    # down=True,
+                                    _is_launched_by_jobs_controller=True,
+                                )
+                                logger.debug('sdk.launch request ID: '
+                                             f'{request_id}')
+                                await context_utils.to_thread(
+                                    sdk.stream_and_get,
+                                    request_id,
+                                )
+                            except asyncio.CancelledError:
+                                if request_id:
+                                    req = await context_utils.to_thread(
+                                        sdk.api_cancel, request_id)
+                                    logger.debug('sdk.api_cancel request '
+                                                 f'ID: {req}')
+                                    try:
+                                        await context_utils.to_thread(
+                                            sdk.get, req)
+                                    except Exception as e:  # pylint: disable=broad-except
+                                        # we must still return a CancelledError
+                                        logger.error(
+                                            f'Failed to cancel the job: {e}')
+                                raise
+                            logger.info('Managed job cluster launched.')
+                        else:
+                            self.cluster_name = await (context_utils.to_thread(
+                                serve_utils.get_next_cluster_name, self.pool,
+                                self.job_id))
+                            if self.cluster_name is None:
+                                raise exceptions.NoClusterLaunchedError(
+                                    'No cluster name found in the pool.')
+                            request_id = None
+                            try:
+                                request_id = await context_utils.to_thread(
+                                    sdk.exec,
+                                    self.dag,
+                                    cluster_name=self.cluster_name,
+                                )
+                                logger.debug('sdk.exec request ID: '
+                                             f'{request_id}')
+                                job_id_on_pool_cluster, _ = (
+                                    await context_utils.to_thread(
+                                        sdk.get, request_id))
+                            except asyncio.CancelledError:
+                                if request_id:
+                                    req = await context_utils.to_thread(
+                                        sdk.api_cancel, request_id)
+                                    logger.debug('sdk.api_cancel request '
+                                                 f'ID: {req}')
+                                    try:
+                                        await context_utils.to_thread(
+                                            sdk.get, req)
+                                    except Exception as e:  # pylint: disable=broad-except
+                                        # we must still return a CancelledError
+                                        logger.error(
+                                            f'Failed to cancel the job: {e}')
+                                raise
+                            assert job_id_on_pool_cluster is not None, (
+                                self.cluster_name, self.job_id)
+                            self.job_id_on_pool_cluster = job_id_on_pool_cluster
+                            await state.set_job_id_on_pool_cluster_async(
+                                self.job_id, job_id_on_pool_cluster)
+                            logger.info('Managed job cluster launched.')
+                    except (exceptions.InvalidClusterNameError,
+                            exceptions.NoCloudAccessError,
+                            exceptions.ResourcesMismatchError,
+                            exceptions.StorageSpecError,
+                            exceptions.StorageError) as e:
+                        logger.error('Failure happened before provisioning. '
+                                     f'{common_utils.format_exception(e)}')
                         if raise_on_failure:
-                    raise exceptions.ProvisionPrechecksError(
-
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-            except Exception as e:  # pylint: disable=broad-except
-                # If the launch fails, it will be recovered by the following
-                # code.
-                logger.info('Failed to launch a cluster with error: '
-                            f'{common_utils.format_exception(e)})')
-                with ux_utils.enable_traceback():
-                    logger.info(f' Traceback: {traceback.format_exc()}')
-            else:  # No exception, the launch succeeds.
-                # At this point, a sky.launch() has succeeded. Cluster may
-                # be UP (no preemption since) or DOWN (newly preempted).
-                job_submitted_at = self._wait_until_job_starts_on_cluster()
-                if job_submitted_at is not None:
-                    return job_submitted_at
-                # The job fails to start on the cluster, retry the launch.
-                # TODO(zhwu): log the unexpected error to usage collection
-                # for future debugging.
-                logger.info(
-                    'Failed to successfully submit the job to the '
-                    'launched cluster, due to unexpected submission errors '
-                    'or the cluster being preempted during job submission.')
-
-            # If we get here, the launch did not succeed. Tear down the
-            # cluster and retry.
-            managed_job_utils.terminate_cluster(self.cluster_name)
-            if max_retry is not None and retry_cnt >= max_retry:
-                # Retry forever if max_retry is None.
-                if raise_on_failure:
-                    with ux_utils.print_exception_no_traceback():
-                        raise exceptions.ManagedJobReachedMaxRetriesError(
-                            'Resources unavailable: failed to launch '
-                            f'clusters after {max_retry} retries.')
-                else:
+                            raise exceptions.ProvisionPrechecksError(
+                                reasons=[e])
                         return None
-
-
-
-
-
-
+                    except exceptions.ResourcesUnavailableError as e:
+                        # This is raised when the launch fails due to prechecks
+                        # or after failing over through all the candidates.
+                        # Please refer to the docstring of `sky.launch` for more
+                        # details of how the exception will be structured.
+                        if not any(
+                                isinstance(err,
+                                           exceptions.ResourcesUnavailableError)
+                                for err in e.failover_history):
+                            # _launch() (this function) should fail/exit
+                            # directly, if none of the failover reasons were
+                            # because of resource unavailability or no failover
+                            # was attempted (the optimizer cannot find feasible
+                            # resources for requested resources), i.e.,
+                            # e.failover_history is empty. Failing directly
+                            # avoids the infinite loop of retrying the launch
+                            # when, e.g., an invalid cluster name is used and
+                            # --retry-until-up is specified.
+                            reasons = (e.failover_history
+                                       if e.failover_history else [e])
+                            reasons_str = '; '.join(
+                                common_utils.format_exception(err)
+                                for err in reasons)
+                            logger.error(
+                                'Failure happened before provisioning. '
+                                f'Failover reasons: {reasons_str}')
+                            if raise_on_failure:
+                                raise exceptions.ProvisionPrechecksError(
+                                    reasons)
+                            return None
+                        logger.info('Failed to launch a cluster with error: '
+                                    f'{common_utils.format_exception(e)})')
+                    except Exception as e:  # pylint: disable=broad-except
+                        # If the launch fails, it will be recovered by the
+                        # following code.
+                        logger.info('Failed to launch a cluster with error: '
+                                    f'{common_utils.format_exception(e)})')
+                        with ux_utils.enable_traceback():
+                            logger.info(
+                                f' Traceback: {traceback.format_exc()}')
+                    else:  # No exception, the launch succeeds.
+                        # At this point, a sky.launch() has succeeded. Cluster
+                        # may be UP (no preemption since) or DOWN (newly
+                        # preempted).
+                        job_submitted_at = await (
+                            self._wait_until_job_starts_on_cluster())
+                        if job_submitted_at is not None:
+                            return job_submitted_at
+                        # The job fails to start on the cluster, retry the
+                        # launch.
+                        # TODO(zhwu): log the unexpected error to usage
+                        # collection for future debugging.
+                        logger.info(
+                            'Failed to successfully submit the job to the '
+                            'launched cluster, due to unexpected submission '
+                            'errors or the cluster being preempted during '
+                            'job submission.')
+
+                    # If we get here, the launch did not succeed. Tear down the
+                    # cluster and retry.
+                    await context_utils.to_thread(self._cleanup_cluster)
+                    if max_retry is not None and retry_cnt >= max_retry:
+                        # Retry forever if max_retry is None.
+                        if raise_on_failure:
+                            with ux_utils.print_exception_no_traceback():
+                                raise (
+                                    exceptions.ManagedJobReachedMaxRetriesError(
+                                        'Resources unavailable: failed to '
+                                        f'launch clusters after {max_retry} '
+                                        'retries.'))
+                        else:
+                            return None
+
+                    # Raise NoClusterLaunchedError to indicate that the job is
+                    # in retry backoff. This will trigger special handling in
+                    # scheduler.schedule_launched().
+                    # We will exit the scheduled_launch context so that the
+                    # schedule state is ALIVE_BACKOFF during the backoff. This
+                    # allows other jobs to launch.
+                    raise exceptions.NoClusterLaunchedError()
+
+            except exceptions.NoClusterLaunchedError:
+                # Update the status to PENDING during backoff.
+                await state.set_backoff_pending_async(self.job_id, self.task_id)
+                # Calculate the backoff time and sleep.
+                gap_seconds = (backoff.current_backoff()
+                               if self.pool is None else 1)
+                logger.info('Retrying to launch the cluster in '
+                            f'{gap_seconds:.1f} seconds.')
+                await asyncio.sleep(gap_seconds)
+                continue
+            else:
+                # The inner loop should either return or throw
+                # NoClusterLaunchedError.
+                assert False, 'Unreachable'
 
     def should_restart_on_failure(self) -> bool:
         """Increments counter & checks if job should be restarted on a failure.
@@ -384,24 +608,38 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
     _MAX_RETRY_CNT = 240  # Retry for 4 hours.
 
-    def __init__(
-
-
+    def __init__(
+        self,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        max_restarts_on_errors: int,
+        job_id: int,
+        task_id: int,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id
+                         job_id, task_id, pool, starting, starting_lock,
+                         starting_signal)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
        # rely on cluster handle, as it can be None if the cluster is
         # preempted.)
         self._launched_resources: Optional['resources.Resources'] = None
 
-    def _launch(self,
-
-
-
-
+    async def _launch(self,
+                      max_retry: Optional[int] = 3,
+                      raise_on_failure: bool = True,
+                      recovery: bool = False) -> Optional[float]:
+        job_submitted_at = await super()._launch(max_retry, raise_on_failure,
+                                                 recovery)
+        if job_submitted_at is not None and self.cluster_name is not None:
            # Only record the cloud/region if the launch is successful.
-            handle =
+            handle = await context_utils.to_thread(
+                global_user_state.get_handle_from_cluster_name,
                 self.cluster_name)
             assert isinstance(handle, backends.CloudVmRayResourceHandle), (
                 'Cluster should be launched.', handle)
@@ -411,7 +649,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
             self._launched_resources = None
         return job_submitted_at
 
-    def recover(self) -> float:
+    async def recover(self) -> float:
         # 1. Cancel the jobs and launch the cluster with the STOPPED status,
         #    so that it will try on the current region first until timeout.
         # 2. Tear down the cluster, if the step 1 failed to launch the cluster.
@@ -419,7 +657,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
         #    original user specification.
 
         # Step 1
-        self.
+        await self._try_cancel_jobs()
 
         while True:
             # Add region constraint to the task, to retry on the same region
@@ -433,7 +671,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
                     cloud=launched_cloud, region=launched_region, zone=None)
                 task.set_resources({new_resources})
                 # Not using self.launch to avoid the retry until up logic.
-                job_submitted_at = self._launch(raise_on_failure=False
+                job_submitted_at = await self._launch(raise_on_failure=False,
+                                                      recovery=True)
                 # Restore the original dag, i.e. reset the region constraint.
                 task.set_resources(original_resources)
                 if job_submitted_at is not None:
@@ -442,20 +681,21 @@ class FailoverStrategyExecutor(StrategyExecutor):
             # Step 2
             logger.debug('Terminating unhealthy cluster and reset cloud '
                          'region.')
-
+            await context_utils.to_thread(self._cleanup_cluster)
 
             # Step 3
             logger.debug('Relaunch the cluster without constraining to prior '
                          'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-
+            job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
+                                                  raise_on_failure=False,
+                                                  recovery=True)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
                 logger.info('Retrying to recover the cluster in '
                             f'{gap_seconds:.1f} seconds.')
-
+                await asyncio.sleep(gap_seconds)
                 continue
 
             return job_submitted_at
@@ -487,7 +727,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
            -> R1Z1 (success)
     """
 
-    def recover(self) -> float:
+    async def recover(self) -> float:
         # 1. Terminate the current cluster
         # 2. Launch again by explicitly blocking the previously launched region
         #    (this will failover through the entire search space except the
@@ -500,7 +740,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
 
         # Step 1
         logger.debug('Terminating unhealthy cluster and reset cloud region.')
-
+        await context_utils.to_thread(self._cleanup_cluster)
 
         # Step 2
         logger.debug('Relaunch the cluster skipping the previously launched '
@@ -521,7 +761,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
                                        region=launched_region)
         }
         # Not using self.launch to avoid the retry until up logic.
-        job_submitted_at = self._launch(raise_on_failure=False
+        job_submitted_at = await self._launch(raise_on_failure=False,
+                                              recovery=True)
         task.blocked_resources = None
         if job_submitted_at is not None:
             return job_submitted_at
@@ -531,14 +772,23 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
             logger.debug('Relaunch the cluster without constraining to prior '
                          'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-
+            job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
+                                                  raise_on_failure=False,
+                                                  recovery=True)
            if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
                 logger.info('Retrying to recover the cluster in '
                             f'{gap_seconds:.1f} seconds.')
-
+                await asyncio.sleep(gap_seconds)
                 continue
 
             return job_submitted_at
+
+
+def _get_logger_file(file_logger: logging.Logger) -> Optional[str]:
+    """Gets the file path that the logger writes to."""
+    for handler in file_logger.handlers:
+        if isinstance(handler, logging.FileHandler):
+            return handler.baseFilename
+    return None