skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/jobs/recovery_strategy.py
CHANGED
@@ -5,25 +5,31 @@ In the YAML file, the user can specify the strategy to use for managed jobs.
     resources:
         job_recovery: EAGER_NEXT_REGION
 """
-import
+import asyncio
+import logging
+import os
 import traceback
 import typing
-from typing import Optional
+from typing import Optional, Set
 
 from sky import backends
 from sky import dag as dag_lib
 from sky import exceptions
-from sky import execution
 from sky import global_user_state
 from sky import sky_logging
+from sky import skypilot_config
 from sky.backends import backend_utils
+from sky.client import sdk
 from sky.jobs import scheduler
 from sky.jobs import state
 from sky.jobs import utils as managed_job_utils
 from sky.serve import serve_utils
+from sky.skylet import constants
 from sky.skylet import job_lib
 from sky.usage import usage_lib
 from sky.utils import common_utils
+from sky.utils import context_utils
+from sky.utils import env_options
 from sky.utils import registry
 from sky.utils import status_lib
 from sky.utils import ux_utils
@@ -41,7 +47,14 @@ MAX_JOB_CHECKING_RETRY = 10
 # Minutes to job cluster autodown. This should be significantly larger than
 # managed_job_utils.JOB_STATUS_CHECK_GAP_SECONDS, to avoid tearing down the
 # cluster before its status can be updated by the job controller.
-_AUTODOWN_MINUTES =
+_AUTODOWN_MINUTES = 10
+
+ENV_VARS_TO_CLEAR = [
+    skypilot_config.ENV_VAR_SKYPILOT_CONFIG,
+    constants.USER_ID_ENV_VAR,
+    constants.USER_ENV_VAR,
+    env_options.Options.SHOW_DEBUG_INFO.env_key,
+]
 
 
 class StrategyExecutor:
@@ -49,15 +62,31 @@ class StrategyExecutor:
 
     RETRY_INIT_GAP_SECONDS = 60
 
-    def __init__(
-
-
+    def __init__(
+        self,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        max_restarts_on_errors: int,
+        job_id: int,
+        task_id: int,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> None:
         """Initialize the strategy executor.
 
         Args:
            cluster_name: The name of the cluster.
            backend: The backend to use. Only CloudVMRayBackend is supported.
            task: The task to execute.
+           max_restarts_on_errors: Maximum number of restarts on errors.
+           job_id: The ID of the job.
+           task_id: The ID of the task.
+           starting: Set of job IDs that are currently starting.
+           starting_lock: Lock to synchronize starting jobs.
+           starting_signal: Condition to signal when a job can start.
         """
        assert isinstance(backend, backends.CloudVmRayBackend), (
            'Only CloudVMRayBackend is supported.')
@@ -75,11 +104,23 @@ class StrategyExecutor:
         self.pool = pool
         self.restart_cnt_on_failure = 0
         self.job_id_on_pool_cluster: Optional[int] = None
+        self.starting = starting
+        self.starting_lock = starting_lock
+        self.starting_signal = starting_signal
 
     @classmethod
-    def make(
-
-
+    def make(
+        cls,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        job_id: int,
+        task_id: int,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> 'StrategyExecutor':
         """Create a strategy from a task."""
 
         resource_list = list(task.resources)
@@ -111,9 +152,10 @@ class StrategyExecutor:
         assert job_recovery_strategy is not None, job_recovery_name
         return job_recovery_strategy(cluster_name, backend, task,
                                      max_restarts_on_errors, job_id, task_id,
-                                     pool
+                                     pool, starting, starting_lock,
+                                     starting_signal)
 
-    def launch(self) -> float:
+    async def launch(self) -> float:
         """Launch the cluster for the first time.
 
         It can fail if resource is not available. Need to check the cluster
@@ -125,11 +167,11 @@ class StrategyExecutor:
         Raises: Please refer to the docstring of self._launch().
         """
 
-        job_submit_at = self._launch(max_retry=None)
+        job_submit_at = await self._launch(max_retry=None)
         assert job_submit_at is not None
         return job_submit_at
 
-    def recover(self) -> float:
+    async def recover(self) -> float:
         """Relaunch the cluster after failure and wait until job starts.
 
         When recover() is called the cluster should be in STOPPED status (i.e.
@@ -139,13 +181,11 @@ class StrategyExecutor:
         """
         raise NotImplementedError
 
-    def _try_cancel_jobs(self):
-        from sky import core  # pylint: disable=import-outside-toplevel
-
+    async def _try_cancel_jobs(self):
         if self.cluster_name is None:
             return
-        handle =
-            self.cluster_name)
+        handle = await context_utils.to_thread(
+            global_user_state.get_handle_from_cluster_name, self.cluster_name)
         if handle is None or self.pool is not None:
             return
         try:
@@ -169,14 +209,26 @@ class StrategyExecutor:
             # should be functional with the `_try_cancel_if_cluster_is_init`
             # flag, i.e. it sends the cancel signal to the head node, which will
             # then kill the user process on remaining worker nodes.
-            # Only cancel the corresponding job for
+            # Only cancel the corresponding job for pool.
            if self.pool is None:
-
+                request_id = await context_utils.to_thread(
+                    sdk.cancel,
+                    cluster_name=self.cluster_name,
+                    all=True,
+                    _try_cancel_if_cluster_is_init=True,
+                )
            else:
-
-
-
-
+                request_id = await context_utils.to_thread(
+                    sdk.cancel,
+                    cluster_name=self.cluster_name,
+                    job_ids=[self.job_id_on_pool_cluster],
+                    _try_cancel_if_cluster_is_init=True,
+                )
+            logger.debug(f'sdk.cancel request ID: {request_id}')
+            await context_utils.to_thread(
+                sdk.get,
+                request_id,
+            )
         except Exception as e:  # pylint: disable=broad-except
             logger.info('Failed to cancel the job on the cluster. The cluster '
                         'might be already down or the head node is preempted.'
@@ -184,9 +236,9 @@ class StrategyExecutor:
                         f'{common_utils.format_exception(e)}\n'
                         'Terminating the cluster explicitly to ensure no '
                         'remaining job process interferes with recovery.')
-            self._cleanup_cluster
+            await context_utils.to_thread(self._cleanup_cluster)
 
-    def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
+    async def _wait_until_job_starts_on_cluster(self) -> Optional[float]:
         """Wait for MAX_JOB_CHECKING_RETRY times until job starts on the cluster
 
         Returns:
@@ -200,10 +252,10 @@ class StrategyExecutor:
             # Avoid the infinite loop, if any bug happens.
             job_checking_retry_cnt += 1
             try:
-                cluster_status, _ = (
-                    backend_utils.refresh_cluster_status_handle
-
-
+                cluster_status, _ = (await context_utils.to_thread(
+                    backend_utils.refresh_cluster_status_handle,
+                    self.cluster_name,
+                    force_refresh_statuses=set(status_lib.ClusterStatus)))
             except Exception as e:  # pylint: disable=broad-except
                 # If any unexpected error happens, retry the job checking
                 # loop.
@@ -223,7 +275,7 @@ class StrategyExecutor:
                 break
 
             try:
-                status = managed_job_utils.get_job_status(
+                status = await managed_job_utils.get_job_status(
                     self.backend,
                     self.cluster_name,
                     job_id=self.job_id_on_pool_cluster)
@@ -241,7 +293,8 @@ class StrategyExecutor:
             # Check the job status until it is not in initialized status
             if status is not None and status > job_lib.JobStatus.INIT:
                 try:
-                    job_submitted_at =
+                    job_submitted_at = await context_utils.to_thread(
+                        managed_job_utils.get_job_timestamp,
                         self.backend,
                         self.cluster_name,
                         self.job_id_on_pool_cluster,
@@ -254,7 +307,8 @@ class StrategyExecutor:
                        'the job start timestamp. Retrying.')
                    continue
             # Wait for the job to be started
-
+            await asyncio.sleep(
+                managed_job_utils.JOB_STARTED_STATUS_CHECK_GAP_SECONDS)
         return None
 
     def _cleanup_cluster(self) -> None:
@@ -263,10 +317,10 @@ class StrategyExecutor:
         if self.pool is None:
             managed_job_utils.terminate_cluster(self.cluster_name)
 
-    def _launch(self,
-
-
-
+    async def _launch(self,
+                      max_retry: Optional[int] = 3,
+                      raise_on_failure: bool = True,
+                      recovery: bool = False) -> Optional[float]:
         """Implementation of launch().
 
         The function will wait until the job starts running, but will leave the
@@ -307,54 +361,132 @@ class StrategyExecutor:
         while True:
             retry_cnt += 1
             try:
-                with scheduler.scheduled_launch(
+                async with scheduler.scheduled_launch(
+                        self.job_id,
+                        self.starting,
+                        self.starting_lock,
+                        self.starting_signal,
+                ):
                     # The job state may have been PENDING during backoff -
                     # update to STARTING or RECOVERING.
                     # On the first attempt (when retry_cnt is 1), we should
                     # already be in STARTING or RECOVERING.
                     if retry_cnt > 1:
-                        state.
-
+                        await state.set_restarting_async(
+                            self.job_id, self.task_id, recovery)
                     try:
                         usage_lib.messages.usage.set_internal()
                         if self.pool is None:
                             assert self.cluster_name is not None
-
-                            #
-                            #
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+                            # sdk.launch will implicitly start the API server,
+                            # but then the API server will inherit the current
+                            # env vars/user, which we may not want.
+                            # Instead, clear env vars here and call api_start
+                            # explicitly.
+                            vars_to_restore = {}
+                            try:
+                                for env_var in ENV_VARS_TO_CLEAR:
+                                    vars_to_restore[env_var] = os.environ.pop(
+                                        env_var, None)
+                                    logger.debug('Cleared env var: '
+                                                 f'{env_var}')
+                                logger.debug('Env vars for api_start: '
+                                             f'{os.environ}')
+                                await context_utils.to_thread(sdk.api_start)
+                                logger.info('API server started.')
+                            finally:
+                                for env_var, value in vars_to_restore.items():
+                                    if value is not None:
+                                        logger.debug('Restored env var: '
+                                                     f'{env_var}: {value}')
+                                        os.environ[env_var] = value
+
+                            request_id = None
+                            try:
+                                request_id = await context_utils.to_thread(
+                                    sdk.launch,
+                                    self.dag,
+                                    cluster_name=self.cluster_name,
+                                    # We expect to tear down the cluster as soon
+                                    # as the job is finished. However, in case
+                                    # the controller dies, we may end up with a
+                                    # resource leak.
+                                    # Ideally, we should autodown to be safe,
+                                    # but it's fine to disable it for now, as
+                                    # Nebius doesn't support autodown yet.
+                                    # TODO(kevin): set down=True once Nebius
+                                    # supports autodown.
+                                    # idle_minutes_to_autostop=(
+                                    #     _AUTODOWN_MINUTES),
+                                    # down=True,
+                                    _is_launched_by_jobs_controller=True,
+                                )
+                                logger.debug('sdk.launch request ID: '
+                                             f'{request_id}')
+                                await context_utils.to_thread(
+                                    sdk.stream_and_get,
+                                    request_id,
+                                )
+                            except asyncio.CancelledError:
+                                if request_id:
+                                    req = await context_utils.to_thread(
+                                        sdk.api_cancel, request_id)
+                                    logger.debug('sdk.api_cancel request '
+                                                 f'ID: {req}')
+                                    try:
+                                        await context_utils.to_thread(
+                                            sdk.get, req)
+                                    except Exception as e:  # pylint: disable=broad-except
+                                        # we must still return a CancelledError
+                                        logger.error(
+                                            f'Failed to cancel the job: {e}')
+                                raise
+                            logger.info('Managed job cluster launched.')
                         else:
-                            self.cluster_name = (
-                                serve_utils.get_next_cluster_name
-
+                            self.cluster_name = await (context_utils.to_thread(
+                                serve_utils.get_next_cluster_name, self.pool,
+                                self.job_id))
                             if self.cluster_name is None:
                                 raise exceptions.NoClusterLaunchedError(
                                     'No cluster name found in the pool.')
-
-
+                            request_id = None
+                            try:
+                                request_id = await context_utils.to_thread(
+                                    sdk.exec,
+                                    self.dag,
+                                    cluster_name=self.cluster_name,
+                                )
+                                logger.debug('sdk.exec request ID: '
+                                             f'{request_id}')
+                                job_id_on_pool_cluster, _ = (
+                                    await context_utils.to_thread(
+                                        sdk.get, request_id))
+                            except asyncio.CancelledError:
+                                if request_id:
+                                    req = await context_utils.to_thread(
+                                        sdk.api_cancel, request_id)
+                                    logger.debug('sdk.api_cancel request '
+                                                 f'ID: {req}')
+                                    try:
+                                        await context_utils.to_thread(
+                                            sdk.get, req)
+                                    except Exception as e:  # pylint: disable=broad-except
+                                        # we must still return a CancelledError
+                                        logger.error(
+                                            f'Failed to cancel the job: {e}')
+                                raise
                             assert job_id_on_pool_cluster is not None, (
                                 self.cluster_name, self.job_id)
                             self.job_id_on_pool_cluster = job_id_on_pool_cluster
-                            state.
+                            await state.set_job_id_on_pool_cluster_async(
                                 self.job_id, job_id_on_pool_cluster)
                             logger.info('Managed job cluster launched.')
                     except (exceptions.InvalidClusterNameError,
                             exceptions.NoCloudAccessError,
-                            exceptions.ResourcesMismatchError
+                            exceptions.ResourcesMismatchError,
+                            exceptions.StorageSpecError,
+                            exceptions.StorageError) as e:
                         logger.error('Failure happened before provisioning. '
                                      f'{common_utils.format_exception(e)}')
                         if raise_on_failure:
@@ -405,7 +537,7 @@ class StrategyExecutor:
                 # At this point, a sky.launch() has succeeded. Cluster
                 # may be UP (no preemption since) or DOWN (newly
                 # preempted).
-                job_submitted_at = (
+                job_submitted_at = await (
                     self._wait_until_job_starts_on_cluster())
                 if job_submitted_at is not None:
                     return job_submitted_at
@@ -421,7 +553,7 @@ class StrategyExecutor:
 
                 # If we get here, the launch did not succeed. Tear down the
                 # cluster and retry.
-                self._cleanup_cluster
+                await context_utils.to_thread(self._cleanup_cluster)
                 if max_retry is not None and retry_cnt >= max_retry:
                     # Retry forever if max_retry is None.
                     if raise_on_failure:
@@ -444,15 +576,13 @@ class StrategyExecutor:
 
             except exceptions.NoClusterLaunchedError:
                 # Update the status to PENDING during backoff.
-                state.
+                await state.set_backoff_pending_async(self.job_id, self.task_id)
                 # Calculate the backoff time and sleep.
-                # We retry immediately for worker pool, since no sky.launch()
-                # is called and the overhead is minimal.
                 gap_seconds = (backoff.current_backoff()
                                if self.pool is None else 1)
                 logger.info('Retrying to launch the cluster in '
                             f'{gap_seconds:.1f} seconds.')
-
+                await asyncio.sleep(gap_seconds)
                 continue
             else:
                 # The inner loop should either return or throw
@@ -478,26 +608,38 @@ class FailoverStrategyExecutor(StrategyExecutor):
 
     _MAX_RETRY_CNT = 240  # Retry for 4 hours.
 
-    def __init__(
-
-
+    def __init__(
+        self,
+        cluster_name: Optional[str],
+        backend: 'backends.Backend',
+        task: 'task_lib.Task',
+        max_restarts_on_errors: int,
+        job_id: int,
+        task_id: int,
+        pool: Optional[str],
+        starting: Set[int],
+        starting_lock: asyncio.Lock,
+        starting_signal: asyncio.Condition,
+    ) -> None:
         super().__init__(cluster_name, backend, task, max_restarts_on_errors,
-                         job_id, task_id, pool
+                         job_id, task_id, pool, starting, starting_lock,
+                         starting_signal)
         # Note down the cloud/region of the launched cluster, so that we can
         # first retry in the same cloud/region. (Inside recover() we may not
         # rely on cluster handle, as it can be None if the cluster is
         # preempted.)
         self._launched_resources: Optional['resources.Resources'] = None
 
-    def _launch(self,
-
-
-
-        job_submitted_at = super()._launch(max_retry, raise_on_failure,
-
+    async def _launch(self,
+                      max_retry: Optional[int] = 3,
+                      raise_on_failure: bool = True,
+                      recovery: bool = False) -> Optional[float]:
+        job_submitted_at = await super()._launch(max_retry, raise_on_failure,
+                                                 recovery)
         if job_submitted_at is not None and self.cluster_name is not None:
             # Only record the cloud/region if the launch is successful.
-            handle =
+            handle = await context_utils.to_thread(
+                global_user_state.get_handle_from_cluster_name,
                 self.cluster_name)
             assert isinstance(handle, backends.CloudVmRayResourceHandle), (
                 'Cluster should be launched.', handle)
@@ -507,7 +649,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
             self._launched_resources = None
         return job_submitted_at
 
-    def recover(self) -> float:
+    async def recover(self) -> float:
         # 1. Cancel the jobs and launch the cluster with the STOPPED status,
         # so that it will try on the current region first until timeout.
         # 2. Tear down the cluster, if the step 1 failed to launch the cluster.
@@ -515,7 +657,7 @@ class FailoverStrategyExecutor(StrategyExecutor):
         # original user specification.
 
         # Step 1
-        self._try_cancel_jobs()
+        await self._try_cancel_jobs()
 
         while True:
             # Add region constraint to the task, to retry on the same region
@@ -529,8 +671,8 @@ class FailoverStrategyExecutor(StrategyExecutor):
                     cloud=launched_cloud, region=launched_region, zone=None)
                 task.set_resources({new_resources})
                 # Not using self.launch to avoid the retry until up logic.
-                job_submitted_at = self._launch(raise_on_failure=False,
-
+                job_submitted_at = await self._launch(raise_on_failure=False,
+                                                      recovery=True)
                 # Restore the original dag, i.e. reset the region constraint.
                 task.set_resources(original_resources)
                 if job_submitted_at is not None:
@@ -539,21 +681,21 @@ class FailoverStrategyExecutor(StrategyExecutor):
             # Step 2
             logger.debug('Terminating unhealthy cluster and reset cloud '
                          'region.')
-            self._cleanup_cluster
+            await context_utils.to_thread(self._cleanup_cluster)
 
             # Step 3
             logger.debug('Relaunch the cluster without constraining to prior '
                          'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-
-
+            job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
+                                                  raise_on_failure=False,
+                                                  recovery=True)
             if job_submitted_at is None:
                 # Failed to launch the cluster.
                 gap_seconds = self.RETRY_INIT_GAP_SECONDS
                 logger.info('Retrying to recover the cluster in '
                             f'{gap_seconds:.1f} seconds.')
-
+                await asyncio.sleep(gap_seconds)
                 continue
 
             return job_submitted_at
@@ -585,7 +727,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
              -> R1Z1 (success)
     """
 
-    def recover(self) -> float:
+    async def recover(self) -> float:
         # 1. Terminate the current cluster
         # 2. Launch again by explicitly blocking the previously launched region
         # (this will failover through the entire search space except the
@@ -598,7 +740,7 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
 
         # Step 1
         logger.debug('Terminating unhealthy cluster and reset cloud region.')
-        self._cleanup_cluster
+        await context_utils.to_thread(self._cleanup_cluster)
 
         # Step 2
         logger.debug('Relaunch the cluster skipping the previously launched '
@@ -619,8 +761,8 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
                     region=launched_region)
             }
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(raise_on_failure=False,
-
+            job_submitted_at = await self._launch(raise_on_failure=False,
+                                                  recovery=True)
             task.blocked_resources = None
             if job_submitted_at is not None:
                 return job_submitted_at
@@ -630,15 +772,23 @@ class EagerFailoverStrategyExecutor(FailoverStrategyExecutor):
             logger.debug('Relaunch the cluster without constraining to prior '
                          'cloud/region.')
             # Not using self.launch to avoid the retry until up logic.
-            job_submitted_at = self._launch(max_retry=self._MAX_RETRY_CNT,
-
-
+            job_submitted_at = await self._launch(max_retry=self._MAX_RETRY_CNT,
+                                                  raise_on_failure=False,
+                                                  recovery=True)
            if job_submitted_at is None:
                # Failed to launch the cluster.
                gap_seconds = self.RETRY_INIT_GAP_SECONDS
                logger.info('Retrying to recover the cluster in '
                            f'{gap_seconds:.1f} seconds.')
-
+                await asyncio.sleep(gap_seconds)
                continue
 
            return job_submitted_at
+
+
+def _get_logger_file(file_logger: logging.Logger) -> Optional[str]:
+    """Gets the file path that the logger writes to."""
+    for handler in file_logger.handlers:
+        if isinstance(handler, logging.FileHandler):
+            return handler.baseFilename
+    return None