skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/execution.py
CHANGED
|
@@ -3,8 +3,9 @@
|
|
|
3
3
|
See `Stage` for a Task's life cycle.
|
|
4
4
|
"""
|
|
5
5
|
import enum
|
|
6
|
+
import logging
|
|
6
7
|
import typing
|
|
7
|
-
from typing import List, Optional, Tuple, Union
|
|
8
|
+
from typing import Callable, List, Optional, Tuple, Union
|
|
8
9
|
|
|
9
10
|
import colorama
|
|
10
11
|
|
|
@@ -14,7 +15,9 @@ from sky import clouds
|
|
|
14
15
|
from sky import global_user_state
|
|
15
16
|
from sky import optimizer
|
|
16
17
|
from sky import sky_logging
|
|
18
|
+
from sky import task as task_lib
|
|
17
19
|
from sky.backends import backend_utils
|
|
20
|
+
from sky.server.requests import request_names
|
|
18
21
|
from sky.skylet import autostop_lib
|
|
19
22
|
from sky.usage import usage_lib
|
|
20
23
|
from sky.utils import admin_policy_utils
|
|
@@ -30,6 +33,7 @@ from sky.utils import ux_utils
|
|
|
30
33
|
|
|
31
34
|
if typing.TYPE_CHECKING:
|
|
32
35
|
import sky
|
|
36
|
+
from sky import resources as resources_lib
|
|
33
37
|
|
|
34
38
|
logger = sky_logging.init_logger(__name__)
|
|
35
39
|
|
|
@@ -110,16 +114,18 @@ def _execute(
|
|
|
110
114
|
stages: Optional[List[Stage]] = None,
|
|
111
115
|
cluster_name: Optional[str] = None,
|
|
112
116
|
detach_setup: bool = False,
|
|
113
|
-
detach_run: bool = False,
|
|
114
117
|
idle_minutes_to_autostop: Optional[int] = None,
|
|
115
118
|
no_setup: bool = False,
|
|
116
119
|
clone_disk_from: Optional[str] = None,
|
|
117
120
|
skip_unnecessary_provisioning: bool = False,
|
|
121
|
+
*, #keyword only separator
|
|
118
122
|
# Internal only:
|
|
119
123
|
# pylint: disable=invalid-name
|
|
124
|
+
_request_name: request_names.AdminPolicyRequestName,
|
|
120
125
|
_quiet_optimizer: bool = False,
|
|
121
126
|
_is_launched_by_jobs_controller: bool = False,
|
|
122
127
|
_is_launched_by_sky_serve_controller: bool = False,
|
|
128
|
+
job_logger: logging.Logger = logger,
|
|
123
129
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
124
130
|
"""Execute an entrypoint.
|
|
125
131
|
|
|
@@ -154,8 +160,6 @@ def _execute(
|
|
|
154
160
|
job itself. You can safely ctrl-c to detach from logging, and it will
|
|
155
161
|
not interrupt the setup process. To see the logs again after detaching,
|
|
156
162
|
use `sky logs`. To cancel setup, cancel the job via `sky cancel`.
|
|
157
|
-
detach_run: If True, as soon as a job is submitted, return from this
|
|
158
|
-
function and do not stream execution logs.
|
|
159
163
|
idle_minutes_to_autostop: int; if provided, the cluster will be set to
|
|
160
164
|
autostop after this many minutes of idleness.
|
|
161
165
|
no_setup: bool; whether to skip setup commands or not when (re-)launching.
|
|
@@ -172,6 +176,13 @@ def _execute(
|
|
|
172
176
|
handle: Optional[backends.ResourceHandle]; the handle to the cluster. None
|
|
173
177
|
if dryrun.
|
|
174
178
|
"""
|
|
179
|
+
if _request_name == request_names.AdminPolicyRequestName.CLUSTER_LAUNCH:
|
|
180
|
+
if _is_launched_by_jobs_controller:
|
|
181
|
+
_request_name = (
|
|
182
|
+
request_names.AdminPolicyRequestName.JOBS_LAUNCH_CLUSTER)
|
|
183
|
+
elif _is_launched_by_sky_serve_controller:
|
|
184
|
+
_request_name = (
|
|
185
|
+
request_names.AdminPolicyRequestName.SERVE_LAUNCH_REPLICA)
|
|
175
186
|
dag = dag_utils.convert_entrypoint_to_dag(entrypoint)
|
|
176
187
|
for task in dag.tasks:
|
|
177
188
|
for resource in task.resources:
|
|
@@ -187,6 +198,7 @@ def _execute(
|
|
|
187
198
|
idle_minutes_to_autostop = resource.autostop_config.idle_minutes
|
|
188
199
|
with admin_policy_utils.apply_and_use_config_in_current_request(
|
|
189
200
|
dag,
|
|
201
|
+
request_name=_request_name,
|
|
190
202
|
request_options=admin_policy.RequestOptions(
|
|
191
203
|
cluster_name=cluster_name,
|
|
192
204
|
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
|
@@ -214,14 +226,14 @@ def _execute(
|
|
|
214
226
|
stages=stages,
|
|
215
227
|
cluster_name=cluster_name,
|
|
216
228
|
detach_setup=detach_setup,
|
|
217
|
-
detach_run=detach_run,
|
|
218
229
|
no_setup=no_setup,
|
|
219
230
|
clone_disk_from=clone_disk_from,
|
|
220
231
|
skip_unnecessary_provisioning=skip_unnecessary_provisioning,
|
|
221
232
|
_quiet_optimizer=_quiet_optimizer,
|
|
222
233
|
_is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
|
|
223
234
|
_is_launched_by_sky_serve_controller=
|
|
224
|
-
_is_launched_by_sky_serve_controller
|
|
235
|
+
_is_launched_by_sky_serve_controller,
|
|
236
|
+
job_logger=job_logger)
|
|
225
237
|
|
|
226
238
|
|
|
227
239
|
def _execute_dag(
|
|
@@ -235,7 +247,6 @@ def _execute_dag(
|
|
|
235
247
|
stages: Optional[List[Stage]],
|
|
236
248
|
cluster_name: Optional[str],
|
|
237
249
|
detach_setup: bool,
|
|
238
|
-
detach_run: bool,
|
|
239
250
|
no_setup: bool,
|
|
240
251
|
clone_disk_from: Optional[str],
|
|
241
252
|
skip_unnecessary_provisioning: bool,
|
|
@@ -243,6 +254,7 @@ def _execute_dag(
|
|
|
243
254
|
_quiet_optimizer: bool,
|
|
244
255
|
_is_launched_by_jobs_controller: bool,
|
|
245
256
|
_is_launched_by_sky_serve_controller: bool,
|
|
257
|
+
job_logger: logging.Logger = logger,
|
|
246
258
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
247
259
|
"""Execute a DAG.
|
|
248
260
|
|
|
@@ -253,7 +265,7 @@ def _execute_dag(
|
|
|
253
265
|
task = dag.tasks[0]
|
|
254
266
|
|
|
255
267
|
if any(r.job_recovery is not None for r in task.resources):
|
|
256
|
-
|
|
268
|
+
job_logger.warning(
|
|
257
269
|
f'{colorama.Style.DIM}The task has `job_recovery` specified, '
|
|
258
270
|
'but is launched as an unmanaged job. It will be ignored.'
|
|
259
271
|
'To enable job recovery, use managed jobs: sky jobs launch.'
|
|
@@ -261,8 +273,10 @@ def _execute_dag(
|
|
|
261
273
|
|
|
262
274
|
cluster_exists = False
|
|
263
275
|
if cluster_name is not None:
|
|
264
|
-
|
|
265
|
-
|
|
276
|
+
# We use launched_at to check if the cluster exists, because this
|
|
277
|
+
# db query is faster than get_cluster_from_name.
|
|
278
|
+
cluster_exists = global_user_state.cluster_with_name_exists(
|
|
279
|
+
cluster_name)
|
|
266
280
|
# TODO(woosuk): If the cluster exists, print a warning that
|
|
267
281
|
# `cpus` and `memory` are not used as a job scheduling constraint,
|
|
268
282
|
# unlike `gpus`.
|
|
@@ -334,10 +348,10 @@ def _execute_dag(
|
|
|
334
348
|
# itself have no task running and start the auto{stop,down}
|
|
335
349
|
# process, before the task is submitted in the EXEC stage.
|
|
336
350
|
verb = 'torn down' if down else 'stopped'
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
351
|
+
job_logger.info(f'{colorama.Style.DIM}The cluster will '
|
|
352
|
+
f'be {verb} after 1 minutes of idleness '
|
|
353
|
+
'(after all jobs finish).'
|
|
354
|
+
f'{colorama.Style.RESET_ALL}')
|
|
341
355
|
idle_minutes_to_autostop = 1
|
|
342
356
|
if Stage.DOWN in stages:
|
|
343
357
|
stages.remove(Stage.DOWN)
|
|
@@ -366,7 +380,7 @@ def _execute_dag(
|
|
|
366
380
|
yellow = colorama.Fore.YELLOW
|
|
367
381
|
bold = colorama.Style.BRIGHT
|
|
368
382
|
reset = colorama.Style.RESET_ALL
|
|
369
|
-
|
|
383
|
+
job_logger.info(
|
|
370
384
|
f'{yellow}Launching a spot job that does not '
|
|
371
385
|
f'automatically recover from preemptions. To '
|
|
372
386
|
'get automatic recovery, use managed job instead: '
|
|
@@ -385,7 +399,7 @@ def _execute_dag(
|
|
|
385
399
|
controller = controller_utils.Controllers.from_name(
|
|
386
400
|
cluster_name)
|
|
387
401
|
if controller is not None:
|
|
388
|
-
|
|
402
|
+
job_logger.info(
|
|
389
403
|
f'Choosing resources for {controller.value.name}...'
|
|
390
404
|
)
|
|
391
405
|
dag = optimizer.Optimizer.optimize(dag,
|
|
@@ -394,6 +408,26 @@ def _execute_dag(
|
|
|
394
408
|
task = dag.tasks[0] # Keep: dag may have been deep-copied.
|
|
395
409
|
assert task.best_resources is not None, task
|
|
396
410
|
|
|
411
|
+
# Note on race vs. lock: OPTIMIZE typically runs outside the per-cluster
|
|
412
|
+
# lock. After the backend acquires the lock and refreshes state, the
|
|
413
|
+
# original "do we need to optimize?" decision may be stale (e.g., the
|
|
414
|
+
# cluster just got terminated). To compensate without moving the optimizer
|
|
415
|
+
# into the backend, we inject a small planner the backend can call under
|
|
416
|
+
# the lock only when no reusable snapshot and no caller plan exist.
|
|
417
|
+
planner: Optional[Callable[['sky.Task'], 'resources_lib.Resources']] = None
|
|
418
|
+
if isinstance(backend,
|
|
419
|
+
backends.CloudVmRayBackend) and Stage.OPTIMIZE in stages:
|
|
420
|
+
|
|
421
|
+
def _planner(_t: 'sky.Task'):
|
|
422
|
+
new_dag = optimizer.Optimizer.optimize(dag,
|
|
423
|
+
minimize=optimize_target,
|
|
424
|
+
quiet=_quiet_optimizer)
|
|
425
|
+
new_task = new_dag.tasks[0]
|
|
426
|
+
assert new_task.best_resources is not None, new_task
|
|
427
|
+
return new_task.best_resources.assert_launchable()
|
|
428
|
+
|
|
429
|
+
planner = _planner
|
|
430
|
+
|
|
397
431
|
backend.register_info(
|
|
398
432
|
dag=dag,
|
|
399
433
|
optimize_target=optimize_target,
|
|
@@ -402,7 +436,8 @@ def _execute_dag(
|
|
|
402
436
|
# after K8S pod recovers from a crash.
|
|
403
437
|
# See `kubernetes-ray.yml.j2` for more details.
|
|
404
438
|
dump_final_script=is_controller_high_availability_supported,
|
|
405
|
-
is_managed=is_managed
|
|
439
|
+
is_managed=is_managed,
|
|
440
|
+
planner=planner)
|
|
406
441
|
|
|
407
442
|
if task.storage_mounts is not None:
|
|
408
443
|
# Optimizer should eventually choose where to store bucket
|
|
@@ -427,7 +462,7 @@ def _execute_dag(
|
|
|
427
462
|
if handle is None:
|
|
428
463
|
assert dryrun, ('If not dryrun, handle must be set or '
|
|
429
464
|
'Stage.PROVISION must be included in stages.')
|
|
430
|
-
|
|
465
|
+
job_logger.info('Dryrun finished.')
|
|
431
466
|
return None, None
|
|
432
467
|
|
|
433
468
|
do_workdir = (Stage.SYNC_WORKDIR in stages and not dryrun and
|
|
@@ -436,7 +471,7 @@ def _execute_dag(
|
|
|
436
471
|
(task.file_mounts is not None or
|
|
437
472
|
task.storage_mounts is not None))
|
|
438
473
|
if do_workdir or do_file_mounts:
|
|
439
|
-
|
|
474
|
+
job_logger.info(ux_utils.starting_message('Syncing files.'))
|
|
440
475
|
|
|
441
476
|
if do_workdir:
|
|
442
477
|
if cluster_name is not None:
|
|
@@ -444,7 +479,9 @@ def _execute_dag(
|
|
|
444
479
|
cluster_name, status_lib.ClusterStatus.INIT,
|
|
445
480
|
'Syncing files to cluster',
|
|
446
481
|
global_user_state.ClusterEventType.STATUS_CHANGE)
|
|
447
|
-
|
|
482
|
+
envs_and_secrets = task_lib.get_plaintext_envs_and_secrets(
|
|
483
|
+
task.envs_and_secrets)
|
|
484
|
+
backend.sync_workdir(handle, task.workdir, envs_and_secrets)
|
|
448
485
|
|
|
449
486
|
if do_file_mounts:
|
|
450
487
|
if cluster_name is not None:
|
|
@@ -456,11 +493,11 @@ def _execute_dag(
|
|
|
456
493
|
task.storage_mounts)
|
|
457
494
|
|
|
458
495
|
if no_setup:
|
|
459
|
-
|
|
496
|
+
job_logger.info('Setup commands skipped.')
|
|
460
497
|
elif Stage.SETUP in stages and not dryrun:
|
|
461
498
|
if skip_unnecessary_provisioning and provisioning_skipped:
|
|
462
|
-
|
|
463
|
-
|
|
499
|
+
job_logger.debug('Unnecessary provisioning was skipped, so '
|
|
500
|
+
'skipping setup as well.')
|
|
464
501
|
else:
|
|
465
502
|
if cluster_name is not None:
|
|
466
503
|
global_user_state.add_cluster_event(
|
|
@@ -479,10 +516,7 @@ def _execute_dag(
|
|
|
479
516
|
if Stage.EXEC in stages:
|
|
480
517
|
try:
|
|
481
518
|
global_user_state.update_last_use(handle.get_cluster_name())
|
|
482
|
-
job_id = backend.execute(handle,
|
|
483
|
-
task,
|
|
484
|
-
detach_run,
|
|
485
|
-
dryrun=dryrun)
|
|
519
|
+
job_id = backend.execute(handle, task, dryrun=dryrun)
|
|
486
520
|
finally:
|
|
487
521
|
# Enables post_execute() to be run after KeyboardInterrupt.
|
|
488
522
|
backend.post_execute(handle, down)
|
|
@@ -515,12 +549,16 @@ def launch(
|
|
|
515
549
|
no_setup: bool = False,
|
|
516
550
|
clone_disk_from: Optional[str] = None,
|
|
517
551
|
fast: bool = False,
|
|
552
|
+
*, #keyword only separator
|
|
518
553
|
# Internal only:
|
|
519
554
|
# pylint: disable=invalid-name
|
|
520
555
|
_quiet_optimizer: bool = False,
|
|
521
556
|
_is_launched_by_jobs_controller: bool = False,
|
|
522
557
|
_is_launched_by_sky_serve_controller: bool = False,
|
|
523
558
|
_disable_controller_check: bool = False,
|
|
559
|
+
_request_name: request_names.AdminPolicyRequestName = request_names.
|
|
560
|
+
AdminPolicyRequestName.CLUSTER_LAUNCH,
|
|
561
|
+
job_logger: logging.Logger = logger,
|
|
524
562
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
525
563
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
526
564
|
"""Launches a cluster or task.
|
|
@@ -666,7 +704,6 @@ def launch(
|
|
|
666
704
|
# see the setup logs when inspecting the launch process to know
|
|
667
705
|
# exactly what the job is waiting for.
|
|
668
706
|
detach_setup = controller_utils.Controllers.from_name(cluster_name) is None
|
|
669
|
-
|
|
670
707
|
return _execute(
|
|
671
708
|
entrypoint=entrypoint,
|
|
672
709
|
dryrun=dryrun,
|
|
@@ -679,7 +716,6 @@ def launch(
|
|
|
679
716
|
stages=stages,
|
|
680
717
|
cluster_name=cluster_name,
|
|
681
718
|
detach_setup=detach_setup,
|
|
682
|
-
detach_run=True,
|
|
683
719
|
idle_minutes_to_autostop=idle_minutes_to_autostop,
|
|
684
720
|
no_setup=no_setup,
|
|
685
721
|
clone_disk_from=clone_disk_from,
|
|
@@ -688,7 +724,12 @@ def launch(
|
|
|
688
724
|
_is_launched_by_jobs_controller=_is_launched_by_jobs_controller,
|
|
689
725
|
_is_launched_by_sky_serve_controller=
|
|
690
726
|
_is_launched_by_sky_serve_controller,
|
|
691
|
-
|
|
727
|
+
_request_name=_request_name,
|
|
728
|
+
job_logger=job_logger)
|
|
729
|
+
|
|
730
|
+
|
|
731
|
+
# needed for backward compatibility. Remove by v0.12.0
|
|
732
|
+
cluster_launch = launch
|
|
692
733
|
|
|
693
734
|
|
|
694
735
|
@usage_lib.entrypoint
|
|
@@ -699,6 +740,7 @@ def exec( # pylint: disable=redefined-builtin
|
|
|
699
740
|
down: bool = False,
|
|
700
741
|
stream_logs: bool = True,
|
|
701
742
|
backend: Optional[backends.Backend] = None,
|
|
743
|
+
job_logger: logging.Logger = logger,
|
|
702
744
|
) -> Tuple[Optional[int], Optional[backends.ResourceHandle]]:
|
|
703
745
|
# NOTE(dev): Keep the docstring consistent between the Python API and CLI.
|
|
704
746
|
"""Executes a task on an existing cluster.
|
|
@@ -773,5 +815,6 @@ def exec( # pylint: disable=redefined-builtin
|
|
|
773
815
|
Stage.EXEC,
|
|
774
816
|
],
|
|
775
817
|
cluster_name=cluster_name,
|
|
776
|
-
|
|
818
|
+
job_logger=job_logger,
|
|
819
|
+
_request_name=request_names.AdminPolicyRequestName.CLUSTER_EXEC,
|
|
777
820
|
)
|