skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
|
@@ -3,8 +3,9 @@ import copy
|
|
|
3
3
|
import datetime
|
|
4
4
|
import json
|
|
5
5
|
import re
|
|
6
|
+
import sys
|
|
6
7
|
import time
|
|
7
|
-
from typing import Any,
|
|
8
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
8
9
|
|
|
9
10
|
from sky import exceptions
|
|
10
11
|
from sky import global_user_state
|
|
@@ -16,13 +17,13 @@ from sky.provision import constants
|
|
|
16
17
|
from sky.provision import docker_utils
|
|
17
18
|
from sky.provision.kubernetes import config as config_lib
|
|
18
19
|
from sky.provision.kubernetes import constants as k8s_constants
|
|
19
|
-
from sky.provision.kubernetes import network_utils
|
|
20
20
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
21
21
|
from sky.provision.kubernetes import volume
|
|
22
22
|
from sky.utils import command_runner
|
|
23
23
|
from sky.utils import common_utils
|
|
24
24
|
from sky.utils import config_utils
|
|
25
25
|
from sky.utils import kubernetes_enums
|
|
26
|
+
from sky.utils import rich_utils
|
|
26
27
|
from sky.utils import status_lib
|
|
27
28
|
from sky.utils import subprocess_utils
|
|
28
29
|
from sky.utils import timeline
|
|
@@ -32,8 +33,18 @@ from sky.utils.db import db_utils
|
|
|
32
33
|
POLL_INTERVAL = 2
|
|
33
34
|
_TIMEOUT_FOR_POD_TERMINATION = 60 # 1 minutes
|
|
34
35
|
_MAX_RETRIES = 3
|
|
36
|
+
_MAX_MISSING_PODS_RETRIES = 5
|
|
37
|
+
_MAX_QUERY_INSTANCES_RETRIES = 5
|
|
38
|
+
_QUERY_INSTANCES_RETRY_INTERVAL = .5
|
|
35
39
|
_NUM_THREADS = subprocess_utils.get_parallel_threads('kubernetes')
|
|
36
40
|
|
|
41
|
+
COMMON_NON_PENDING_EVENT_REASONS = {
|
|
42
|
+
'Scheduled', 'Created', 'Started', 'Failed', 'Pulled'
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
# Pattern to extract SSH user from command output, handling MOTD contamination
|
|
46
|
+
_SSH_USER_PATTERN = re.compile(r'SKYPILOT_SSH_USER: ([^\s\n]+)')
|
|
47
|
+
|
|
37
48
|
logger = sky_logging.init_logger(__name__)
|
|
38
49
|
|
|
39
50
|
|
|
@@ -77,7 +88,7 @@ def is_high_availability_cluster_by_kubectl(
|
|
|
77
88
|
context).list_namespaced_deployment(
|
|
78
89
|
namespace,
|
|
79
90
|
label_selector=
|
|
80
|
-
f'{
|
|
91
|
+
f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
|
|
81
92
|
except kubernetes.api_exception():
|
|
82
93
|
return False
|
|
83
94
|
# It is a high availability cluster if there is at least one deployment
|
|
@@ -191,14 +202,20 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
|
|
|
191
202
|
break
|
|
192
203
|
if event_message is not None:
|
|
193
204
|
if pod_status == 'Pending':
|
|
194
|
-
|
|
205
|
+
out_of = {}
|
|
206
|
+
# key: resource name, value: (extra message, nice name)
|
|
195
207
|
if 'Insufficient cpu' in event_message:
|
|
196
|
-
|
|
197
|
-
|
|
208
|
+
out_of['CPU'] = (': Run \'kubectl get nodes -o '
|
|
209
|
+
'custom-columns=NAME:.metadata.name,'
|
|
210
|
+
'CPU:.status.allocatable.cpu\' to check '
|
|
211
|
+
'the available CPUs on the node.', 'CPUs')
|
|
198
212
|
if 'Insufficient memory' in event_message:
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
213
|
+
out_of['memory'] = (': Run \'kubectl get nodes -o '
|
|
214
|
+
'custom-columns=NAME:.metadata.name,'
|
|
215
|
+
'MEMORY:.status.allocatable.memory\' '
|
|
216
|
+
'to check the available memory on the '
|
|
217
|
+
'node.', 'Memory')
|
|
218
|
+
|
|
202
219
|
# TODO(aylei): after switching from smarter-device-manager to
|
|
203
220
|
# fusermount-server, we need a new way to check whether the
|
|
204
221
|
# fusermount-server daemonset is ready.
|
|
@@ -206,41 +223,77 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
|
|
|
206
223
|
key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
|
|
207
224
|
for key in lf.get_label_keys()
|
|
208
225
|
]
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
226
|
+
for label_key in gpu_lf_keys:
|
|
227
|
+
# TODO(romilb): We may have additional node
|
|
228
|
+
# affinity selectors in the future - in that
|
|
229
|
+
# case we will need to update this logic.
|
|
230
|
+
# TODO(Doyoung): Update the error message raised
|
|
231
|
+
# with the multi-host TPU support.
|
|
232
|
+
gpu_resource_key = kubernetes_utils.get_gpu_resource_key(
|
|
233
|
+
context) # pylint: disable=line-too-long
|
|
234
|
+
if ((f'Insufficient {gpu_resource_key}' in event_message) or
|
|
235
|
+
('didn\'t match Pod\'s node affinity/selector'
|
|
236
|
+
in event_message) and pod.spec.node_selector):
|
|
237
|
+
if 'gpu' in gpu_resource_key.lower():
|
|
238
|
+
info_msg = (
|
|
239
|
+
': Run \'sky show-gpus --infra kubernetes\' to '
|
|
240
|
+
'see the available GPUs.')
|
|
241
|
+
else:
|
|
242
|
+
info_msg = ': '
|
|
243
|
+
if (pod.spec.node_selector and
|
|
244
|
+
label_key in pod.spec.node_selector):
|
|
245
|
+
extra_msg = (
|
|
246
|
+
f'Verify if any node matching label '
|
|
247
|
+
f'{pod.spec.node_selector[label_key]} and '
|
|
248
|
+
f'sufficient resource {gpu_resource_key} '
|
|
249
|
+
f'is available in the cluster.')
|
|
250
|
+
extra_msg = info_msg + ' ' + extra_msg
|
|
251
|
+
else:
|
|
252
|
+
extra_msg = info_msg
|
|
253
|
+
if gpu_resource_key not in out_of or len(
|
|
254
|
+
out_of[gpu_resource_key][0]) < len(extra_msg):
|
|
255
|
+
out_of[f'{gpu_resource_key}'] = (extra_msg, 'GPUs')
|
|
256
|
+
|
|
257
|
+
if len(out_of) > 0:
|
|
258
|
+
# We are out of some resources. We should raise an error.
|
|
259
|
+
rsrc_err_msg = 'Insufficient resource capacity on the '
|
|
260
|
+
rsrc_err_msg += 'cluster:\n'
|
|
261
|
+
out_of_keys = list(out_of.keys())
|
|
262
|
+
for i in range(len(out_of_keys)):
|
|
263
|
+
rsrc = out_of_keys[i]
|
|
264
|
+
(extra_msg, nice_name) = out_of[rsrc]
|
|
265
|
+
extra_msg = extra_msg if extra_msg else ''
|
|
266
|
+
if i == len(out_of_keys) - 1:
|
|
267
|
+
indent = '└──'
|
|
268
|
+
else:
|
|
269
|
+
indent = '├──'
|
|
270
|
+
rsrc_err_msg += (f'{indent} Cluster does not have '
|
|
271
|
+
f'sufficient {nice_name} for your request'
|
|
272
|
+
f'{extra_msg}')
|
|
273
|
+
if i != len(out_of_keys) - 1:
|
|
274
|
+
rsrc_err_msg += '\n'
|
|
275
|
+
|
|
276
|
+
# Emit the error message without logging prefixes for better UX.
|
|
277
|
+
tmp_handler = sky_logging.EnvAwareHandler(sys.stdout)
|
|
278
|
+
tmp_handler.flush = sys.stdout.flush # type: ignore
|
|
279
|
+
tmp_handler.setFormatter(sky_logging.NO_PREFIX_FORMATTER)
|
|
280
|
+
tmp_handler.setLevel(sky_logging.ERROR)
|
|
281
|
+
prev_propagate = logger.propagate
|
|
282
|
+
try:
|
|
283
|
+
logger.addHandler(tmp_handler)
|
|
284
|
+
logger.propagate = False
|
|
285
|
+
logger.error(ux_utils.error_message(f'{rsrc_err_msg}'))
|
|
286
|
+
finally:
|
|
287
|
+
logger.removeHandler(tmp_handler)
|
|
288
|
+
logger.propagate = prev_propagate
|
|
289
|
+
nice_names = [out_of[rsrc][1] for rsrc in out_of_keys]
|
|
290
|
+
raise config_lib.KubernetesError(
|
|
291
|
+
f'{timeout_err_msg} '
|
|
292
|
+
f'Pod status: {pod_status} '
|
|
293
|
+
f'Details: \'{event_message}\' ',
|
|
294
|
+
insufficent_resources=nice_names,
|
|
295
|
+
)
|
|
296
|
+
|
|
244
297
|
raise config_lib.KubernetesError(f'{timeout_err_msg} '
|
|
245
298
|
f'Pod status: {pod_status} '
|
|
246
299
|
f'Details: \'{event_message}\' ')
|
|
@@ -256,8 +309,89 @@ def _raise_command_running_error(message: str, command: str, pod_name: str,
|
|
|
256
309
|
f'code {rc}: {command!r}\nOutput: {stdout}.')
|
|
257
310
|
|
|
258
311
|
|
|
312
|
+
def _detect_cluster_event_reason_occurred(namespace, context, search_start,
|
|
313
|
+
reason) -> bool:
|
|
314
|
+
|
|
315
|
+
def _convert_to_utc(timestamp):
|
|
316
|
+
if timestamp.tzinfo is None:
|
|
317
|
+
return timestamp.replace(tzinfo=datetime.timezone.utc)
|
|
318
|
+
return timestamp.astimezone(datetime.timezone.utc)
|
|
319
|
+
|
|
320
|
+
def _get_event_timestamp(event):
|
|
321
|
+
if event.last_timestamp:
|
|
322
|
+
return event.last_timestamp
|
|
323
|
+
elif event.metadata.creation_timestamp:
|
|
324
|
+
return event.metadata.creation_timestamp
|
|
325
|
+
return None
|
|
326
|
+
|
|
327
|
+
events = kubernetes.core_api(context).list_namespaced_event(
|
|
328
|
+
namespace=namespace, field_selector=f'reason={reason}')
|
|
329
|
+
for event in events.items:
|
|
330
|
+
ts = _get_event_timestamp(event)
|
|
331
|
+
if ts and _convert_to_utc(ts) > search_start:
|
|
332
|
+
return True
|
|
333
|
+
return False
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def _cluster_had_autoscale_event(namespace, context, search_start) -> bool:
|
|
337
|
+
"""Detects whether the cluster had a autoscaling event after a
|
|
338
|
+
specified datetime. This only works when using cluster-autoscaler.
|
|
339
|
+
|
|
340
|
+
Args:
|
|
341
|
+
namespace: kubernetes namespace
|
|
342
|
+
context: kubernetes context
|
|
343
|
+
search_start (datetime.datetime): filter for events that occurred
|
|
344
|
+
after search_start
|
|
345
|
+
|
|
346
|
+
Returns:
|
|
347
|
+
A boolean whether the cluster has an autoscaling event or not.
|
|
348
|
+
"""
|
|
349
|
+
assert namespace is not None
|
|
350
|
+
|
|
351
|
+
try:
|
|
352
|
+
return _detect_cluster_event_reason_occurred(namespace, context,
|
|
353
|
+
search_start,
|
|
354
|
+
'TriggeredScaleUp')
|
|
355
|
+
except Exception as e: # pylint: disable=broad-except
|
|
356
|
+
logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
|
|
357
|
+
return False
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def _cluster_maybe_autoscaling(namespace, context, search_start) -> bool:
|
|
361
|
+
"""Detects whether a kubernetes cluster may have an autoscaling event.
|
|
362
|
+
|
|
363
|
+
This is not a definitive detection. FailedScheduling, which is an
|
|
364
|
+
event that can occur when not enough resources are present in the cluster,
|
|
365
|
+
which is a trigger for cluster autoscaling. However, FailedScheduling may
|
|
366
|
+
have occurred due to other reasons (cluster itself is abnormal).
|
|
367
|
+
|
|
368
|
+
Hence, this should only be used for autoscalers that don't emit the
|
|
369
|
+
TriggeredScaleUp event, e.g.: Karpenter.
|
|
370
|
+
|
|
371
|
+
Args:
|
|
372
|
+
namespace: kubernetes namespace
|
|
373
|
+
context: kubernetes context
|
|
374
|
+
search_start (datetime.datetime): filter for events that occurred
|
|
375
|
+
after search_start
|
|
376
|
+
|
|
377
|
+
Returns:
|
|
378
|
+
A boolean whether the cluster has an autoscaling event or not.
|
|
379
|
+
"""
|
|
380
|
+
assert namespace is not None
|
|
381
|
+
|
|
382
|
+
try:
|
|
383
|
+
return _detect_cluster_event_reason_occurred(namespace, context,
|
|
384
|
+
search_start,
|
|
385
|
+
'FailedScheduling')
|
|
386
|
+
except Exception as e: # pylint: disable=broad-except
|
|
387
|
+
logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
|
|
388
|
+
return False
|
|
389
|
+
|
|
390
|
+
|
|
259
391
|
@timeline.event
|
|
260
|
-
def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int
|
|
392
|
+
def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,
|
|
393
|
+
cluster_name: str,
|
|
394
|
+
create_pods_start: datetime.datetime):
|
|
261
395
|
"""Wait for all pods to be scheduled.
|
|
262
396
|
|
|
263
397
|
Wait for all pods including jump pod to be scheduled, and if it
|
|
@@ -266,6 +400,9 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
266
400
|
allocated and we can exit.
|
|
267
401
|
|
|
268
402
|
If timeout is set to a negative value, this method will wait indefinitely.
|
|
403
|
+
|
|
404
|
+
Will update the spinner message to indicate autoscaling if autoscaling
|
|
405
|
+
is happening.
|
|
269
406
|
"""
|
|
270
407
|
# Create a set of pod names we're waiting for
|
|
271
408
|
if not new_nodes:
|
|
@@ -273,6 +410,18 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
273
410
|
expected_pod_names = {node.metadata.name for node in new_nodes}
|
|
274
411
|
start_time = time.time()
|
|
275
412
|
|
|
413
|
+
# Variables for autoscaler detection
|
|
414
|
+
autoscaler_type = skypilot_config.get_effective_region_config(
|
|
415
|
+
cloud='kubernetes',
|
|
416
|
+
region=context,
|
|
417
|
+
keys=('autoscaler',),
|
|
418
|
+
default_value=None)
|
|
419
|
+
autoscaler_is_set = autoscaler_type is not None
|
|
420
|
+
use_heuristic_detection = (autoscaler_is_set and
|
|
421
|
+
not kubernetes_enums.KubernetesAutoscalerType(
|
|
422
|
+
autoscaler_type).emits_autoscale_event())
|
|
423
|
+
is_autoscaling = False
|
|
424
|
+
|
|
276
425
|
def _evaluate_timeout() -> bool:
|
|
277
426
|
# If timeout is negative, retry indefinitely.
|
|
278
427
|
if timeout < 0:
|
|
@@ -282,12 +431,13 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
282
431
|
while _evaluate_timeout():
|
|
283
432
|
# Get all pods in a single API call using the cluster name label
|
|
284
433
|
# which all pods in new_nodes should share
|
|
285
|
-
|
|
286
|
-
|
|
434
|
+
cluster_name_on_cloud = new_nodes[0].metadata.labels[
|
|
435
|
+
constants.TAG_SKYPILOT_CLUSTER_NAME]
|
|
287
436
|
pods = kubernetes.core_api(context).list_namespaced_pod(
|
|
288
437
|
namespace,
|
|
289
438
|
label_selector=
|
|
290
|
-
f'{
|
|
439
|
+
f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
|
|
440
|
+
).items
|
|
291
441
|
|
|
292
442
|
# Get the set of found pod names and check if we have all expected pods
|
|
293
443
|
found_pod_names = {pod.metadata.name for pod in pods}
|
|
@@ -311,6 +461,26 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
311
461
|
|
|
312
462
|
if all_scheduled:
|
|
313
463
|
return
|
|
464
|
+
|
|
465
|
+
# Check if cluster is autoscaling and update spinner message.
|
|
466
|
+
# Minor optimization to not query k8s api after autoscaling
|
|
467
|
+
# event was detected. This is useful because there isn't any
|
|
468
|
+
# autoscaling complete event.
|
|
469
|
+
if autoscaler_is_set and not is_autoscaling:
|
|
470
|
+
if use_heuristic_detection:
|
|
471
|
+
is_autoscaling = _cluster_maybe_autoscaling(
|
|
472
|
+
namespace, context, create_pods_start)
|
|
473
|
+
msg = 'Kubernetes cluster may be scaling up'
|
|
474
|
+
else:
|
|
475
|
+
is_autoscaling = _cluster_had_autoscale_event(
|
|
476
|
+
namespace, context, create_pods_start)
|
|
477
|
+
msg = 'Kubernetes cluster is autoscaling'
|
|
478
|
+
|
|
479
|
+
if is_autoscaling:
|
|
480
|
+
rich_utils.force_update_status(
|
|
481
|
+
ux_utils.spinner_message(f'Launching ({msg})',
|
|
482
|
+
cluster_name=cluster_name))
|
|
483
|
+
|
|
314
484
|
time.sleep(1)
|
|
315
485
|
|
|
316
486
|
# Handle pod scheduling errors
|
|
@@ -326,17 +496,17 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
326
496
|
|
|
327
497
|
|
|
328
498
|
@timeline.event
|
|
329
|
-
def _wait_for_pods_to_run(namespace, context,
|
|
499
|
+
def _wait_for_pods_to_run(namespace, context, cluster_name, new_pods):
|
|
330
500
|
"""Wait for pods and their containers to be ready.
|
|
331
501
|
|
|
332
502
|
Pods may be pulling images or may be in the process of container
|
|
333
503
|
creation.
|
|
334
504
|
"""
|
|
335
|
-
if not
|
|
505
|
+
if not new_pods:
|
|
336
506
|
return
|
|
337
507
|
|
|
338
508
|
# Create a set of pod names we're waiting for
|
|
339
|
-
expected_pod_names = {
|
|
509
|
+
expected_pod_names = {pod.metadata.name for pod in new_pods}
|
|
340
510
|
|
|
341
511
|
def _check_init_containers(pod):
|
|
342
512
|
# Check if any of the init containers failed
|
|
@@ -363,39 +533,40 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
|
|
|
363
533
|
'Failed to create init container for pod '
|
|
364
534
|
f'{pod.metadata.name}. Error details: {msg}.')
|
|
365
535
|
|
|
366
|
-
|
|
367
|
-
#
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
|
|
395
|
-
|
|
396
|
-
|
|
397
|
-
|
|
398
|
-
|
|
536
|
+
def _inspect_pod_status(pod):
|
|
537
|
+
# Check if pod is terminated/preempted/failed.
|
|
538
|
+
if (pod.metadata.deletion_timestamp is not None or
|
|
539
|
+
pod.status.phase == 'Failed'):
|
|
540
|
+
# Get the reason and write to cluster events before
|
|
541
|
+
# the pod gets completely deleted from the API.
|
|
542
|
+
termination_reason = _get_pod_termination_reason(pod, cluster_name)
|
|
543
|
+
logger.warning(
|
|
544
|
+
f'Pod {pod.metadata.name} terminated: {termination_reason}')
|
|
545
|
+
raise config_lib.KubernetesError(
|
|
546
|
+
f'Pod {pod.metadata.name} has terminated or failed '
|
|
547
|
+
f'unexpectedly. Run `sky logs --provision {cluster_name}` '
|
|
548
|
+
'for more details.')
|
|
549
|
+
|
|
550
|
+
container_statuses = pod.status.container_statuses
|
|
551
|
+
# Continue if pod and all the containers within the
|
|
552
|
+
# pod are successfully created and running.
|
|
553
|
+
if (pod.status.phase == 'Running' and container_statuses is not None and
|
|
554
|
+
all(container.state.running
|
|
555
|
+
for container in container_statuses)):
|
|
556
|
+
return True, None
|
|
557
|
+
|
|
558
|
+
reason: Optional[str] = None
|
|
559
|
+
if pod.status.phase == 'Pending':
|
|
560
|
+
pending_reason = _get_pod_pending_reason(context, namespace,
|
|
561
|
+
pod.metadata.name)
|
|
562
|
+
if pending_reason is not None:
|
|
563
|
+
reason, message = pending_reason
|
|
564
|
+
logger.debug(f'Pod {pod.metadata.name} is pending: '
|
|
565
|
+
f'{reason}: {message}')
|
|
566
|
+
|
|
567
|
+
# Iterate over each container in pod to check their status
|
|
568
|
+
if container_statuses is not None:
|
|
569
|
+
for container_status in container_statuses:
|
|
399
570
|
# If the container wasn't in 'ContainerCreating'
|
|
400
571
|
# state, then we know pod wasn't scheduled or
|
|
401
572
|
# had some other error, such as image pull error.
|
|
@@ -406,43 +577,86 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
|
|
|
406
577
|
if waiting.reason == 'PodInitializing':
|
|
407
578
|
_check_init_containers(pod)
|
|
408
579
|
elif waiting.reason != 'ContainerCreating':
|
|
409
|
-
msg = waiting.message if
|
|
410
|
-
waiting)
|
|
580
|
+
msg = waiting.message if (
|
|
581
|
+
waiting.message) else str(waiting)
|
|
411
582
|
raise config_lib.KubernetesError(
|
|
412
583
|
'Failed to create container while launching '
|
|
413
584
|
f'the node. Error details: {msg}.')
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
585
|
+
return False, reason
|
|
586
|
+
|
|
587
|
+
missing_pods_retry = 0
|
|
588
|
+
last_status_msg: Optional[str] = None
|
|
589
|
+
while True:
|
|
590
|
+
# Get all pods in a single API call
|
|
591
|
+
cluster_name_on_cloud = new_pods[0].metadata.labels[
|
|
592
|
+
constants.TAG_SKYPILOT_CLUSTER_NAME]
|
|
593
|
+
all_pods = kubernetes.core_api(context).list_namespaced_pod(
|
|
594
|
+
namespace,
|
|
595
|
+
label_selector=
|
|
596
|
+
f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
|
|
597
|
+
).items
|
|
598
|
+
|
|
599
|
+
# Get the set of found pod names and check if we have all expected pods
|
|
600
|
+
found_pod_names = {pod.metadata.name for pod in all_pods}
|
|
601
|
+
missing_pod_names = expected_pod_names - found_pod_names
|
|
602
|
+
if missing_pod_names:
|
|
603
|
+
# In _wait_for_pods_to_schedule, we already wait for all pods to go
|
|
604
|
+
# from pending to scheduled. So if a pod is missing here, it means
|
|
605
|
+
# something unusual must have happened, and so should be treated as
|
|
606
|
+
# an exception.
|
|
607
|
+
# It is also only in _wait_for_pods_to_schedule that
|
|
608
|
+
# provision_timeout is used.
|
|
609
|
+
# TODO(kevin): Should we take provision_timeout into account here,
|
|
610
|
+
# instead of hardcoding the number of retries?
|
|
611
|
+
if missing_pods_retry >= _MAX_MISSING_PODS_RETRIES:
|
|
612
|
+
for pod_name in missing_pod_names:
|
|
613
|
+
reason = _get_pod_missing_reason(context, namespace,
|
|
614
|
+
cluster_name, pod_name)
|
|
615
|
+
logger.warning(f'Pod {pod_name} missing: {reason}')
|
|
616
|
+
raise config_lib.KubernetesError(
|
|
617
|
+
f'Failed to get all pods after {missing_pods_retry} '
|
|
618
|
+
f'retries. Some pods may have been terminated or failed '
|
|
619
|
+
f'unexpectedly. Run `sky logs --provision {cluster_name}` '
|
|
620
|
+
'for more details.')
|
|
621
|
+
logger.info('Retrying running pods check: '
|
|
622
|
+
f'Missing pods: {missing_pod_names}')
|
|
623
|
+
time.sleep(0.5)
|
|
624
|
+
missing_pods_retry += 1
|
|
625
|
+
continue
|
|
626
|
+
|
|
627
|
+
pods_to_check = [
|
|
628
|
+
pod for pod in all_pods if pod.metadata.name in expected_pod_names
|
|
629
|
+
]
|
|
630
|
+
pod_statuses = subprocess_utils.run_in_parallel(_inspect_pod_status,
|
|
631
|
+
pods_to_check,
|
|
632
|
+
_NUM_THREADS)
|
|
633
|
+
|
|
634
|
+
all_pods_running = True
|
|
635
|
+
pending_reasons_count: Dict[str, int] = {}
|
|
636
|
+
for is_running, pending_reason in pod_statuses:
|
|
637
|
+
if not is_running:
|
|
638
|
+
all_pods_running = False
|
|
639
|
+
if pending_reason is not None:
|
|
640
|
+
pending_reasons_count[pending_reason] = (
|
|
641
|
+
pending_reasons_count.get(pending_reason, 0) + 1)
|
|
417
642
|
|
|
418
643
|
if all_pods_running:
|
|
419
644
|
break
|
|
420
|
-
time.sleep(1)
|
|
421
645
|
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
for attempt in range(max_retries + 1):
|
|
437
|
-
try:
|
|
438
|
-
return func()
|
|
439
|
-
except config_lib.KubernetesError:
|
|
440
|
-
if attempt < max_retries:
|
|
441
|
-
logger.warning(f'Failed to {operation_name} - '
|
|
442
|
-
f'retrying in {retry_delay} seconds.')
|
|
443
|
-
time.sleep(retry_delay)
|
|
444
|
-
else:
|
|
445
|
-
raise
|
|
646
|
+
if pending_reasons_count:
|
|
647
|
+
msg = ', '.join([
|
|
648
|
+
f'{count} pod(s) pending due to {reason}'
|
|
649
|
+
for reason, count in sorted(pending_reasons_count.items())
|
|
650
|
+
])
|
|
651
|
+
status_text = f'Launching ({msg})'
|
|
652
|
+
else:
|
|
653
|
+
status_text = 'Launching'
|
|
654
|
+
new_status_msg = ux_utils.spinner_message(status_text,
|
|
655
|
+
cluster_name=cluster_name)
|
|
656
|
+
if new_status_msg != last_status_msg:
|
|
657
|
+
rich_utils.force_update_status(new_status_msg)
|
|
658
|
+
last_status_msg = new_status_msg
|
|
659
|
+
time.sleep(1)
|
|
446
660
|
|
|
447
661
|
|
|
448
662
|
@timeline.event
|
|
@@ -683,7 +897,7 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
|
|
|
683
897
|
def _wait_for_deployment_pod(context,
|
|
684
898
|
namespace,
|
|
685
899
|
deployment,
|
|
686
|
-
timeout=
|
|
900
|
+
timeout=300) -> List:
|
|
687
901
|
label_selector = ','.join([
|
|
688
902
|
f'{key}={value}'
|
|
689
903
|
for key, value in deployment.spec.selector.match_labels.items()
|
|
@@ -715,13 +929,14 @@ def _wait_for_deployment_pod(context,
|
|
|
715
929
|
|
|
716
930
|
|
|
717
931
|
@timeline.event
|
|
718
|
-
def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
932
|
+
def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
719
933
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
720
934
|
"""Create pods based on the config."""
|
|
721
935
|
provider_config = config.provider_config
|
|
722
936
|
namespace = kubernetes_utils.get_namespace_from_config(provider_config)
|
|
723
937
|
context = kubernetes_utils.get_context_from_config(provider_config)
|
|
724
938
|
pod_spec = copy.deepcopy(config.node_config)
|
|
939
|
+
create_pods_start = datetime.datetime.now(datetime.timezone.utc)
|
|
725
940
|
|
|
726
941
|
to_create_deployment = 'deployment_spec' in pod_spec
|
|
727
942
|
if to_create_deployment:
|
|
@@ -738,7 +953,26 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
738
953
|
else:
|
|
739
954
|
pod_spec['metadata']['labels'] = tags
|
|
740
955
|
pod_spec['metadata']['labels'].update(
|
|
741
|
-
{
|
|
956
|
+
{constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
|
|
957
|
+
|
|
958
|
+
ephemeral_volumes = provider_config.get('ephemeral_volume_infos')
|
|
959
|
+
if ephemeral_volumes:
|
|
960
|
+
for ephemeral_volume in ephemeral_volumes:
|
|
961
|
+
# Update the volumes and volume mounts in the pod spec
|
|
962
|
+
if 'volumes' not in pod_spec['spec']:
|
|
963
|
+
pod_spec['spec']['volumes'] = []
|
|
964
|
+
pod_spec['spec']['volumes'].append({
|
|
965
|
+
'name': ephemeral_volume.name,
|
|
966
|
+
'persistentVolumeClaim': {
|
|
967
|
+
'claimName': ephemeral_volume.volume_name_on_cloud,
|
|
968
|
+
},
|
|
969
|
+
})
|
|
970
|
+
if 'volumeMounts' not in pod_spec['spec']['containers'][0]:
|
|
971
|
+
pod_spec['spec']['containers'][0]['volumeMounts'] = []
|
|
972
|
+
pod_spec['spec']['containers'][0]['volumeMounts'].append({
|
|
973
|
+
'name': ephemeral_volume.name,
|
|
974
|
+
'mountPath': ephemeral_volume.path,
|
|
975
|
+
})
|
|
742
976
|
|
|
743
977
|
terminating_pods = kubernetes_utils.filter_pods(namespace, context, tags,
|
|
744
978
|
['Terminating'])
|
|
@@ -770,8 +1004,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
770
1004
|
running_pods = kubernetes_utils.filter_pods(namespace, context, tags,
|
|
771
1005
|
['Pending', 'Running'])
|
|
772
1006
|
head_pod_name = _get_head_pod_name(running_pods)
|
|
1007
|
+
running_pod_statuses = [{
|
|
1008
|
+
pod.metadata.name: pod.status.phase
|
|
1009
|
+
} for pod in running_pods.values()]
|
|
773
1010
|
logger.debug(f'Found {len(running_pods)} existing pods: '
|
|
774
|
-
f'{
|
|
1011
|
+
f'{running_pod_statuses}')
|
|
775
1012
|
|
|
776
1013
|
to_start_count = config.count - len(running_pods)
|
|
777
1014
|
if to_start_count < 0:
|
|
@@ -787,7 +1024,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
787
1024
|
nvidia_runtime_exists = False
|
|
788
1025
|
try:
|
|
789
1026
|
nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class(
|
|
790
|
-
context)
|
|
1027
|
+
context=context)
|
|
791
1028
|
except kubernetes.kubernetes.client.ApiException as e:
|
|
792
1029
|
logger.warning('run_instances: Error occurred while checking for '
|
|
793
1030
|
f'nvidia RuntimeClass - '
|
|
@@ -817,12 +1054,19 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
817
1054
|
|
|
818
1055
|
def _create_resource_thread(i: int):
|
|
819
1056
|
pod_spec_copy = copy.deepcopy(pod_spec)
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
1057
|
+
# 0 is for head pod, while 1+ is for worker pods.
|
|
1058
|
+
if i == 0:
|
|
1059
|
+
if head_pod_name is None:
|
|
1060
|
+
# First pod should be head if no head exists
|
|
1061
|
+
pod_spec_copy['metadata']['labels'].update(
|
|
1062
|
+
constants.HEAD_NODE_TAGS)
|
|
1063
|
+
head_selector = _head_service_selector(cluster_name_on_cloud)
|
|
1064
|
+
pod_spec_copy['metadata']['labels'].update(head_selector)
|
|
1065
|
+
pod_spec_copy['metadata'][
|
|
1066
|
+
'name'] = f'{cluster_name_on_cloud}-head'
|
|
1067
|
+
else:
|
|
1068
|
+
# If head pod already exists, we skip creating it.
|
|
1069
|
+
return
|
|
826
1070
|
else:
|
|
827
1071
|
# Worker pods
|
|
828
1072
|
pod_spec_copy['metadata']['labels'].update(
|
|
@@ -868,7 +1112,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
868
1112
|
'podAffinityTerm': {
|
|
869
1113
|
'labelSelector': {
|
|
870
1114
|
'matchExpressions': [{
|
|
871
|
-
'key':
|
|
1115
|
+
'key': constants.TAG_SKYPILOT_CLUSTER_NAME,
|
|
872
1116
|
'operator': 'In',
|
|
873
1117
|
'values': [cluster_name_on_cloud]
|
|
874
1118
|
}]
|
|
@@ -963,9 +1207,16 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
963
1207
|
'and then up the cluster again.')
|
|
964
1208
|
raise exceptions.InconsistentHighAvailabilityError(message)
|
|
965
1209
|
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
1210
|
+
created_resources = []
|
|
1211
|
+
if to_start_count > 0:
|
|
1212
|
+
# Create pods in parallel.
|
|
1213
|
+
# Use `config.count` instead of `to_start_count` to keep the index of
|
|
1214
|
+
# the Pods consistent especially for the case where some Pods are down
|
|
1215
|
+
# due to node failure or manual termination, etc. and then launch
|
|
1216
|
+
# again to create the Pods back.
|
|
1217
|
+
# The existing Pods will be skipped in _create_resource_thread.
|
|
1218
|
+
created_resources = subprocess_utils.run_in_parallel(
|
|
1219
|
+
_create_resource_thread, list(range(config.count)), _NUM_THREADS)
|
|
969
1220
|
|
|
970
1221
|
if to_create_deployment:
|
|
971
1222
|
deployments = copy.deepcopy(created_resources)
|
|
@@ -978,20 +1229,22 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
978
1229
|
pods = created_resources
|
|
979
1230
|
|
|
980
1231
|
created_pods = {}
|
|
1232
|
+
valid_pods = []
|
|
981
1233
|
for pod in pods:
|
|
1234
|
+
# In case Pod is not created
|
|
1235
|
+
if pod is None:
|
|
1236
|
+
continue
|
|
1237
|
+
valid_pods.append(pod)
|
|
982
1238
|
created_pods[pod.metadata.name] = pod
|
|
983
1239
|
if head_pod_name is None and _is_head(pod):
|
|
984
1240
|
head_pod_name = pod.metadata.name
|
|
1241
|
+
pods = valid_pods
|
|
1242
|
+
|
|
1243
|
+
# The running_pods may include Pending Pods, so we add them to the pods
|
|
1244
|
+
# list to wait for scheduling and running
|
|
1245
|
+
if running_pods:
|
|
1246
|
+
pods = pods + list(running_pods.values())
|
|
985
1247
|
|
|
986
|
-
networking_mode = network_utils.get_networking_mode(
|
|
987
|
-
config.provider_config.get('networking_mode'), context)
|
|
988
|
-
if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
|
|
989
|
-
# Adding the jump pod to the new_nodes list as well so it can be
|
|
990
|
-
# checked if it's scheduled and running along with other pods.
|
|
991
|
-
ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
|
|
992
|
-
jump_pod = kubernetes.core_api(context).read_namespaced_pod(
|
|
993
|
-
ssh_jump_pod_name, namespace)
|
|
994
|
-
pods.append(jump_pod)
|
|
995
1248
|
provision_timeout = provider_config['timeout']
|
|
996
1249
|
|
|
997
1250
|
wait_str = ('indefinitely'
|
|
@@ -1001,12 +1254,21 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
1001
1254
|
|
|
1002
1255
|
# Wait until the pods are scheduled and surface cause for error
|
|
1003
1256
|
# if there is one
|
|
1004
|
-
_wait_for_pods_to_schedule(namespace, context, pods, provision_timeout
|
|
1257
|
+
_wait_for_pods_to_schedule(namespace, context, pods, provision_timeout,
|
|
1258
|
+
cluster_name, create_pods_start)
|
|
1259
|
+
# Reset spinner message here because it might have hinted autoscaling
|
|
1260
|
+
# while waiting for pods to schedule.
|
|
1261
|
+
rich_utils.force_update_status(
|
|
1262
|
+
ux_utils.spinner_message('Launching', cluster_name=cluster_name))
|
|
1005
1263
|
# Wait until the pods and their containers are up and running, and
|
|
1006
1264
|
# fail early if there is an error
|
|
1007
|
-
logger.debug(f'run_instances: waiting for pods to be running
|
|
1008
|
-
f'
|
|
1009
|
-
_wait_for_pods_to_run(namespace, context, pods)
|
|
1265
|
+
logger.debug(f'run_instances: waiting for pods to be running: '
|
|
1266
|
+
f'{[pod.metadata.name for pod in pods]}')
|
|
1267
|
+
_wait_for_pods_to_run(namespace, context, cluster_name, pods)
|
|
1268
|
+
# Reset spinner message here because it might have hinted the reason
|
|
1269
|
+
# pods were pending.
|
|
1270
|
+
rich_utils.force_update_status(
|
|
1271
|
+
ux_utils.spinner_message('Launching', cluster_name=cluster_name))
|
|
1010
1272
|
logger.debug(f'run_instances: all pods are scheduled and running: '
|
|
1011
1273
|
f'{[pod.metadata.name for pod in pods]}')
|
|
1012
1274
|
|
|
@@ -1022,11 +1284,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
1022
1284
|
)
|
|
1023
1285
|
|
|
1024
1286
|
|
|
1025
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
1287
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
1026
1288
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
1027
1289
|
"""Runs instances for the given cluster."""
|
|
1028
1290
|
try:
|
|
1029
|
-
return _create_pods(region, cluster_name_on_cloud, config)
|
|
1291
|
+
return _create_pods(region, cluster_name, cluster_name_on_cloud, config)
|
|
1030
1292
|
except (kubernetes.api_exception(), config_lib.KubernetesError) as e:
|
|
1031
1293
|
e_msg = common_utils.format_exception(e).replace('\n', ' ')
|
|
1032
1294
|
logger.warning('run_instances: Error occurred when creating pods: '
|
|
@@ -1150,18 +1412,6 @@ def terminate_instances(
|
|
|
1150
1412
|
ray_tag_filter(cluster_name_on_cloud),
|
|
1151
1413
|
None)
|
|
1152
1414
|
|
|
1153
|
-
# Clean up the SSH jump pod if in use
|
|
1154
|
-
networking_mode = network_utils.get_networking_mode(
|
|
1155
|
-
provider_config.get('networking_mode'), context)
|
|
1156
|
-
if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
|
|
1157
|
-
pod_name = list(pods.keys())[0]
|
|
1158
|
-
try:
|
|
1159
|
-
kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context,
|
|
1160
|
-
pod_name)
|
|
1161
|
-
except Exception as e: # pylint: disable=broad-except
|
|
1162
|
-
logger.warning('terminate_instances: Error occurred when analyzing '
|
|
1163
|
-
f'SSH Jump pod: {e}')
|
|
1164
|
-
|
|
1165
1415
|
if is_high_availability_cluster_by_kubectl(cluster_name_on_cloud, context,
|
|
1166
1416
|
namespace):
|
|
1167
1417
|
# For high availability controllers, terminate the deployment
|
|
@@ -1192,19 +1442,11 @@ def get_cluster_info(
|
|
|
1192
1442
|
|
|
1193
1443
|
running_pods = kubernetes_utils.filter_pods(
|
|
1194
1444
|
namespace, context, ray_tag_filter(cluster_name_on_cloud), ['Running'])
|
|
1445
|
+
logger.debug(f'Running pods: {list(running_pods.keys())}')
|
|
1195
1446
|
|
|
1196
1447
|
pods: Dict[str, List[common.InstanceInfo]] = {}
|
|
1197
1448
|
head_pod_name = None
|
|
1198
1449
|
|
|
1199
|
-
port_forward_mode = kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD
|
|
1200
|
-
network_mode_str = skypilot_config.get_effective_region_config(
|
|
1201
|
-
cloud='kubernetes',
|
|
1202
|
-
region=context,
|
|
1203
|
-
keys=('networking_mode',),
|
|
1204
|
-
default_value=port_forward_mode.value)
|
|
1205
|
-
network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
|
|
1206
|
-
network_mode_str)
|
|
1207
|
-
external_ip = kubernetes_utils.get_external_ip(network_mode, context)
|
|
1208
1450
|
port = 22
|
|
1209
1451
|
if not provider_config.get('use_internal_ips', False):
|
|
1210
1452
|
port = kubernetes_utils.get_head_ssh_port(cluster_name_on_cloud,
|
|
@@ -1218,10 +1460,12 @@ def get_cluster_info(
|
|
|
1218
1460
|
common.InstanceInfo(
|
|
1219
1461
|
instance_id=pod_name,
|
|
1220
1462
|
internal_ip=internal_ip,
|
|
1221
|
-
external_ip=
|
|
1222
|
-
external_ip),
|
|
1463
|
+
external_ip=None,
|
|
1223
1464
|
ssh_port=port,
|
|
1224
1465
|
tags=pod.metadata.labels,
|
|
1466
|
+
# TODO(hailong): `cluster.local` may need to be configurable
|
|
1467
|
+
# Service name is same as the pod name for now.
|
|
1468
|
+
internal_svc=f'{pod_name}.{namespace}.svc.cluster.local',
|
|
1225
1469
|
)
|
|
1226
1470
|
]
|
|
1227
1471
|
if _is_head(pod):
|
|
@@ -1230,10 +1474,16 @@ def get_cluster_info(
             assert head_spec is not None, pod
             cpu_request = head_spec.containers[0].resources.requests['cpu']
 
-
+    if cpu_request is None:
+        raise RuntimeError(f'Pod {cluster_name_on_cloud}-head not found'
+                           ' or not Running, check the Pod status')
 
     ssh_user = 'sky'
-
+    # Use pattern matching to extract SSH user, handling MOTD contamination.
+    # Some container images (like CUDA-Q) print MOTD when login shells start,
+    # which can contaminate command output. We use a unique pattern to extract
+    # the actual username reliably.
+    get_k8s_ssh_user_cmd = 'echo "SKYPILOT_SSH_USER: $(whoami)"'
     assert head_pod_name is not None
     runner = command_runner.KubernetesCommandRunner(
         ((namespace, context), head_pod_name))
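The marker-based `whoami` command is designed so the username can be recovered even when the image prints an MOTD before the command output. A self-contained sketch of the extraction side, using an illustrative regex in place of the module's `_SSH_USER_PATTERN` (whose exact definition is not shown in this hunk):

```python
import re

# Illustrative stand-in for _SSH_USER_PATTERN; the real constant is defined
# elsewhere in the module and may differ.
SSH_USER_PATTERN = re.compile(r'SKYPILOT_SSH_USER: ([\w-]+)')

# Simulated command output where an MOTD precedes the marker line.
stdout = 'Welcome to the CUDA-Q container!\nSKYPILOT_SSH_USER: sky\n'

match = SSH_USER_PATTERN.search(stdout)
ssh_user = match.group(1) if match else None
assert ssh_user == 'sky'
```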
@@ -1243,10 +1493,24 @@ def get_cluster_info(
                                                 stream_logs=False)
     _raise_command_running_error('get ssh user', get_k8s_ssh_user_cmd,
                                  head_pod_name, rc, stdout + stderr)
-
+
+    # Extract SSH user using pattern matching
+    ssh_user_match = _SSH_USER_PATTERN.search(stdout)
+    if ssh_user_match:
+        ssh_user = ssh_user_match.group(1)
+    else:
+        raise ValueError('Failed to find SSH user identifier: '
+                         f'{stdout + stderr}')
     logger.debug(
         f'Using ssh user {ssh_user} for cluster {cluster_name_on_cloud}')
 
+    # cpu_request may be a string like `100m`, need to parse and convert
+    num_cpus = kubernetes_utils.parse_cpu_or_gpu_resource_to_float(cpu_request)
+    # 'num-cpus' for ray must be an integer, but we should not set it to 0 if
+    # cpus is <1.
+    # Keep consistent with the logic in clouds/kubernetes.py
+    str_cpus = str(max(int(num_cpus), 1))
+
     return common.ClusterInfo(
         instances=pods,
         head_instance_id=head_pod_name,
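The `num-cpus` value handed to Ray is derived from the head container's CPU request, which Kubernetes may express in millicores (e.g. `100m`). A minimal sketch of the parse-and-clamp logic, with a simplified stand-in for `kubernetes_utils.parse_cpu_or_gpu_resource_to_float` (the real helper covers more formats):

```python
def parse_cpu_quantity(cpu_request: str) -> float:
    """Simplified CPU-quantity parser: '100m' -> 0.1, '2' -> 2.0.

    Stand-in for kubernetes_utils.parse_cpu_or_gpu_resource_to_float.
    """
    value = str(cpu_request)
    if value.endswith('m'):
        return int(value[:-1]) / 1000
    return float(value)


num_cpus = parse_cpu_quantity('100m')  # 0.1 CPU requested
str_cpus = str(max(int(num_cpus), 1))  # Ray's num-cpus must be an int >= 1
assert str_cpus == '1'
```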
@@ -1256,16 +1520,52 @@ def get_cluster_info(
         # problems for other pods.
         custom_ray_options={
             'object-store-memory': 500000000,
-            'num-cpus':
+            'num-cpus': str_cpus,
         },
         provider_name='kubernetes',
         provider_config=provider_config)
 
 
 def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
-    """Get pod termination reason and write to cluster events.
-
+    """Get pod termination reason and write to cluster events.
+
+    Checks both pod conditions (for preemption/disruption) and
+    container statuses (for exit codes/errors).
+    """
     latest_timestamp = pod.status.start_time or datetime.datetime.min
+    ready_state = 'Unknown'
+    termination_reason = 'Terminated unexpectedly'
+    container_reasons = []
+
+    # Check pod status conditions for high level overview.
+    # No need to sort, as each condition.type will only appear once.
+    for condition in pod.status.conditions:
+        reason = condition.reason or 'Unknown reason'
+        message = condition.message or ''
+
+        # Get last known readiness state.
+        if condition.type == 'Ready':
+            ready_state = f'{reason} ({message})' if message else reason
+        # Kueue preemption, as defined in:
+        # https://pkg.go.dev/sigs.k8s.io/kueue/pkg/controller/jobs/pod#pkg-constants
+        elif condition.type == 'TerminationTarget':
+            termination_reason = f'Preempted by Kueue: {reason}'
+            if message:
+                termination_reason += f' ({message})'
+        # Generic disruption.
+        elif condition.type == 'DisruptionTarget':
+            termination_reason = f'Disrupted: {reason}'
+            if message:
+                termination_reason += f' ({message})'
+
+        if condition.last_transition_time is not None:
+            latest_timestamp = max(latest_timestamp,
+                                   condition.last_transition_time)
+
+    pod_reason = (f'{termination_reason}.\n'
+                  f'Last known state: {ready_state}.')
+
+    # Check container statuses for exit codes/errors
     if pod.status and pod.status.container_statuses:
         for container_status in pod.status.container_statuses:
             terminated = container_status.state.terminated
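The condition scan above distinguishes Kueue preemptions (`TerminationTarget`) from generic disruptions (`DisruptionTarget`) and keeps the last `Ready` state for context. A self-contained sketch of that classification on plain objects; the condition types are the ones the diff checks, the sample data is made up, and message handling is trimmed for brevity:

```python
from types import SimpleNamespace


def summarize_conditions(conditions):
    """Reduce pod conditions to a one-line termination summary."""
    ready_state = 'Unknown'
    termination_reason = 'Terminated unexpectedly'
    for cond in conditions:
        reason = cond.reason or 'Unknown reason'
        message = cond.message or ''
        if cond.type == 'Ready':
            ready_state = f'{reason} ({message})' if message else reason
        elif cond.type == 'TerminationTarget':  # Kueue preemption
            termination_reason = f'Preempted by Kueue: {reason}'
        elif cond.type == 'DisruptionTarget':   # generic disruption
            termination_reason = f'Disrupted: {reason}'
    return f'{termination_reason}. Last known state: {ready_state}.'


conditions = [
    SimpleNamespace(type='Ready', reason='PodFailed', message=''),
    SimpleNamespace(type='DisruptionTarget', reason='EvictionByEvictionAPI',
                    message='node drained'),
]
print(summarize_conditions(conditions))
# Disrupted: EvictionByEvictionAPI. Last known state: PodFailed.
```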
@@ -1280,18 +1580,15 @@ def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
                 if reason is None:
                     # just in-case reason is None, have default for debugging
                     reason = f'exit({exit_code})'
-
-
-                latest_timestamp = terminated.finished_at
+                container_reasons.append(reason)
+                latest_timestamp = max(latest_timestamp, terminated.finished_at)
 
     # TODO (kyuds): later, if needed, query `last_state` too.
 
-    if not reasons:
-        return ''
-
     # Normally we will have a single container per pod for skypilot
     # but doing this just in-case there are multiple containers.
-
+    if container_reasons:
+        pod_reason += f'\nContainer errors: {" | ".join(container_reasons)}'
 
     global_user_state.add_cluster_event(
         cluster_name,
@@ -1303,21 +1600,56 @@ def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
     return pod_reason
 
 
-def
-
-    """Get events for
-    logger.debug(f'Analyzing events for pod {pod_name}')
+def _get_pod_events(context: Optional[str], namespace: str,
+                    pod_name: str) -> List[Any]:
+    """Get the events for a pod, sorted by timestamp, most recent first."""
     pod_field_selector = (
         f'involvedObject.kind=Pod,involvedObject.name={pod_name}')
     pod_events = kubernetes.core_api(context).list_namespaced_event(
         namespace,
         field_selector=pod_field_selector,
         _request_timeout=kubernetes.API_TIMEOUT).items
-
+    return sorted(
         pod_events,
         key=lambda event: event.metadata.creation_timestamp,
         # latest event appears first
         reverse=True)
+
+
+def _get_pod_pending_reason(context: Optional[str], namespace: str,
+                            pod_name: str) -> Optional[Tuple[str, str]]:
+    """Get the reason why a pod is pending from its events.
+
+    Returns a (reason, message) tuple about why the pod is pending (e.g.,
+    ("FailedMount", "hostPath type check failed")) or None if no reason found.
+    """
+    try:
+        pod_events = _get_pod_events(context, namespace, pod_name)
+    except Exception as e:  # pylint: disable=broad-except
+        logger.debug(f'Failed to get events for pod {pod_name}: {e}')
+        return None
+
+    if not pod_events:
+        return None
+
+    for event in pod_events:
+        # Omit common events that does not indicate a pending reason.
+        # We could also filter by event type 'Warning' or 'Error',
+        # but there might be useful 'Normal' events such as pulling
+        # image that we want to surface to the user.
+        if event.reason not in COMMON_NON_PENDING_EVENT_REASONS:
+            reason = event.reason or 'Unknown'
+            message = event.message or ''
+            return reason, message
+
+    return None
+
+
+def _get_pod_missing_reason(context: Optional[str], namespace: str,
+                            cluster_name: str, pod_name: str) -> Optional[str]:
+    """Get events for missing pod and write to cluster events."""
+    logger.debug(f'Analyzing events for pod {pod_name}')
+    pod_events = _get_pod_events(context, namespace, pod_name)
     last_scheduled_node = None
     insert_new_pod_event = True
     new_event_inserted = False
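`_get_pod_pending_reason` walks the pod's events newest-first and skips routine reasons so the first informative event is surfaced. A sketch of that selection on dummy event objects; the contents of `COMMON_NON_PENDING_EVENT_REASONS` shown here are only a plausible guess at the module-level constant:

```python
import datetime
from types import SimpleNamespace

# Plausible guess at routine reasons to skip; the real constant is defined
# elsewhere in the module.
COMMON_NON_PENDING_EVENT_REASONS = {'Scheduled', 'Pulled', 'Created', 'Started'}


def first_pending_reason(events):
    """Return (reason, message) of the newest non-routine event, else None."""
    events = sorted(events,
                    key=lambda e: e.metadata.creation_timestamp,
                    reverse=True)  # newest first, as in _get_pod_events
    for event in events:
        if event.reason not in COMMON_NON_PENDING_EVENT_REASONS:
            return event.reason or 'Unknown', event.message or ''
    return None


def _event(reason, message, ts):
    return SimpleNamespace(reason=reason, message=message,
                           metadata=SimpleNamespace(creation_timestamp=ts))


events = [
    _event('Scheduled', 'assigned to node-1',
           datetime.datetime(2025, 1, 1, 0, 0)),
    _event('FailedMount', 'hostPath type check failed',
           datetime.datetime(2025, 1, 1, 0, 5)),
]
assert first_pending_reason(events) == ('FailedMount',
                                        'hostPath type check failed')
```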
@@ -1436,35 +1768,50 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
     return failure_reason
 
 
-def
-
-
-
-        non_terminated_only: bool = True
-) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
-    # Mapping from pod phase to skypilot status. These are the only valid pod
-    # phases.
-    # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
-    status_map = {
-        'Pending': status_lib.ClusterStatus.INIT,
-        'Running': status_lib.ClusterStatus.UP,
-        'Failed': status_lib.ClusterStatus.INIT,
-        'Unknown': None,
-        'Succeeded': None,
-    }
-
-    assert provider_config is not None
-    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
-    context = kubernetes_utils.get_context_from_config(provider_config)
-    is_ssh = context.startswith('ssh-') if context else False
-    identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'
-
-    # Get all the pods with the label skypilot-cluster: <cluster_name>
+def list_namespaced_pod(context: Optional[str], namespace: str,
+                        cluster_name_on_cloud: str, is_ssh: bool, identity: str,
+                        label_selector: str) -> List[Any]:
+    # Get all the pods with the label skypilot-cluster-name: <cluster_name>
     try:
-
+        # log the query parameters we pass to the k8s api
+        logger.debug(f'Querying k8s api for pods:\n'
+                     f'context: {context}\n'
+                     f'namespace: {namespace}\n'
+                     f'label selector:`{label_selector}`.')
+
+        response = kubernetes.core_api(context).list_namespaced_pod(
             namespace,
-            label_selector=
-            _request_timeout=kubernetes.API_TIMEOUT)
+            label_selector=label_selector,
+            _request_timeout=kubernetes.API_TIMEOUT)
+
+        # log PodList response info
+        if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+            logger.debug(f'k8s api response for `{label_selector}`:\n'
+                         f'apiVersion={response.api_version}, '
+                         f'kind={response.kind},\n'
+                         f'metadata={response.metadata}')
+
+        pods = response.items
+
+        # log detailed Pod info
+        if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+            logger.debug(f'k8s api response for `{label_selector}`: '
+                         f'len(pods)={len(pods)}')
+            for pod in pods:
+                logger.debug(f'k8s pod info for `{label_selector}`: '
+                             f'pod.apiVersion={pod.api_version}, '
+                             f'pod.kind={pod.kind}, \n'
+                             f'pod.name={pod.metadata.name}, '
+                             f'pod.namespace={pod.metadata.namespace}, \n'
+                             f'pod.labels={pod.metadata.labels}, \n'
+                             f'pod.annotations={pod.metadata.annotations}, \n'
+                             'pod.creationTimestamp='
+                             f'{pod.metadata.creation_timestamp}, '
+                             'pod.deletionTimestamp='
+                             f'{pod.metadata.deletion_timestamp}, \n'
+                             f'pod.status={pod.status}')
+        return pods
+
     except kubernetes.max_retry_error():
         with ux_utils.print_exception_no_traceback():
             if is_ssh:
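The new `list_namespaced_pod` helper is a thin wrapper around the Kubernetes Python client's call of the same name, filtered by a SkyPilot cluster label. A hedged standalone sketch using the official `kubernetes` package directly; the label key and timeout value are illustrative (the package builds the selector from `constants.TAG_SKYPILOT_CLUSTER_NAME`):

```python
from typing import Any, List, Optional

from kubernetes import client, config


def list_cluster_pods(namespace: str, cluster_name_on_cloud: str,
                      context: Optional[str] = None) -> List[Any]:
    """List pods belonging to one SkyPilot cluster via a label selector."""
    config.load_kube_config(context=context)  # or load_incluster_config()
    label_selector = f'skypilot-cluster-name={cluster_name_on_cloud}'
    response = client.CoreV1Api().list_namespaced_pod(
        namespace,
        label_selector=label_selector,
        _request_timeout=30)  # illustrative timeout, not the package's value
    return response.items
```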
@@ -1488,14 +1835,63 @@ def query_instances(
                     f'Failed to query {identity} {cluster_name_on_cloud!r} '
                     f'status: {common_utils.format_exception(e)}')
 
+
+def query_instances(
+    cluster_name: str,
+    cluster_name_on_cloud: str,
+    provider_config: Optional[Dict[str, Any]] = None,
+    non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+    # Mapping from pod phase to skypilot status. These are the only valid pod
+    # phases.
+    # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
+    status_map = {
+        'Pending': status_lib.ClusterStatus.INIT,
+        'Running': status_lib.ClusterStatus.UP,
+        'Failed': status_lib.ClusterStatus.INIT,
+        'Unknown': None,
+        'Succeeded': None,
+    }
+
+    assert provider_config is not None
+    namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+    context = kubernetes_utils.get_context_from_config(provider_config)
+    is_ssh = context.startswith('ssh-') if context else False
+    identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'
+    label_selector = (f'{constants.TAG_SKYPILOT_CLUSTER_NAME}='
+                      f'{cluster_name_on_cloud}')
+
+    attempts = 0
+    pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
+                               is_ssh, identity, label_selector)
+    # When we see no pods returned from the k8s api, we assume the pods have
+    # been terminated by the user directly and mark the cluster as terminated
+    # in the global user state.
+    # We add retry logic here as an attempt to mitigate a leak caused by the
+    # kubernetes api returning no pods despite the pods actually existing.
+    while (retry_if_missing and not pods and
+           attempts < _MAX_QUERY_INSTANCES_RETRIES):
+        logger.debug(f'Retrying to query k8s api for {cluster_name_on_cloud} '
+                     f'{attempts}/{_MAX_QUERY_INSTANCES_RETRIES} times.'
+                     f'after {_QUERY_INSTANCES_RETRY_INTERVAL} seconds.')
+        time.sleep(_QUERY_INSTANCES_RETRY_INTERVAL)
+        attempts += 1
+        pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
+                                   is_ssh, identity, label_selector)
+        if len(pods) > 0:
+            logger.info(f'Found {len(pods)} pods for {label_selector} after'
+                        f'{attempts} retries.')
+
     # Check if the pods are running or pending
     cluster_status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
                                     Optional[str]]] = {}
     for pod in pods:
         phase = pod.status.phase
+        is_terminating = pod.metadata.deletion_timestamp is not None
         pod_status = status_map[phase]
         reason = None
-        if phase in ('Failed', 'Unknown'):
+        if phase in ('Failed', 'Unknown') or is_terminating:
            reason = _get_pod_termination_reason(pod, cluster_name)
            logger.debug(f'Pod Status ({phase}) Reason(s): {reason}')
        if non_terminated_only and pod_status is None:
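The new `retry_if_missing` path re-queries when the API unexpectedly returns no pods, sleeping between attempts, so a live cluster is not prematurely marked as terminated. A generic sketch of that pattern; the retry count and interval here are stand-ins for the module's `_MAX_QUERY_INSTANCES_RETRIES` and `_QUERY_INSTANCES_RETRY_INTERVAL`, whose actual values are not shown in this hunk:

```python
import time
from typing import Any, Callable, List


def query_with_retry(list_pods: Callable[[], List[Any]],
                     max_retries: int = 3,
                     interval_seconds: float = 2.0) -> List[Any]:
    """Re-run a pod query while it returns nothing, up to max_retries times.

    max_retries / interval_seconds are illustrative stand-ins for
    _MAX_QUERY_INSTANCES_RETRIES and _QUERY_INSTANCES_RETRY_INTERVAL.
    """
    pods = list_pods()
    attempts = 0
    while not pods and attempts < max_retries:
        # An empty result may be a transient API hiccup rather than a real
        # termination, so wait and ask again before giving up.
        time.sleep(interval_seconds)
        attempts += 1
        pods = list_pods()
    return pods
```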