skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0

--- sky/provision/kubernetes/utils.py
+++ sky/provision/kubernetes/utils.py
@@ -1,4 +1,5 @@
 """Kubernetes utilities for SkyPilot."""
+import collections
 import copy
 import dataclasses
 import datetime
@@ -13,8 +14,10 @@ import shutil
 import subprocess
 import time
 import typing
-from typing import (Any, Callable, Dict, List, Optional, Set, Tuple,
-                    Union)
+from typing import (Any, Callable, Dict, List, Literal, Optional, Set, Tuple,
+                    Union)
+
+import ijson

 from sky import clouds
 from sky import exceptions
@@ -32,7 +35,6 @@ from sky.skylet import constants
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import config_utils
-from sky.utils import directory_utils
 from sky.utils import env_options
 from sky.utils import kubernetes_enums
 from sky.utils import schemas
@@ -61,6 +63,8 @@ HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
 # and store all data that needs to be persisted in future.
 HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'

+IJSON_BUFFER_SIZE = 64 * 1024  # 64KB, default from ijson
+

 class KubernetesHighPerformanceNetworkType(enum.Enum):
     """Enum for different Kubernetes cluster types with high performance
@@ -106,8 +110,9 @@ class KubernetesHighPerformanceNetworkType(enum.Enum):
             return {
                 'NCCL_SOCKET_IFNAME': 'eth0',
                 'NCCL_IB_HCA': 'ibp',
-                '
-
+                # Restrict UCX to TCP to avoid unneccsary errors. NCCL doesn't use UCX
+                'UCX_TLS': 'tcp',
+                'UCX_NET_DEVICES': 'eth0',
             }
         else:
             # GCP clusters and generic clusters - environment variables are
@@ -235,6 +240,40 @@ def normalize_tpu_accelerator_name(accelerator: str) -> Tuple[str, int]:
     return accelerator, 1


+def _is_cloudflare_403_error(exception: Exception) -> bool:
+    """Check if an exception is a transient CloudFlare 403 error.
+
+    CloudFlare proxy 403 errors with CF-specific headers are transient and
+    should be retried, unlike real RBAC 403 errors.
+
+    Args:
+        exception: The exception to check
+
+    Returns:
+        True if this is a CloudFlare 403 error that should be retried
+    """
+    if not isinstance(exception, kubernetes.api_exception()):
+        return False
+
+    # Only check for 403 errors
+    if exception.status != 403:
+        return False
+
+    # Check for CloudFlare-specific headers
+    headers = exception.headers if hasattr(exception, 'headers') else {}
+    if not headers:
+        return False
+
+    # CloudFlare errors have CF-RAY header and/or Server: cloudflare
+    for k, v in headers.items():
+        if 'cf-ray' in k.lower():
+            return True
+        if 'server' in k.lower() and 'cloudflare' in str(v).lower():
+            return True
+
+    return False
+
+
 def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
                     retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
                     resource_type: Optional[str] = None):
@@ -269,19 +308,25 @@ def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
                     kubernetes.api_exception(),
                     kubernetes.config_exception()) as e:
                 last_exception = e
+
+                # Check if this is a CloudFlare transient 403 error
+                is_cloudflare_403 = _is_cloudflare_403_error(e)
+
                 # Don't retry on permanent errors like 401 (Unauthorized)
-                # or 403 (Forbidden)
+                # or 403 (Forbidden), unless it's a CloudFlare transient 403
                 if (isinstance(e, kubernetes.api_exception()) and
-                        e.status in (401, 403)):
+                        e.status in (401, 403) and not is_cloudflare_403):
                     # Raise KubeAPIUnreachableError exception so that the
                     # optimizer/provisioner can failover to other clouds.
                     raise exceptions.KubeAPIUnreachableError(
                         f'Kubernetes API error: {str(e)}') from e
                 if attempt < max_retries - 1:
                     sleep_time = backoff.current_backoff()
-
-
-
+                    error_type = 'CloudFlare 403' if is_cloudflare_403 else 'error'
+                    logger.debug(
+                        f'Kubernetes API call {func.__name__} '
+                        f'failed with {error_type} {str(e)}. Retrying in '
+                        f'{sleep_time:.1f}s...')
                     time.sleep(sleep_time)
                     continue

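
The two hunks above teach SkyPilot's Kubernetes retry decorator to treat CloudFlare-proxied 403 responses (identified by a CF-RAY header or a Server: cloudflare header) as transient and retryable, while genuine RBAC 403s still fail fast. A minimal, self-contained sketch of the same header check, using a hypothetical stand-in exception class rather than the real kubernetes client exception:

    # Sketch only: FakeApiException stands in for the kubernetes client's
    # ApiException, which carries `status` and `headers` the same way.
    class FakeApiException(Exception):

        def __init__(self, status, headers):
            self.status = status
            self.headers = headers

    def is_cloudflare_403(exc) -> bool:
        if getattr(exc, 'status', None) != 403:
            return False
        headers = getattr(exc, 'headers', None) or {}
        for key, value in headers.items():
            if 'cf-ray' in key.lower():
                return True  # CF-RAY marks a CloudFlare-proxied response
            if 'server' in key.lower() and 'cloudflare' in str(value).lower():
                return True
        return False

    assert is_cloudflare_403(FakeApiException(403, {'CF-RAY': '8c1a2b3c-SJC'}))
    assert not is_cloudflare_403(FakeApiException(403, {}))  # real RBAC denial
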
@@ -451,6 +496,9 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):

     LABEL_KEY = 'gpu.nvidia.com/class'

+    # TODO (kyuds): fill in more label values for different accelerators.
+    ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}
+
     @classmethod
     def get_label_key(cls, accelerator: Optional[str] = None) -> str:
         return cls.LABEL_KEY
@@ -469,7 +517,8 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):

     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:
-        return value
+        # return original label value if not found in mappings.
+        return cls.ACC_VALUE_MAPPINGS.get(value, value)


 class GKELabelFormatter(GPULabelFormatter):
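
The CoreWeave formatter change above canonicalizes known label values through ACC_VALUE_MAPPINGS and passes unknown ones through unchanged, via the dict.get(key, default) idiom. In isolation (the unmapped example value below is illustrative, not from the source):

    # Map known CoreWeave label values to canonical accelerator names;
    # anything unmapped falls back to the raw label value.
    ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}

    def canonicalize(value: str) -> str:
        return ACC_VALUE_MAPPINGS.get(value, value)

    print(canonicalize('H100_NVLINK_80GB'))  # -> 'H100'
    print(canonicalize('A100_PCIE_40GB'))    # -> 'A100_PCIE_40GB' (unmapped)
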
@@ -689,6 +738,7 @@ def detect_gpu_label_formatter(
         for label, value in node.metadata.labels.items():
             node_labels[node.metadata.name].append((label, value))

+    invalid_label_values: List[Tuple[str, str, str, str]] = []
     # Check if the node labels contain any of the GPU label prefixes
     for lf in LABEL_FORMATTER_REGISTRY:
         skip = False
@@ -702,11 +752,8 @@
                 if valid:
                     return lf(), node_labels
                 else:
-
-
-                        f'but has invalid value {value}. '
-                        f'Reason: {reason}. '
-                        'Skipping...')
+                    invalid_label_values.append(
+                        (label, lf.__name__, value, reason))
                     skip = True
                     break
         if skip:
@@ -714,6 +761,13 @@
         if skip:
             continue

+    for label, lf_name, value, reason in invalid_label_values:
+        logger.warning(f'GPU label {label} matched for label '
+                       f'formatter {lf_name}, '
+                       f'but has invalid value {value}. '
+                       f'Reason: {reason}. '
+                       'Skipping...')
+
     return None, node_labels


@@ -1012,15 +1066,16 @@ class GKEAutoscaler(Autoscaler):
         to fit the instance type.
         """
         for accelerator in node_pool_accelerators:
+            raw_value = accelerator['acceleratorType']
             node_accelerator_type = (
-                GKELabelFormatter.get_accelerator_from_label_value(
-                    accelerator['acceleratorType']))
+                GKELabelFormatter.get_accelerator_from_label_value(raw_value))
             # handle heterogenous nodes.
             if not node_accelerator_type:
                 continue
             node_accelerator_count = accelerator['acceleratorCount']
-
-
+            viable_names = [node_accelerator_type.lower(), raw_value.lower()]
+            if (requested_gpu_type.lower() in viable_names and
+                    int(node_accelerator_count) >= requested_gpu_count):
                 return True
         return False

@@ -1137,9 +1192,76 @@ def detect_accelerator_resource(
     return has_accelerator, cluster_resources


+@dataclasses.dataclass
+class V1ObjectMeta:
+    name: str
+    labels: Dict[str, str]
+    namespace: str = ''  # Used for pods, not nodes
+
+
+@dataclasses.dataclass
+class V1NodeAddress:
+    type: str
+    address: str
+
+
+@dataclasses.dataclass
+class V1NodeCondition:
+    """Represents a Kubernetes node condition."""
+    type: str
+    status: str
+
+
+@dataclasses.dataclass
+class V1NodeStatus:
+    allocatable: Dict[str, str]
+    capacity: Dict[str, str]
+    addresses: List[V1NodeAddress]
+    conditions: List[V1NodeCondition]
+
+
+@dataclasses.dataclass
+class V1Node:
+    """Represents a Kubernetes node."""
+    metadata: V1ObjectMeta
+    status: V1NodeStatus
+
+    @classmethod
+    def from_dict(cls, data: dict) -> 'V1Node':
+        """Create V1Node from a dictionary."""
+        return cls(metadata=V1ObjectMeta(
+            name=data['metadata']['name'],
+            labels=data['metadata'].get('labels', {}),
+        ),
+                   status=V1NodeStatus(
+                       allocatable=data['status']['allocatable'],
+                       capacity=data['status']['capacity'],
+                       addresses=[
+                           V1NodeAddress(type=addr['type'],
+                                         address=addr['address'])
+                           for addr in data['status'].get('addresses', [])
+                       ],
+                       conditions=[
+                           V1NodeCondition(type=cond['type'],
+                                           status=cond['status'])
+                           for cond in data['status'].get('conditions', [])
+                       ]))
+
+    def is_ready(self) -> bool:
+        """Check if the node is ready based on its conditions.
+
+        A node is considered ready if it has a 'Ready' condition with
+        status 'True'.
+        """
+        for condition in self.status.conditions:
+            if condition.type == 'Ready':
+                return condition.status == 'True'
+        return False
+
+
 @annotations.lru_cache(scope='request', maxsize=10)
 @_retry_on_error(resource_type='node')
-def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[Any]:
+def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[V1Node]:
     """Gets the kubernetes nodes in the context.

     If context is None, gets the nodes in the current context.
@@ -1147,25 +1269,113 @@ def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[Any]:
     if context is None:
         context = get_current_kube_config_context_name()

-
-
+    # Return raw urllib3.HTTPResponse object so that we can parse the json
+    # more efficiently.
+    response = kubernetes.core_api(context).list_node(
+        _request_timeout=kubernetes.API_TIMEOUT, _preload_content=False)
+    try:
+        nodes = [
+            V1Node.from_dict(item_dict) for item_dict in ijson.items(
+                response, 'items.item', buf_size=IJSON_BUFFER_SIZE)
+        ]
+    finally:
+        response.release_conn()
+
     return nodes


-@
-
-
-
-
+@dataclasses.dataclass
+class V1PodStatus:
+    phase: str
+
+
+@dataclasses.dataclass
+class V1ResourceRequirements:
+    requests: Optional[Dict[str, str]]
+
+
+@dataclasses.dataclass
+class V1Container:
+    resources: V1ResourceRequirements
+
+
+@dataclasses.dataclass
+class V1PodSpec:
+    containers: List[V1Container]
+    node_name: Optional[str]
+
+
+@dataclasses.dataclass
+class V1Pod:
+    metadata: V1ObjectMeta
+    status: V1PodStatus
+    spec: V1PodSpec
+
+    @classmethod
+    def from_dict(cls, data: dict) -> 'V1Pod':
+        """Create V1Pod from a dictionary."""
+        return cls(metadata=V1ObjectMeta(
+            name=data['metadata']['name'],
+            labels=data['metadata'].get('labels', {}),
+            namespace=data['metadata'].get('namespace'),
+        ),
+                   status=V1PodStatus(phase=data['status'].get('phase'),),
+                   spec=V1PodSpec(
+                       node_name=data['spec'].get('nodeName'),
+                       containers=[
+                           V1Container(resources=V1ResourceRequirements(
+                               requests=container.get('resources', {}).get(
+                                   'requests') or None))
+                           for container in data['spec'].get('containers', [])
+                       ]))
+

-
+@_retry_on_error(resource_type='pod')
+def get_allocated_gpu_qty_by_node(
+    *,
+    context: Optional[str] = None,
+) -> Dict[str, int]:
+    """Gets allocated GPU quantity by each node by fetching pods in
+    all namespaces in kubernetes cluster indicated by context.
     """
     if context is None:
         context = get_current_kube_config_context_name()
+    non_included_pod_statuses = POD_STATUSES.copy()
+    status_filters = ['Running', 'Pending']
+    if status_filters is not None:
+        non_included_pod_statuses -= set(status_filters)
+    field_selector = ','.join(
+        [f'status.phase!={status}' for status in non_included_pod_statuses])

-
-
-
+    # Return raw urllib3.HTTPResponse object so that we can parse the json
+    # more efficiently.
+    response = kubernetes.core_api(context).list_pod_for_all_namespaces(
+        _request_timeout=kubernetes.API_TIMEOUT,
+        _preload_content=False,
+        field_selector=field_selector)
+    try:
+        allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+        for item_dict in ijson.items(response,
+                                     'items.item',
+                                     buf_size=IJSON_BUFFER_SIZE):
+            pod = V1Pod.from_dict(item_dict)
+            if should_exclude_pod_from_gpu_allocation(pod):
+                logger.debug(
+                    f'Excluding pod {pod.metadata.name} from GPU count '
+                    f'calculations on node {pod.spec.node_name}')
+                continue
+            # Iterate over all the containers in the pod and sum the
+            # GPU requests
+            pod_allocated_qty = 0
+            for container in pod.spec.containers:
+                if container.resources.requests:
+                    pod_allocated_qty += get_node_accelerator_count(
+                        context, container.resources.requests)
+            if pod_allocated_qty > 0 and pod.spec.node_name:
+                allocated_qty_by_node[pod.spec.node_name] += pod_allocated_qty
+        return allocated_qty_by_node
+    finally:
+        response.release_conn()


 def check_instance_fits(context: Optional[str],
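
The rewritten get_kubernetes_nodes and the new get_allocated_gpu_qty_by_node both skip the kubernetes client's heavyweight deserialization: they request the raw HTTP response (_preload_content=False) and stream each element of the "items" array through ijson into the small dataclasses defined above. A self-contained sketch of that streaming pattern over an in-memory JSON buffer (requires the third-party ijson package; NodeView is an illustrative stand-in for V1Node, not the real class):

    import dataclasses
    import io
    from typing import Dict

    import ijson

    @dataclasses.dataclass
    class NodeView:
        name: str
        allocatable: Dict[str, str]

    payload = io.BytesIO(b'{"items": ['
                         b'{"metadata": {"name": "node-1"},'
                         b' "status": {"allocatable": {"cpu": "8"}}},'
                         b'{"metadata": {"name": "node-2"},'
                         b' "status": {"allocatable": {"cpu": "16"}}}]}')

    # ijson.items yields each "items" element as soon as it is fully
    # parsed, so the whole response never needs to be held as one dict.
    nodes = [
        NodeView(name=item['metadata']['name'],
                 allocatable=item['status']['allocatable'])
        for item in ijson.items(payload, 'items.item', buf_size=64 * 1024)
    ]
    print([n.name for n in nodes])  # -> ['node-1', 'node-2']
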
@@ -1266,11 +1476,12 @@ def check_instance_fits(context: Optional[str],
             return False, str(e)
     # Get the set of nodes that have the GPU type
     gpu_nodes = [
-        node for node in nodes
+        node for node in nodes
+        if node.is_ready() and gpu_label_key in node.metadata.labels and
         node.metadata.labels[gpu_label_key] in gpu_label_values
     ]
     if not gpu_nodes:
-        return False, f'No GPU nodes found with {acc_type} on the cluster'
+        return False, f'No ready GPU nodes found with {acc_type} on the cluster'
     if is_tpu_on_gke(acc_type):
         # If requested accelerator is a TPU type, check if the cluster
         # has sufficient TPU resource to meet the requirement.
@@ -1294,7 +1505,9 @@ def check_instance_fits(context: Optional[str],
                 f'enough CPU (> {k8s_instance_type.cpus} CPUs) and/or '
                 f'memory (> {k8s_instance_type.memory} G). ')
     else:
-        candidate_nodes = nodes
+        candidate_nodes = [node for node in nodes if node.is_ready()]
+        if not candidate_nodes:
+            return False, 'No ready nodes found in the cluster.'
         not_fit_reason_prefix = (f'No nodes found with enough '
                                  f'CPU (> {k8s_instance_type.cpus} CPUs) '
                                  'and/or memory '
@@ -1448,9 +1661,13 @@ def get_accelerator_label_key_values(
             if is_multi_host_tpu(node_metadata_labels):
                 continue
             for label, value in label_list:
-                if
-
-
+                if label_formatter.match_label_key(label):
+                    # match either canonicalized name or raw name
+                    accelerator = (label_formatter.
+                                   get_accelerator_from_label_value(value))
+                    viable = [value.lower(), accelerator.lower()]
+                    if acc_type.lower() not in viable:
+                        continue
                     if is_tpu_on_gke(acc_type):
                         assert isinstance(label_formatter,
                                           GKELabelFormatter)
@@ -1550,23 +1767,6 @@ def get_port(svc_name: str, namespace: str, context: Optional[str]) -> int:
     return head_service.spec.ports[0].node_port


-def get_external_ip(network_mode: Optional[
-    kubernetes_enums.KubernetesNetworkingMode], context: Optional[str]) -> str:
-    if network_mode == kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD:
-        return '127.0.0.1'
-    # Return the IP address of the first node with an external IP
-    nodes = kubernetes.core_api(context).list_node().items
-    for node in nodes:
-        if node.status.addresses:
-            for address in node.status.addresses:
-                if address.type == 'ExternalIP':
-                    return address.address
-    # If no external IP is found, use the API server IP
-    api_host = kubernetes.core_api(context).api_client.configuration.host
-    parsed_url = urlparse(api_host)
-    return parsed_url.hostname
-
-
 def check_credentials(context: Optional[str],
                       timeout: int = kubernetes.API_TIMEOUT,
                       run_optional_checks: bool = False) -> \
@@ -1585,7 +1785,10 @@ def check_credentials(context: Optional[str],
     try:
         namespace = get_kube_config_context_namespace(context)
         kubernetes.core_api(context).list_namespaced_pod(
-            namespace, _request_timeout=timeout)
+            namespace, limit=1, _request_timeout=timeout)
+        # This call is "free" because this function is a cached call,
+        # and it will not be called again in this function.
+        get_kubernetes_nodes(context=context)
     except ImportError:
         # TODO(romilb): Update these error strs to also include link to docs
         # when docs are ready.
@@ -1710,11 +1913,17 @@ class PodValidator:

         if isinstance(klass, str):
             if klass.startswith('list['):
-
+                match = re.match(r'list\[(.*)\]', klass)
+                if match is None:
+                    raise ValueError(f'Invalid list type format: {klass}')
+                sub_kls = match.group(1)
                 return [cls.__validate(sub_data, sub_kls) for sub_data in data]

             if klass.startswith('dict('):
-
+                match = re.match(r'dict\(([^,]*), (.*)\)', klass)
+                if match is None:
+                    raise ValueError(f'Invalid dict type format: {klass}')
+                sub_kls = match.group(2)
                 return {k: cls.__validate(v, sub_kls) for k, v in data.items()}

         # convert str to class
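
PodValidator.__validate now parses the kubernetes client's type strings ('list[...]' and 'dict(..., ...)') with explicit regexes and raises on malformed input. The extraction step on its own (element_type is a hypothetical helper name, not from the source):

    import re

    def element_type(klass: str) -> str:
        m = re.match(r'list\[(.*)\]', klass)
        if m is not None:
            return m.group(1)  # element type of the list
        m = re.match(r'dict\(([^,]*), (.*)\)', klass)
        if m is not None:
            return m.group(2)  # value type of the dict
        raise ValueError(f'Invalid type format: {klass}')

    print(element_type('list[V1Container]'))    # -> 'V1Container'
    print(element_type('dict(str, V1EnvVar)'))  # -> 'V1EnvVar'
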
@@ -2073,6 +2282,15 @@ def get_kube_config_context_namespace(
         return DEFAULT_NAMESPACE


+def parse_cpu_or_gpu_resource_to_float(resource_str: str) -> float:
+    if not resource_str:
+        return 0.0
+    if resource_str[-1] == 'm':
+        return float(resource_str[:-1]) / 1000
+    else:
+        return float(resource_str)
+
+
 def parse_cpu_or_gpu_resource(resource_qty_str: str) -> Union[int, float]:
     resource_str = str(resource_qty_str)
     if resource_str[-1] == 'm':
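
The new parse_cpu_or_gpu_resource_to_float helper follows Kubernetes quantity notation: a trailing 'm' means milli-units, so '500m' is 500 millicores, i.e. 0.5 CPUs, and an empty string is treated as zero. Exercising it directly (assuming the new package version, which defines it in sky.provision.kubernetes.utils):

    from sky.provision.kubernetes.utils import (
        parse_cpu_or_gpu_resource_to_float)

    assert parse_cpu_or_gpu_resource_to_float('500m') == 0.5  # millicores
    assert parse_cpu_or_gpu_resource_to_float('2') == 2.0
    assert parse_cpu_or_gpu_resource_to_float('') == 0.0
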
@@ -2150,16 +2368,9 @@ class KubernetesInstanceType:
     @staticmethod
     def is_valid_instance_type(name: str) -> bool:
         """Returns whether the given name is a valid instance type."""
-        # Before https://github.com/skypilot-org/skypilot/pull/4756,
-        # the accelerators are appended with format "--{a}{type}",
-        # e.g. "4CPU--16GB--1V100".
-        # Check both patterns to keep backward compatibility.
-        # TODO(romilb): Backward compatibility, remove after 0.11.0.
-        prev_pattern = re.compile(
-            r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--\d+\S+)?$')
         pattern = re.compile(
             r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--[\w\d-]+:\d+)?$')
-        return bool(pattern.match(name)) or bool(prev_pattern.match(name))
+        return bool(pattern.match(name))

     @classmethod
     def _parse_instance_type(
@@ -2176,11 +2387,6 @@ class KubernetesInstanceType:
             r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_type>[\w\d-]+):(?P<accelerator_count>\d+))?$'  # pylint: disable=line-too-long
         )
         match = pattern.match(name)
-        # TODO(romilb): Backward compatibility, remove after 0.11.0.
-        prev_pattern = re.compile(
-            r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_count>\d+)(?P<accelerator_type>\S+))?$'  # pylint: disable=line-too-long
-        )
-        prev_match = prev_pattern.match(name)
         if match:
             cpus = float(match.group('cpus'))
             memory = float(match.group('memory'))
@@ -2193,19 +2399,6 @@ class KubernetesInstanceType:
             accelerator_count = None
             accelerator_type = None
             return cpus, memory, accelerator_count, accelerator_type
-        # TODO(romilb): Backward compatibility, remove after 0.11.0.
-        elif prev_match:
-            cpus = float(prev_match.group('cpus'))
-            memory = float(prev_match.group('memory'))
-            accelerator_count = prev_match.group('accelerator_count')
-            accelerator_type = prev_match.group('accelerator_type')
-            if accelerator_count:
-                accelerator_count = int(accelerator_count)
-                accelerator_type = str(accelerator_type)
-            else:
-                accelerator_count = None
-                accelerator_type = None
-            return cpus, memory, accelerator_count, accelerator_type
         else:
             raise ValueError(f'Invalid instance name: {name}')
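
With the pre-#4756 fallback removed, only the '--<accelerator>:<count>' suffix form remains a valid instance-type name. What the surviving regex accepts (pattern copied from the hunk above):

    import re

    pattern = re.compile(r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--[\w\d-]+:\d+)?$')

    print(bool(pattern.match('4CPU--16GB')))          # True
    print(bool(pattern.match('4CPU--16GB--V100:1')))  # True (current form)
    print(bool(pattern.match('4CPU--16GB--1V100')))   # False (old form, dropped)
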
@@ -2278,16 +2471,14 @@ def construct_ssh_jump_command(


 def get_ssh_proxy_command(
-
-    network_mode: kubernetes_enums.KubernetesNetworkingMode,
+    pod_name: str,
     private_key_path: str,
     context: Optional[str],
     namespace: str,
 ) -> str:
     """Generates the SSH proxy command to connect to the pod.

-    Uses a
-    if the network mode is PORTFORWARD.
+    Uses a direct port-forwarding.

     By default, establishing an SSH connection creates a communication
     channel to a remote node by setting up a TCP connection. When a
@@ -2298,17 +2489,8 @@ def get_ssh_proxy_command(
     Pods within a Kubernetes cluster have internal IP addresses that are
     typically not accessible from outside the cluster. Since the default TCP
     connection of SSH won't allow access to these pods, we employ a
-    ProxyCommand to establish the required communication channel.
-    in two different networking options: NodePort/port-forward.
-
-    With the NodePort networking mode, a NodePort service is launched. This
-    service opens an external port on the node which redirects to the desired
-    port to a SSH jump pod. When establishing an SSH session in this mode, the
-    ProxyCommand makes use of this external port to create a communication
-    channel directly to port 22, which is the default port ssh server listens
-    on, of the jump pod.
+    ProxyCommand to establish the required communication channel.

-    With Port-forward mode, instead of directly exposing an external port,
     'kubectl port-forward' sets up a tunnel between a local port
     (127.0.0.1:23100) and port 22 of the provisioned pod. Then we establish TCP
     connection to the local end of this tunnel, 127.0.0.1:23100, using 'socat'.
@@ -2319,38 +2501,26 @@ def get_ssh_proxy_command(
     the local machine.

     Args:
-
-            target for SSH.
-            service. If network_mode is PORTFORWARD, this is the pod name.
-        network_mode: KubernetesNetworkingMode; networking mode for ssh
-            session. It is either 'NODEPORT' or 'PORTFORWARD'
+        pod_name: str; The Kubernetes pod name that will be used as the
+            target for SSH.
         private_key_path: str; Path to the private key to use for SSH.
             This key must be authorized to access the SSH jump pod.
-            Required for NODEPORT networking mode.
         namespace: Kubernetes namespace to use.
-            Required for NODEPORT networking mode.
     """
-
-    ssh_jump_ip = get_external_ip(network_mode, context)
+    ssh_jump_ip = '127.0.0.1'  # Local end of the port-forward tunnel
     assert private_key_path is not None, 'Private key path must be provided'
-
-
-
-
-
-
-
-
-
-
-
-
-        proxy_cmd_target_pod=k8s_ssh_target,
-        # We embed both the current context and namespace to the SSH proxy
-        # command to make sure SSH still works when the current
-        # context/namespace is changed by the user.
-        current_kube_context=context,
-        current_kube_namespace=namespace)
+    ssh_jump_proxy_command_path = create_proxy_command_script()
+    ssh_jump_proxy_command = construct_ssh_jump_command(
+        private_key_path,
+        ssh_jump_ip,
+        ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
+        proxy_cmd_path=ssh_jump_proxy_command_path,
+        proxy_cmd_target_pod=pod_name,
+        # We embed both the current context and namespace to the SSH proxy
+        # command to make sure SSH still works when the current
+        # context/namespace is changed by the user.
+        current_kube_context=context,
+        current_kube_namespace=namespace)
     return ssh_jump_proxy_command

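
With the NodePort/jump-pod path gone, get_ssh_proxy_command always targets 127.0.0.1 through the generated 'kubectl port-forward' proxy script. The resulting SSH invocation has roughly this shape (a sketch only; the real command is assembled by construct_ssh_jump_command and the generated proxy script, whose exact flags are not shown in this diff):

    import shlex

    # Hypothetical illustration: paths, flags, and the placeholder user
    # are assumptions, not the literal strings SkyPilot produces.
    def build_ssh_command(private_key_path: str, proxy_script: str,
                          pod_name: str, kube_context: str,
                          kube_namespace: str) -> str:
        proxy = (f'{proxy_script} {pod_name} '
                 f'--context {kube_context} --namespace {kube_namespace}')
        return ('ssh -i ' + shlex.quote(private_key_path) +
                ' -o ProxyCommand=' + shlex.quote(proxy) +
                ' sky@127.0.0.1')  # local end of the port-forward tunnel

    print(build_ssh_command('~/.ssh/sky-key', '/tmp/port_forward_proxy.sh',
                            'sky-head', 'my-context', 'default'))
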
@@ -2382,240 +2552,6 @@ def create_proxy_command_script() -> str:
     return PORT_FORWARD_PROXY_CMD_PATH


-def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
-                       context: Optional[str],
-                       service_type: kubernetes_enums.KubernetesServiceType):
-    """Sets up Kubernetes service resource to access for SSH jump pod.
-
-    This method acts as a necessary complement to be run along with
-    setup_ssh_jump_pod(...) method. This service ensures the pod is accessible.
-
-    Args:
-        ssh_jump_name: Name to use for the SSH jump service
-        namespace: Namespace to create the SSH jump service in
-        service_type: Networking configuration on either to use NodePort
-            or ClusterIP service to ssh in
-    """
-    # Fill in template - ssh_key_secret and ssh_jump_image are not required for
-    # the service spec, so we pass in empty strs.
-    content = fill_ssh_jump_template('', '', ssh_jump_name, service_type.value)
-
-    # Add custom metadata from config
-    merge_custom_metadata(content['service_spec']['metadata'], context)
-
-    # Create service
-    try:
-        kubernetes.core_api(context).create_namespaced_service(
-            namespace, content['service_spec'])
-    except kubernetes.api_exception() as e:
-        # SSH Jump Pod service already exists.
-        if e.status == 409:
-            ssh_jump_service = kubernetes.core_api(
-                context).read_namespaced_service(name=ssh_jump_name,
-                                                 namespace=namespace)
-            curr_svc_type = ssh_jump_service.spec.type
-            if service_type.value == curr_svc_type:
-                # If the currently existing SSH Jump service's type is identical
-                # to user's configuration for networking mode
-                logger.debug(
-                    f'SSH Jump Service {ssh_jump_name} already exists in the '
-                    'cluster, using it.')
-            else:
-                # If a different type of service type for SSH Jump pod compared
-                # to user's configuration for networking mode exists, we remove
-                # existing servie to create a new one following user's config
-                kubernetes.core_api(context).delete_namespaced_service(
-                    name=ssh_jump_name, namespace=namespace)
-                kubernetes.core_api(context).create_namespaced_service(
-                    namespace, content['service_spec'])
-                port_forward_mode = (
-                    kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value)
-                nodeport_mode = (
-                    kubernetes_enums.KubernetesNetworkingMode.NODEPORT.value)
-                clusterip_svc = (
-                    kubernetes_enums.KubernetesServiceType.CLUSTERIP.value)
-                nodeport_svc = (
-                    kubernetes_enums.KubernetesServiceType.NODEPORT.value)
-                curr_network_mode = port_forward_mode \
-                    if curr_svc_type == clusterip_svc else nodeport_mode
-                new_network_mode = nodeport_mode \
-                    if curr_svc_type == clusterip_svc else port_forward_mode
-                new_svc_type = nodeport_svc \
-                    if curr_svc_type == clusterip_svc else clusterip_svc
-                logger.info(
-                    f'Switching the networking mode from '
-                    f'\'{curr_network_mode}\' to \'{new_network_mode}\' '
-                    f'following networking configuration. Deleting existing '
-                    f'\'{curr_svc_type}\' service and recreating as '
-                    f'\'{new_svc_type}\' service.')
-        else:
-            raise
-    else:
-        logger.info(f'Created SSH Jump Service {ssh_jump_name}.')
-
-
-def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
-                       ssh_key_secret: str, namespace: str,
-                       context: Optional[str]):
-    """Sets up Kubernetes RBAC and pod for SSH jump host.
-
-    Our Kubernetes implementation uses a SSH jump pod to reach SkyPilot clusters
-    running inside a cluster. This function sets up the resources needed for
-    the SSH jump pod. This includes a service account which grants the jump pod
-    permission to watch for other SkyPilot pods and terminate itself if there
-    are no SkyPilot pods running.
-
-    setup_ssh_jump_service must also be run to ensure that the SSH jump pod is
-    reachable.
-
-    Args:
-        ssh_jump_image: Container image to use for the SSH jump pod
-        ssh_jump_name: Name to use for the SSH jump pod
-        ssh_key_secret: Secret name for the SSH key stored in the cluster
-        namespace: Namespace to create the SSH jump pod in
-    """
-    # Fill in template - service is created separately so service_type is not
-    # required, so we pass in empty str.
-    content = fill_ssh_jump_template(ssh_key_secret, ssh_jump_image,
-                                     ssh_jump_name, '')
-
-    # Add custom metadata to all objects
-    for object_type in content.keys():
-        merge_custom_metadata(content[object_type]['metadata'], context)
-
-    # ServiceAccount
-    try:
-        kubernetes.core_api(context).create_namespaced_service_account(
-            namespace, content['service_account'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                'SSH Jump ServiceAccount already exists in the cluster, using '
-                'it.')
-        else:
-            raise
-    else:
-        logger.info('Created SSH Jump ServiceAccount.')
-    # Role
-    try:
-        kubernetes.auth_api(context).create_namespaced_role(
-            namespace, content['role'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                'SSH Jump Role already exists in the cluster, using it.')
-        else:
-            raise
-    else:
-        logger.info('Created SSH Jump Role.')
-    # RoleBinding
-    try:
-        kubernetes.auth_api(context).create_namespaced_role_binding(
-            namespace, content['role_binding'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                'SSH Jump RoleBinding already exists in the cluster, using '
-                'it.')
-        else:
-            raise
-    else:
-        logger.info('Created SSH Jump RoleBinding.')
-    # Pod
-    try:
-        kubernetes.core_api(context).create_namespaced_pod(
-            namespace, content['pod_spec'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                f'SSH Jump Host {ssh_jump_name} already exists in the cluster, '
-                'using it.')
-        else:
-            raise
-    else:
-        logger.info(f'Created SSH Jump Host {ssh_jump_name}.')
-
-
-def clean_zombie_ssh_jump_pod(namespace: str, context: Optional[str],
-                              node_id: str):
-    """Analyzes SSH jump pod and removes if it is in a bad state
-
-    Prevents the existence of a dangling SSH jump pod. This could happen
-    in case the pod main container did not start properly (or failed). In that
-    case, jump pod lifecycle manager will not function properly to
-    remove the pod and service automatically, and must be done manually.
-
-    Args:
-        namespace: Namespace to remove the SSH jump pod and service from
-        node_id: Name of head pod
-    """
-
-    def find(l, predicate):
-        """Utility function to find element in given list"""
-        results = [x for x in l if predicate(x)]
-        return results[0] if results else None
-
-    # Get the SSH jump pod name from the head pod
-    try:
-        pod = kubernetes.core_api(context).read_namespaced_pod(
-            node_id, namespace)
-    except kubernetes.api_exception() as e:
-        if e.status == 404:
-            logger.warning(f'Failed to get pod {node_id},'
-                           ' but the pod was not found (404).')
-        raise
-    else:
-        ssh_jump_name = pod.metadata.labels.get('skypilot-ssh-jump')
-    try:
-        ssh_jump_pod = kubernetes.core_api(context).read_namespaced_pod(
-            ssh_jump_name, namespace)
-        cont_ready_cond = find(ssh_jump_pod.status.conditions,
-                               lambda c: c.type == 'ContainersReady')
-        if (cont_ready_cond and cont_ready_cond.status
-                == 'False') or ssh_jump_pod.status.phase == 'Pending':
-            # Either the main container is not ready or the pod failed
-            # to schedule. To be on the safe side and prevent a dangling
-            # ssh jump pod, lets remove it and the service. Otherwise, main
-            # container is ready and its lifecycle management script takes
-            # care of the cleaning.
-            kubernetes.core_api(context).delete_namespaced_pod(
-                ssh_jump_name, namespace)
-            kubernetes.core_api(context).delete_namespaced_service(
-                ssh_jump_name, namespace)
-    except kubernetes.api_exception() as e:
-        # We keep the warning in debug to avoid polluting the `sky launch`
-        # output.
-        logger.debug(f'Tried to check ssh jump pod {ssh_jump_name},'
-                     f' but got error {e}\n. Consider running `kubectl '
-                     f'delete pod {ssh_jump_name} -n {namespace}` to manually '
-                     'remove the pod if it has crashed.')
-        # We encountered an issue while checking ssh jump pod. To be on
-        # the safe side, lets remove its service so the port is freed
-        try:
-            kubernetes.core_api(context).delete_namespaced_service(
-                ssh_jump_name, namespace)
-        except kubernetes.api_exception():
-            pass
-
-
-def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
-                           ssh_jump_name: str, service_type: str) -> Dict:
-    template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
-                                 'kubernetes-ssh-jump.yml.j2')
-    if not os.path.exists(template_path):
-        raise FileNotFoundError(
-            'Template "kubernetes-ssh-jump.j2" does not exist.')
-    with open(template_path, 'r', encoding='utf-8') as fin:
-        template = fin.read()
-    j2_template = jinja2.Template(template)
-    cont = j2_template.render(name=ssh_jump_name,
-                              image=ssh_jump_image,
-                              secret=ssh_key_secret,
|
|
2614
|
-
service_type=service_type)
|
|
2615
|
-
content = yaml_utils.safe_load(cont)
|
|
2616
|
-
return content
|
|
2617
|
-
|
|
2618
|
-
|
|
2619
2555
|
def check_port_forward_mode_dependencies(
|
|
2620
2556
|
raise_error: bool = True) -> Optional[List[str]]:
|
|
2621
2557
|
"""Checks if 'socat' and 'nc' are installed
|
|
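The removed setup code relied on a create-and-tolerate-conflict pattern to stay idempotent: each Kubernetes object is created unconditionally, and an HTTP 409 from the API server is treated as "already exists, reuse it". A minimal standalone sketch of that pattern, using the official `kubernetes` Python client directly rather than SkyPilot's adaptor wrappers (`ensure_service_account` and its arguments are illustrative, not part of the package):

```python
from kubernetes import client, config


def ensure_service_account(namespace: str, name: str) -> None:
    """Create a ServiceAccount, treating 'already exists' (409) as success."""
    config.load_kube_config()  # Or config.load_incluster_config() inside a pod.
    body = client.V1ServiceAccount(
        metadata=client.V1ObjectMeta(name=name, namespace=namespace))
    try:
        client.CoreV1Api().create_namespaced_service_account(namespace, body)
        print(f'Created ServiceAccount {name}.')
    except client.ApiException as e:
        if e.status == 409:  # Conflict: the object already exists; reuse it.
            print(f'ServiceAccount {name} already exists, using it.')
        else:
            raise
```

This avoids a read-then-create race: two concurrent launches can both attempt the create, and exactly one 409 is swallowed.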
```diff
@@ -2762,26 +2698,22 @@ def combine_pod_config_fields(
     merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
     # We don't use override_configs in `get_effective_region_config`, as merging
     # the pod config requires special handling.
-    if isinstance(cloud, clouds.SSH):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            cloud='kubernetes',
-            region=context,
-            keys=('pod_config',),
-            default_value={})
+    cloud_str = 'ssh' if isinstance(cloud, clouds.SSH) else 'kubernetes'
+    context_str = context
+    if isinstance(cloud, clouds.SSH) and context is not None:
+        assert context.startswith('ssh-'), 'SSH context must start with "ssh-"'
+        context_str = context[len('ssh-'):]
+    kubernetes_config = skypilot_config.get_effective_region_config(
+        cloud=cloud_str,
+        region=context_str,
+        keys=('pod_config',),
+        default_value={})
+    override_pod_config = config_utils.get_cloud_config_value_from_dict(
+        dict_config=cluster_config_overrides,
+        cloud=cloud_str,
+        region=context_str,
+        keys=('pod_config',),
+        default_value={})
     config_utils.merge_k8s_configs(kubernetes_config, override_pod_config)
 
     # Merge the kubernetes config into the YAML for both head and worker nodes.
```
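Both the old and new code funnel the looked-up `pod_config` and the per-task override into `config_utils.merge_k8s_configs`, whose body is not part of this hunk. A simplified sketch of the add-or-update semantics such a merge implements, under the assumption that nested dicts merge key by key while scalars are replaced (the real helper also has special handling, e.g. concatenating container and volume lists):

```python
from typing import Any, Dict


def merge_dicts(base: Dict[str, Any], override: Dict[str, Any]) -> None:
    """Recursively merge `override` into `base`, in place.

    Nested dicts are merged key by key; scalar values and lists in
    `override` replace the corresponding values in `base`.
    """
    for key, value in override.items():
        if (key in base and isinstance(base[key], dict) and
                isinstance(value, dict)):
            merge_dicts(base[key], value)
        else:
            base[key] = value


pod_config = {'spec': {'runtimeClassName': 'nvidia'}}
override = {'spec': {'priorityClassName': 'high'}}
merge_dicts(pod_config, override)
# pod_config['spec'] now carries both runtimeClassName and priorityClassName.
```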
```diff
@@ -2800,9 +2732,11 @@ def combine_metadata_fields(cluster_yaml_obj: Dict[str, Any],
     Obeys the same add or update semantics as combine_pod_config_fields().
     """
     merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
+    context, cloud_str = get_cleaned_context_and_cloud_str(context)
+
     # Get custom_metadata from global config
     custom_metadata = skypilot_config.get_effective_region_config(
-        cloud='kubernetes',
+        cloud=cloud_str,
         region=context,
         keys=('custom_metadata',),
         default_value={})
@@ -2810,7 +2744,7 @@ def combine_metadata_fields(cluster_yaml_obj: Dict[str, Any],
     # Get custom_metadata from task-level config overrides
     override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
         dict_config=cluster_config_overrides,
-        cloud='kubernetes',
+        cloud=cloud_str,
         region=context,
         keys=('custom_metadata',),
         default_value={})
@@ -2867,9 +2801,11 @@ def merge_custom_metadata(
 
     Merge is done in-place, so return is not required
     """
+    context, cloud_str = get_cleaned_context_and_cloud_str(context)
+
     # Get custom_metadata from global config
     custom_metadata = skypilot_config.get_effective_region_config(
-        cloud='kubernetes',
+        cloud=cloud_str,
         region=context,
         keys=('custom_metadata',),
         default_value={})
@@ -2878,7 +2814,7 @@ def merge_custom_metadata(
     if cluster_config_overrides is not None:
         override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
             dict_config=cluster_config_overrides,
-            cloud='kubernetes',
+            cloud=cloud_str,
             region=context,
             keys=('custom_metadata',),
             default_value={})
```
```diff
@@ -2889,7 +2825,8 @@ def merge_custom_metadata(
     config_utils.merge_k8s_configs(original_metadata, custom_metadata)
 
 
-def check_nvidia_runtime_class(context: Optional[str] = None) -> bool:
+@_retry_on_error(resource_type='runtimeclass')
+def check_nvidia_runtime_class(*, context: Optional[str] = None) -> bool:
     """Checks if the 'nvidia' RuntimeClass exists in the cluster"""
     # Fetch the list of available RuntimeClasses
     runtime_classes = kubernetes.node_api(context).list_runtime_class()
```
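This hunk wraps the RuntimeClass check in a `@_retry_on_error` decorator and makes `context` keyword-only; the lookup itself is unchanged. Stripped of SkyPilot's adaptor layer, the same check against a live cluster could be sketched as follows, assuming the official `kubernetes` client (`has_nvidia_runtime_class` is an illustrative name):

```python
from kubernetes import client, config


def has_nvidia_runtime_class() -> bool:
    """Return True if a RuntimeClass named 'nvidia' exists in the cluster."""
    config.load_kube_config()
    # RuntimeClasses live in the node.k8s.io/v1 API group.
    runtime_classes = client.NodeV1Api().list_runtime_class()
    return any(rc.metadata.name == 'nvidia' for rc in runtime_classes.items)
```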
```diff
@@ -3108,14 +3045,6 @@ def get_kubernetes_node_info(
         information.
     """
     nodes = get_kubernetes_nodes(context=context)
-    # Get the pods to get the real-time resource usage
-    try:
-        pods = get_all_pods_in_kubernetes_cluster(context=context)
-    except kubernetes.api_exception() as e:
-        if e.status == 403:
-            pods = None
-        else:
-            raise
 
     lf, _ = detect_gpu_label_formatter(context)
     if not lf:
```
```diff
@@ -3123,6 +3052,29 @@ def get_kubernetes_node_info(
     else:
         label_keys = lf.get_label_keys()
 
+    # Check if all nodes have no accelerators to avoid fetching pods
+    has_accelerator_nodes = False
+    for node in nodes:
+        accelerator_count = get_node_accelerator_count(context,
+                                                       node.status.allocatable)
+        if accelerator_count > 0:
+            has_accelerator_nodes = True
+            break
+
+    # Get the allocated GPU quantity by each node
+    allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+    error_on_get_allocated_gpu_qty_by_node = False
+    if has_accelerator_nodes:
+        try:
+            allocated_qty_by_node = get_allocated_gpu_qty_by_node(
+                context=context)
+        except kubernetes.api_exception() as e:
+            if e.status == 403:
+                error_on_get_allocated_gpu_qty_by_node = True
+                pass
+            else:
+                raise
+
     node_info_dict: Dict[str, models.KubernetesNodeInfo] = {}
     has_multi_host_tpu = False
 
```
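The per-node pod scan deleted below is replaced by a single call to `get_allocated_gpu_qty_by_node`, whose body is not shown in this diff; judging from the deleted loop, it aggregates the GPU requests of Running/Pending pods per node. A hypothetical standalone equivalent, assuming the official `kubernetes` client (the resource key is an assumption that varies by device plugin, and the real helper also excludes low-priority pods via `should_exclude_pod_from_gpu_allocation`):

```python
import collections
from typing import Dict

from kubernetes import client, config

GPU_RESOURCE_KEY = 'nvidia.com/gpu'  # Assumption; depends on the device plugin.


def allocated_gpus_by_node() -> Dict[str, int]:
    """Sum GPU requests of Running/Pending pods, grouped by node name."""
    config.load_kube_config()
    allocated: Dict[str, int] = collections.defaultdict(int)
    pods = client.CoreV1Api().list_pod_for_all_namespaces().items
    for pod in pods:
        if pod.spec.node_name is None:
            continue  # Unscheduled pods hold no node's GPUs.
        if pod.status.phase not in ('Running', 'Pending'):
            continue
        for container in pod.spec.containers:
            requests = container.resources.requests or {}
            allocated[pod.spec.node_name] += int(
                requests.get(GPU_RESOURCE_KEY, 0))
    return allocated
```

Fetching pods once and bucketing by node turns an O(nodes × pods) scan into a single O(pods) pass, and skipping the fetch entirely on GPU-less clusters avoids the list-pods RBAC requirement there.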
```diff
@@ -3152,32 +3104,28 @@ def get_kubernetes_node_info(
                 node_ip = address.address
                 break
 
-        allocated_qty = 0
         accelerator_count = get_node_accelerator_count(context,
                                                        node.status.allocatable)
+        # Check if node is ready
+        node_is_ready = node.is_ready()
+
+        if accelerator_count == 0:
+            node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
+                name=node.metadata.name,
+                accelerator_type=accelerator_name,
+                total={'accelerator_count': 0},
+                free={'accelerators_available': 0},
+                ip_address=node_ip,
+                is_ready=node_is_ready)
+            continue
 
-        if pods is None:
+        if not node_is_ready:
+            # If node is not ready, report 0 available GPUs
+            accelerators_available = 0
+        elif not has_accelerator_nodes or error_on_get_allocated_gpu_qty_by_node:
             accelerators_available = -1
-
         else:
-            for pod in pods:
-                # Get all the pods running on the node
-                if (pod.spec.node_name == node.metadata.name and
-                        pod.status.phase in ['Running', 'Pending']):
-                    # Skip pods that should not count against GPU count
-                    if should_exclude_pod_from_gpu_allocation(pod):
-                        logger.debug(
-                            f'Excluding low priority pod '
-                            f'{pod.metadata.name} from GPU allocation '
-                            f'calculations on node {node.metadata.name}')
-                        continue
-                    # Iterate over all the containers in the pod and sum the
-                    # GPU requests
-                    for container in pod.spec.containers:
-                        if container.resources.requests:
-                            allocated_qty += get_node_accelerator_count(
-                                context, container.resources.requests)
-
+            allocated_qty = allocated_qty_by_node[node.metadata.name]
             accelerators_available = accelerator_count - allocated_qty
 
         # Exclude multi-host TPUs from being processed.
```
```diff
@@ -3192,7 +3140,8 @@ def get_kubernetes_node_info(
             accelerator_type=accelerator_name,
             total={'accelerator_count': int(accelerator_count)},
             free={'accelerators_available': int(accelerators_available)},
-            ip_address=node_ip)
+            ip_address=node_ip,
+            is_ready=node_is_ready)
     hint = ''
     if has_multi_host_tpu:
         hint = ('(Note: Multi-host TPUs are detected and excluded from the '
```
```diff
@@ -3224,7 +3173,11 @@ def filter_pods(namespace: str,
                 context: Optional[str],
                 tag_filters: Dict[str, str],
                 status_filters: Optional[List[str]] = None) -> Dict[str, Any]:
-    """Filters pods by tags and status."""
+    """Filters pods by tags and status.
+
+    Returned dict is sorted by name, with workers sorted by their numeric suffix.
+    This ensures consistent ordering for SSH configuration and other operations.
+    """
     non_included_pod_statuses = POD_STATUSES.copy()
 
     field_selector = ''
```
```diff
@@ -3242,7 +3195,32 @@ def filter_pods(namespace: str,
     pods = [
         pod for pod in pod_list.items if pod.metadata.deletion_timestamp is None
     ]
-    return {pod.metadata.name: pod for pod in pods}
+
+    # Sort pods by name, with workers sorted by their numeric suffix.
+    # This ensures consistent ordering (e.g., cluster-head, cluster-worker1,
+    # cluster-worker2, cluster-worker3, ...) even when Kubernetes API
+    # returns them in arbitrary order. This works even if there were
+    # somehow pod names other than head/worker ones, and those end up at
+    # the end of the list.
+    def get_pod_sort_key(
+        pod: V1Pod
+    ) -> Union[Tuple[Literal[0], str], Tuple[Literal[1], int], Tuple[Literal[2],
+                                                                     str]]:
+        name = pod.metadata.name
+        name_suffix = name.split('-')[-1]
+        if name_suffix == 'head':
+            return (0, name)
+        elif name_suffix.startswith('worker'):
+            try:
+                return (1, int(name_suffix.split('worker')[-1]))
+            except (ValueError, IndexError):
+                return (2, name)
+        else:
+            return (2, name)
+
+    sorted_pods = sorted(pods, key=get_pod_sort_key)
+
+    return {pod.metadata.name: pod for pod in sorted_pods}
 
 
 def _remove_pod_annotation(pod: Any,
```
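The new sort key orders the head pod first, then workers by numeric suffix (so `worker2` sorts before `worker10`, which a plain lexicographic sort would get wrong), with any other names falling to the end. A quick illustration over plain strings; `pod_name_sort_key` is an illustrative re-implementation over names, not the package's function:

```python
from typing import Tuple, Union


def pod_name_sort_key(name: str) -> Union[Tuple[int, str], Tuple[int, int]]:
    # Mirrors get_pod_sort_key above, but takes a plain pod name.
    suffix = name.split('-')[-1]
    if suffix == 'head':
        return (0, name)
    if suffix.startswith('worker'):
        try:
            return (1, int(suffix[len('worker'):]))  # Numeric worker ordering.
        except ValueError:
            return (2, name)  # 'worker' with no number falls to the end.
    return (2, name)


names = ['mycluster-worker10', 'mycluster-worker2', 'mycluster-head',
         'mycluster-extra']
print(sorted(names, key=pod_name_sort_key))
# ['mycluster-head', 'mycluster-worker2', 'mycluster-worker10',
#  'mycluster-extra']
```

Because Python dicts preserve insertion order, building the returned dict from the sorted list is enough to make downstream consumers (such as SSH config generation) see a stable ordering.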
```diff
@@ -3371,13 +3349,13 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
 
     try:
         pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
-            label_selector='skypilot-cluster',
+            label_selector=provision_constants.TAG_SKYPILOT_CLUSTER_NAME,
             _request_timeout=kubernetes.API_TIMEOUT).items
     except kubernetes.max_retry_error():
         raise exceptions.ResourcesUnavailableError(
             'Timed out trying to get SkyPilot pods from Kubernetes cluster. '
             'Please check if the cluster is healthy and retry. To debug, run: '
-            'kubectl get pods --selector=skypilot-cluster --all-namespaces'
+            'kubectl get pods --selector=skypilot-cluster-name --all-namespaces'
         ) from None
     return pods
```
```diff
@@ -3514,7 +3492,8 @@ def process_skypilot_pods(
     serve_controllers: List[KubernetesSkyPilotClusterInfo] = []
 
     for pod in pods:
-        cluster_name_on_cloud = pod.metadata.labels.get('skypilot-cluster')
+        cluster_name_on_cloud = pod.metadata.labels.get(
+            provision_constants.TAG_SKYPILOT_CLUSTER_NAME)
         cluster_name = cluster_name_on_cloud.rsplit(
             '-', 1
         )[0]  # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
```
```diff
@@ -3541,9 +3520,20 @@ def process_skypilot_pods(
                 f'requesting GPUs: {pod.metadata.name}')
             gpu_label = label_formatter.get_label_key()
             # Get GPU name from pod node selector
-
-
-
+            node_selector_terms = (
+                pod.spec.affinity.node_affinity.
+                required_during_scheduling_ignored_during_execution.
+                node_selector_terms)
+            if node_selector_terms is not None:
+                expressions = []
+                for term in node_selector_terms:
+                    if term.match_expressions:
+                        expressions.extend(term.match_expressions)
+                for expression in expressions:
+                    if expression.key == gpu_label and expression.operator == 'In':
+                        gpu_name = label_formatter.get_accelerator_from_label_value(
+                            expression.values[0])
+                        break
 
         resources = resources_lib.Resources(
             cloud=clouds.Kubernetes(),
```
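The new lookup walks the pod's required node affinity (`nodeSelectorTerms` → `matchExpressions`) rather than a flat node selector. The dict-shaped sketch below shows the structure being traversed and the equivalent extraction logic; the label key and value are hypothetical examples, not values from the package:

```python
# YAML-equivalent shape of the affinity block the new code walks.
pod_spec = {
    'affinity': {
        'nodeAffinity': {
            'requiredDuringSchedulingIgnoredDuringExecution': {
                'nodeSelectorTerms': [{
                    'matchExpressions': [{
                        'key': 'example.com/accelerator',  # Hypothetical label.
                        'operator': 'In',
                        'values': ['h100'],
                    }]
                }]
            }
        }
    }
}

gpu_label = 'example.com/accelerator'
terms = (pod_spec['affinity']['nodeAffinity']
         ['requiredDuringSchedulingIgnoredDuringExecution']
         ['nodeSelectorTerms'])
# Flatten all matchExpressions across terms, then pick the first 'In' match.
expressions = [
    expr for term in terms for expr in term.get('matchExpressions', [])
]
gpu_value = next(
    (expr['values'][0]
     for expr in expressions
     if expr['key'] == gpu_label and expr['operator'] == 'In'), None)
print(gpu_value)  # 'h100'
```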
```diff
@@ -3790,3 +3780,13 @@ def should_exclude_pod_from_gpu_allocation(pod) -> bool:
         return True
 
     return False
+
+
+def get_cleaned_context_and_cloud_str(
+        context: Optional[str]) -> Tuple[Optional[str], str]:
+    """Return the cleaned context and relevant cloud string from a context."""
+    cloud_str = 'kubernetes'
+    if context is not None and context.startswith('ssh-'):
+        cloud_str = 'ssh'
+        context = context[len('ssh-'):]
+    return context, cloud_str
```
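This new module-level helper centralizes the `ssh-` prefix handling that the `combine_pod_config_fields` and metadata hunks above now rely on: SSH-cloud contexts carry an `ssh-` prefix that must be stripped before config lookup, and they select the `ssh` config section instead of `kubernetes`. Its behavior, restated as a self-contained snippet with example inputs:

```python
from typing import Optional, Tuple


def get_cleaned_context_and_cloud_str(
        context: Optional[str]) -> Tuple[Optional[str], str]:
    """Strip an 'ssh-' prefix and pick the matching config section name."""
    cloud_str = 'kubernetes'
    if context is not None and context.startswith('ssh-'):
        cloud_str = 'ssh'
        context = context[len('ssh-'):]
    return context, cloud_str


assert get_cleaned_context_and_cloud_str('ssh-my-box') == ('my-box', 'ssh')
assert get_cleaned_context_and_cloud_str('gke_my-ctx') == ('gke_my-ctx',
                                                           'kubernetes')
assert get_cleaned_context_and_cloud_str(None) == (None, 'kubernetes')
```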