skypilot-nightly 1.0.0.dev20250502 (py3-none-any.whl) → 1.0.0.dev20251203 (py3-none-any.whl)
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry; it is provided for informational purposes only.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
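The most sweeping rename in this release moves the catalog code from sky/clouds/service_catalog/ to sky/catalog/, as the brace notation above indicates. As a minimal sketch of what that implies for callers, assuming the new package re-exports the same helpers the old one did (for example list_accelerators, which backs `sky show-gpus`; verify against sky/catalog/__init__.py in the new wheel), an import would change roughly like this:

    # Old layout (1.0.0.dev20250502): catalog helpers under sky.clouds.service_catalog.
    # from sky.clouds import service_catalog
    # accels = service_catalog.list_accelerators(gpus_only=True, clouds='aws')

    # New layout (1.0.0.dev20251203): the same modules live under sky.catalog.
    # The function name and arguments are assumed to be unchanged by the move.
    from sky import catalog

    # Returns a mapping from accelerator name to available instance offerings.
    accels = catalog.list_accelerators(gpus_only=True, clouds='aws')
    print(sorted(accels)[:5])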
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
"""Kubernetes instance provisioning."""
|
|
2
2
|
import copy
|
|
3
|
+
import datetime
|
|
3
4
|
import json
|
|
5
|
+
import re
|
|
6
|
+
import sys
|
|
4
7
|
import time
|
|
5
|
-
from typing import Any,
|
|
6
|
-
import uuid
|
|
8
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
7
9
|
|
|
8
10
|
from sky import exceptions
|
|
11
|
+
from sky import global_user_state
|
|
9
12
|
from sky import sky_logging
|
|
10
13
|
from sky import skypilot_config
|
|
11
14
|
from sky.adaptors import kubernetes
|
|
@@ -13,31 +16,40 @@ from sky.provision import common
|
|
|
13
16
|
from sky.provision import constants
|
|
14
17
|
from sky.provision import docker_utils
|
|
15
18
|
from sky.provision.kubernetes import config as config_lib
|
|
16
|
-
from sky.provision.kubernetes import
|
|
19
|
+
from sky.provision.kubernetes import constants as k8s_constants
|
|
17
20
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
21
|
+
from sky.provision.kubernetes import volume
|
|
18
22
|
from sky.utils import command_runner
|
|
19
23
|
from sky.utils import common_utils
|
|
20
24
|
from sky.utils import config_utils
|
|
21
25
|
from sky.utils import kubernetes_enums
|
|
26
|
+
from sky.utils import rich_utils
|
|
22
27
|
from sky.utils import status_lib
|
|
23
28
|
from sky.utils import subprocess_utils
|
|
24
29
|
from sky.utils import timeline
|
|
25
30
|
from sky.utils import ux_utils
|
|
31
|
+
from sky.utils.db import db_utils
|
|
26
32
|
|
|
27
33
|
POLL_INTERVAL = 2
|
|
28
34
|
_TIMEOUT_FOR_POD_TERMINATION = 60 # 1 minutes
|
|
29
35
|
_MAX_RETRIES = 3
|
|
36
|
+
_MAX_MISSING_PODS_RETRIES = 5
|
|
37
|
+
_MAX_QUERY_INSTANCES_RETRIES = 5
|
|
38
|
+
_QUERY_INSTANCES_RETRY_INTERVAL = .5
|
|
30
39
|
_NUM_THREADS = subprocess_utils.get_parallel_threads('kubernetes')
|
|
31
40
|
|
|
41
|
+
COMMON_NON_PENDING_EVENT_REASONS = {
|
|
42
|
+
'Scheduled', 'Created', 'Started', 'Failed', 'Pulled'
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
# Pattern to extract SSH user from command output, handling MOTD contamination
|
|
46
|
+
_SSH_USER_PATTERN = re.compile(r'SKYPILOT_SSH_USER: ([^\s\n]+)')
|
|
47
|
+
|
|
32
48
|
logger = sky_logging.init_logger(__name__)
|
|
33
|
-
TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
|
|
34
|
-
TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
|
|
35
|
-
TAG_POD_INITIALIZED = 'skypilot-initialized'
|
|
36
|
-
TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name'
|
|
37
49
|
|
|
38
50
|
|
|
39
51
|
def ray_tag_filter(cluster_name: str) -> Dict[str, str]:
|
|
40
|
-
return {TAG_RAY_CLUSTER_NAME: cluster_name}
|
|
52
|
+
return {k8s_constants.TAG_RAY_CLUSTER_NAME: cluster_name}
|
|
41
53
|
|
|
42
54
|
|
|
43
55
|
def _is_head(pod) -> bool:
|
|
@@ -67,12 +79,16 @@ def is_high_availability_cluster_by_kubectl(
|
|
|
67
79
|
namespace: Optional[str] = None) -> bool:
|
|
68
80
|
"""Check if a cluster is a high availability controller by calling
|
|
69
81
|
`kubectl get deployment`.
|
|
82
|
+
|
|
83
|
+
The deployment must have the label `skypilot-cluster-name` set to
|
|
84
|
+
`cluster_name`.
|
|
70
85
|
"""
|
|
71
86
|
try:
|
|
72
87
|
deployment_list = kubernetes.apps_api(
|
|
73
88
|
context).list_namespaced_deployment(
|
|
74
89
|
namespace,
|
|
75
|
-
label_selector=
|
|
90
|
+
label_selector=
|
|
91
|
+
f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
|
|
76
92
|
except kubernetes.api_exception():
|
|
77
93
|
return False
|
|
78
94
|
# It is a high availability cluster if there is at least one deployment
|
|
@@ -186,14 +202,20 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
|
|
|
186
202
|
break
|
|
187
203
|
if event_message is not None:
|
|
188
204
|
if pod_status == 'Pending':
|
|
189
|
-
|
|
205
|
+
out_of = {}
|
|
206
|
+
# key: resource name, value: (extra message, nice name)
|
|
190
207
|
if 'Insufficient cpu' in event_message:
|
|
191
|
-
|
|
192
|
-
|
|
208
|
+
out_of['CPU'] = (': Run \'kubectl get nodes -o '
|
|
209
|
+
'custom-columns=NAME:.metadata.name,'
|
|
210
|
+
'CPU:.status.allocatable.cpu\' to check '
|
|
211
|
+
'the available CPUs on the node.', 'CPUs')
|
|
193
212
|
if 'Insufficient memory' in event_message:
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
213
|
+
out_of['memory'] = (': Run \'kubectl get nodes -o '
|
|
214
|
+
'custom-columns=NAME:.metadata.name,'
|
|
215
|
+
'MEMORY:.status.allocatable.memory\' '
|
|
216
|
+
'to check the available memory on the '
|
|
217
|
+
'node.', 'Memory')
|
|
218
|
+
|
|
197
219
|
# TODO(aylei): after switching from smarter-device-manager to
|
|
198
220
|
# fusermount-server, we need a new way to check whether the
|
|
199
221
|
# fusermount-server daemonset is ready.
|
|
@@ -201,43 +223,79 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
|
|
|
201
223
|
key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
|
|
202
224
|
for key in lf.get_label_keys()
|
|
203
225
|
]
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
226
|
+
for label_key in gpu_lf_keys:
|
|
227
|
+
# TODO(romilb): We may have additional node
|
|
228
|
+
# affinity selectors in the future - in that
|
|
229
|
+
# case we will need to update this logic.
|
|
230
|
+
# TODO(Doyoung): Update the error message raised
|
|
231
|
+
# with the multi-host TPU support.
|
|
232
|
+
gpu_resource_key = kubernetes_utils.get_gpu_resource_key(
|
|
233
|
+
context) # pylint: disable=line-too-long
|
|
234
|
+
if ((f'Insufficient {gpu_resource_key}' in event_message) or
|
|
235
|
+
('didn\'t match Pod\'s node affinity/selector'
|
|
236
|
+
in event_message) and pod.spec.node_selector):
|
|
237
|
+
if 'gpu' in gpu_resource_key.lower():
|
|
238
|
+
info_msg = (
|
|
239
|
+
': Run \'sky show-gpus --infra kubernetes\' to '
|
|
240
|
+
'see the available GPUs.')
|
|
241
|
+
else:
|
|
242
|
+
info_msg = ': '
|
|
243
|
+
if (pod.spec.node_selector and
|
|
244
|
+
label_key in pod.spec.node_selector):
|
|
245
|
+
extra_msg = (
|
|
246
|
+
f'Verify if any node matching label '
|
|
247
|
+
f'{pod.spec.node_selector[label_key]} and '
|
|
248
|
+
f'sufficient resource {gpu_resource_key} '
|
|
249
|
+
f'is available in the cluster.')
|
|
250
|
+
extra_msg = info_msg + ' ' + extra_msg
|
|
251
|
+
else:
|
|
252
|
+
extra_msg = info_msg
|
|
253
|
+
if gpu_resource_key not in out_of or len(
|
|
254
|
+
out_of[gpu_resource_key][0]) < len(extra_msg):
|
|
255
|
+
out_of[f'{gpu_resource_key}'] = (extra_msg, 'GPUs')
|
|
256
|
+
|
|
257
|
+
if len(out_of) > 0:
|
|
258
|
+
# We are out of some resources. We should raise an error.
|
|
259
|
+
rsrc_err_msg = 'Insufficient resource capacity on the '
|
|
260
|
+
rsrc_err_msg += 'cluster:\n'
|
|
261
|
+
out_of_keys = list(out_of.keys())
|
|
262
|
+
for i in range(len(out_of_keys)):
|
|
263
|
+
rsrc = out_of_keys[i]
|
|
264
|
+
(extra_msg, nice_name) = out_of[rsrc]
|
|
265
|
+
extra_msg = extra_msg if extra_msg else ''
|
|
266
|
+
if i == len(out_of_keys) - 1:
|
|
267
|
+
indent = '└──'
|
|
268
|
+
else:
|
|
269
|
+
indent = '├──'
|
|
270
|
+
rsrc_err_msg += (f'{indent} Cluster does not have '
|
|
271
|
+
f'sufficient {nice_name} for your request'
|
|
272
|
+
f'{extra_msg}')
|
|
273
|
+
if i != len(out_of_keys) - 1:
|
|
274
|
+
rsrc_err_msg += '\n'
|
|
275
|
+
|
|
276
|
+
# Emit the error message without logging prefixes for better UX.
|
|
277
|
+
tmp_handler = sky_logging.EnvAwareHandler(sys.stdout)
|
|
278
|
+
tmp_handler.flush = sys.stdout.flush # type: ignore
|
|
279
|
+
tmp_handler.setFormatter(sky_logging.NO_PREFIX_FORMATTER)
|
|
280
|
+
tmp_handler.setLevel(sky_logging.ERROR)
|
|
281
|
+
prev_propagate = logger.propagate
|
|
282
|
+
try:
|
|
283
|
+
logger.addHandler(tmp_handler)
|
|
284
|
+
logger.propagate = False
|
|
285
|
+
logger.error(ux_utils.error_message(f'{rsrc_err_msg}'))
|
|
286
|
+
finally:
|
|
287
|
+
logger.removeHandler(tmp_handler)
|
|
288
|
+
logger.propagate = prev_propagate
|
|
289
|
+
nice_names = [out_of[rsrc][1] for rsrc in out_of_keys]
|
|
290
|
+
raise config_lib.KubernetesError(
|
|
291
|
+
f'{timeout_err_msg} '
|
|
292
|
+
f'Pod status: {pod_status} '
|
|
293
|
+
f'Details: \'{event_message}\' ',
|
|
294
|
+
insufficent_resources=nice_names,
|
|
295
|
+
)
|
|
296
|
+
|
|
239
297
|
raise config_lib.KubernetesError(f'{timeout_err_msg} '
|
|
240
|
-
f'Pod status: {pod_status}'
|
|
298
|
+
f'Pod status: {pod_status} '
|
|
241
299
|
f'Details: \'{event_message}\' ')
|
|
242
300
|
raise config_lib.KubernetesError(f'{timeout_err_msg}')
|
|
243
301
|
|
|
@@ -251,8 +309,89 @@ def _raise_command_running_error(message: str, command: str, pod_name: str,
|
|
|
251
309
|
f'code {rc}: {command!r}\nOutput: {stdout}.')
|
|
252
310
|
|
|
253
311
|
|
|
312
|
+
def _detect_cluster_event_reason_occurred(namespace, context, search_start,
|
|
313
|
+
reason) -> bool:
|
|
314
|
+
|
|
315
|
+
def _convert_to_utc(timestamp):
|
|
316
|
+
if timestamp.tzinfo is None:
|
|
317
|
+
return timestamp.replace(tzinfo=datetime.timezone.utc)
|
|
318
|
+
return timestamp.astimezone(datetime.timezone.utc)
|
|
319
|
+
|
|
320
|
+
def _get_event_timestamp(event):
|
|
321
|
+
if event.last_timestamp:
|
|
322
|
+
return event.last_timestamp
|
|
323
|
+
elif event.metadata.creation_timestamp:
|
|
324
|
+
return event.metadata.creation_timestamp
|
|
325
|
+
return None
|
|
326
|
+
|
|
327
|
+
events = kubernetes.core_api(context).list_namespaced_event(
|
|
328
|
+
namespace=namespace, field_selector=f'reason={reason}')
|
|
329
|
+
for event in events.items:
|
|
330
|
+
ts = _get_event_timestamp(event)
|
|
331
|
+
if ts and _convert_to_utc(ts) > search_start:
|
|
332
|
+
return True
|
|
333
|
+
return False
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def _cluster_had_autoscale_event(namespace, context, search_start) -> bool:
|
|
337
|
+
"""Detects whether the cluster had a autoscaling event after a
|
|
338
|
+
specified datetime. This only works when using cluster-autoscaler.
|
|
339
|
+
|
|
340
|
+
Args:
|
|
341
|
+
namespace: kubernetes namespace
|
|
342
|
+
context: kubernetes context
|
|
343
|
+
search_start (datetime.datetime): filter for events that occurred
|
|
344
|
+
after search_start
|
|
345
|
+
|
|
346
|
+
Returns:
|
|
347
|
+
A boolean whether the cluster has an autoscaling event or not.
|
|
348
|
+
"""
|
|
349
|
+
assert namespace is not None
|
|
350
|
+
|
|
351
|
+
try:
|
|
352
|
+
return _detect_cluster_event_reason_occurred(namespace, context,
|
|
353
|
+
search_start,
|
|
354
|
+
'TriggeredScaleUp')
|
|
355
|
+
except Exception as e: # pylint: disable=broad-except
|
|
356
|
+
logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
|
|
357
|
+
return False
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def _cluster_maybe_autoscaling(namespace, context, search_start) -> bool:
|
|
361
|
+
"""Detects whether a kubernetes cluster may have an autoscaling event.
|
|
362
|
+
|
|
363
|
+
This is not a definitive detection. FailedScheduling, which is an
|
|
364
|
+
event that can occur when not enough resources are present in the cluster,
|
|
365
|
+
which is a trigger for cluster autoscaling. However, FailedScheduling may
|
|
366
|
+
have occurred due to other reasons (cluster itself is abnormal).
|
|
367
|
+
|
|
368
|
+
Hence, this should only be used for autoscalers that don't emit the
|
|
369
|
+
TriggeredScaleUp event, e.g.: Karpenter.
|
|
370
|
+
|
|
371
|
+
Args:
|
|
372
|
+
namespace: kubernetes namespace
|
|
373
|
+
context: kubernetes context
|
|
374
|
+
search_start (datetime.datetime): filter for events that occurred
|
|
375
|
+
after search_start
|
|
376
|
+
|
|
377
|
+
Returns:
|
|
378
|
+
A boolean whether the cluster has an autoscaling event or not.
|
|
379
|
+
"""
|
|
380
|
+
assert namespace is not None
|
|
381
|
+
|
|
382
|
+
try:
|
|
383
|
+
return _detect_cluster_event_reason_occurred(namespace, context,
|
|
384
|
+
search_start,
|
|
385
|
+
'FailedScheduling')
|
|
386
|
+
except Exception as e: # pylint: disable=broad-except
|
|
387
|
+
logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
|
|
388
|
+
return False
|
|
389
|
+
|
|
390
|
+
|
|
254
391
|
@timeline.event
|
|
255
|
-
def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int
|
|
392
|
+
def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,
|
|
393
|
+
cluster_name: str,
|
|
394
|
+
create_pods_start: datetime.datetime):
|
|
256
395
|
"""Wait for all pods to be scheduled.
|
|
257
396
|
|
|
258
397
|
Wait for all pods including jump pod to be scheduled, and if it
|
|
@@ -261,6 +400,9 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
261
400
|
allocated and we can exit.
|
|
262
401
|
|
|
263
402
|
If timeout is set to a negative value, this method will wait indefinitely.
|
|
403
|
+
|
|
404
|
+
Will update the spinner message to indicate autoscaling if autoscaling
|
|
405
|
+
is happening.
|
|
264
406
|
"""
|
|
265
407
|
# Create a set of pod names we're waiting for
|
|
266
408
|
if not new_nodes:
|
|
@@ -268,6 +410,18 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
268
410
|
expected_pod_names = {node.metadata.name for node in new_nodes}
|
|
269
411
|
start_time = time.time()
|
|
270
412
|
|
|
413
|
+
# Variables for autoscaler detection
|
|
414
|
+
autoscaler_type = skypilot_config.get_effective_region_config(
|
|
415
|
+
cloud='kubernetes',
|
|
416
|
+
region=context,
|
|
417
|
+
keys=('autoscaler',),
|
|
418
|
+
default_value=None)
|
|
419
|
+
autoscaler_is_set = autoscaler_type is not None
|
|
420
|
+
use_heuristic_detection = (autoscaler_is_set and
|
|
421
|
+
not kubernetes_enums.KubernetesAutoscalerType(
|
|
422
|
+
autoscaler_type).emits_autoscale_event())
|
|
423
|
+
is_autoscaling = False
|
|
424
|
+
|
|
271
425
|
def _evaluate_timeout() -> bool:
|
|
272
426
|
# If timeout is negative, retry indefinitely.
|
|
273
427
|
if timeout < 0:
|
|
@@ -277,10 +431,13 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
277
431
|
while _evaluate_timeout():
|
|
278
432
|
# Get all pods in a single API call using the cluster name label
|
|
279
433
|
# which all pods in new_nodes should share
|
|
280
|
-
|
|
434
|
+
cluster_name_on_cloud = new_nodes[0].metadata.labels[
|
|
435
|
+
constants.TAG_SKYPILOT_CLUSTER_NAME]
|
|
281
436
|
pods = kubernetes.core_api(context).list_namespaced_pod(
|
|
282
437
|
namespace,
|
|
283
|
-
label_selector=
|
|
438
|
+
label_selector=
|
|
439
|
+
f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
|
|
440
|
+
).items
|
|
284
441
|
|
|
285
442
|
# Get the set of found pod names and check if we have all expected pods
|
|
286
443
|
found_pod_names = {pod.metadata.name for pod in pods}
|
|
@@ -304,6 +461,26 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
304
461
|
|
|
305
462
|
if all_scheduled:
|
|
306
463
|
return
|
|
464
|
+
|
|
465
|
+
# Check if cluster is autoscaling and update spinner message.
|
|
466
|
+
# Minor optimization to not query k8s api after autoscaling
|
|
467
|
+
# event was detected. This is useful because there isn't any
|
|
468
|
+
# autoscaling complete event.
|
|
469
|
+
if autoscaler_is_set and not is_autoscaling:
|
|
470
|
+
if use_heuristic_detection:
|
|
471
|
+
is_autoscaling = _cluster_maybe_autoscaling(
|
|
472
|
+
namespace, context, create_pods_start)
|
|
473
|
+
msg = 'Kubernetes cluster may be scaling up'
|
|
474
|
+
else:
|
|
475
|
+
is_autoscaling = _cluster_had_autoscale_event(
|
|
476
|
+
namespace, context, create_pods_start)
|
|
477
|
+
msg = 'Kubernetes cluster is autoscaling'
|
|
478
|
+
|
|
479
|
+
if is_autoscaling:
|
|
480
|
+
rich_utils.force_update_status(
|
|
481
|
+
ux_utils.spinner_message(f'Launching ({msg})',
|
|
482
|
+
cluster_name=cluster_name))
|
|
483
|
+
|
|
307
484
|
time.sleep(1)
|
|
308
485
|
|
|
309
486
|
# Handle pod scheduling errors
|
|
@@ -319,17 +496,17 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
319
496
|
|
|
320
497
|
|
|
321
498
|
@timeline.event
|
|
322
|
-
def _wait_for_pods_to_run(namespace, context,
|
|
499
|
+
def _wait_for_pods_to_run(namespace, context, cluster_name, new_pods):
|
|
323
500
|
"""Wait for pods and their containers to be ready.
|
|
324
501
|
|
|
325
502
|
Pods may be pulling images or may be in the process of container
|
|
326
503
|
creation.
|
|
327
504
|
"""
|
|
328
|
-
if not
|
|
505
|
+
if not new_pods:
|
|
329
506
|
return
|
|
330
507
|
|
|
331
508
|
# Create a set of pod names we're waiting for
|
|
332
|
-
expected_pod_names = {
|
|
509
|
+
expected_pod_names = {pod.metadata.name for pod in new_pods}
|
|
333
510
|
|
|
334
511
|
def _check_init_containers(pod):
|
|
335
512
|
# Check if any of the init containers failed
|
|
@@ -356,37 +533,40 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
|
|
|
356
533
|
'Failed to create init container for pod '
|
|
357
534
|
f'{pod.metadata.name}. Error details: {msg}.')
|
|
358
535
|
|
|
359
|
-
|
|
360
|
-
#
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
536
|
+
def _inspect_pod_status(pod):
|
|
537
|
+
# Check if pod is terminated/preempted/failed.
|
|
538
|
+
if (pod.metadata.deletion_timestamp is not None or
|
|
539
|
+
pod.status.phase == 'Failed'):
|
|
540
|
+
# Get the reason and write to cluster events before
|
|
541
|
+
# the pod gets completely deleted from the API.
|
|
542
|
+
termination_reason = _get_pod_termination_reason(pod, cluster_name)
|
|
543
|
+
logger.warning(
|
|
544
|
+
f'Pod {pod.metadata.name} terminated: {termination_reason}')
|
|
545
|
+
raise config_lib.KubernetesError(
|
|
546
|
+
f'Pod {pod.metadata.name} has terminated or failed '
|
|
547
|
+
f'unexpectedly. Run `sky logs --provision {cluster_name}` '
|
|
548
|
+
'for more details.')
|
|
549
|
+
|
|
550
|
+
container_statuses = pod.status.container_statuses
|
|
551
|
+
# Continue if pod and all the containers within the
|
|
552
|
+
# pod are successfully created and running.
|
|
553
|
+
if (pod.status.phase == 'Running' and container_statuses is not None and
|
|
554
|
+
all(container.state.running
|
|
555
|
+
for container in container_statuses)):
|
|
556
|
+
return True, None
|
|
557
|
+
|
|
558
|
+
reason: Optional[str] = None
|
|
559
|
+
if pod.status.phase == 'Pending':
|
|
560
|
+
pending_reason = _get_pod_pending_reason(context, namespace,
|
|
561
|
+
pod.metadata.name)
|
|
562
|
+
if pending_reason is not None:
|
|
563
|
+
reason, message = pending_reason
|
|
564
|
+
logger.debug(f'Pod {pod.metadata.name} is pending: '
|
|
565
|
+
f'{reason}: {message}')
|
|
566
|
+
|
|
567
|
+
# Iterate over each container in pod to check their status
|
|
568
|
+
if container_statuses is not None:
|
|
569
|
+
for container_status in container_statuses:
|
|
390
570
|
# If the container wasn't in 'ContainerCreating'
|
|
391
571
|
# state, then we know pod wasn't scheduled or
|
|
392
572
|
# had some other error, such as image pull error.
|
|
@@ -397,43 +577,86 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
|
|
|
397
577
|
if waiting.reason == 'PodInitializing':
|
|
398
578
|
_check_init_containers(pod)
|
|
399
579
|
elif waiting.reason != 'ContainerCreating':
|
|
400
|
-
msg = waiting.message if
|
|
401
|
-
waiting)
|
|
580
|
+
msg = waiting.message if (
|
|
581
|
+
waiting.message) else str(waiting)
|
|
402
582
|
raise config_lib.KubernetesError(
|
|
403
583
|
'Failed to create container while launching '
|
|
404
584
|
f'the node. Error details: {msg}.')
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
585
|
+
return False, reason
|
|
586
|
+
|
|
587
|
+
missing_pods_retry = 0
|
|
588
|
+
last_status_msg: Optional[str] = None
|
|
589
|
+
while True:
|
|
590
|
+
# Get all pods in a single API call
|
|
591
|
+
cluster_name_on_cloud = new_pods[0].metadata.labels[
|
|
592
|
+
constants.TAG_SKYPILOT_CLUSTER_NAME]
|
|
593
|
+
all_pods = kubernetes.core_api(context).list_namespaced_pod(
|
|
594
|
+
namespace,
|
|
595
|
+
label_selector=
|
|
596
|
+
f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
|
|
597
|
+
).items
|
|
598
|
+
|
|
599
|
+
# Get the set of found pod names and check if we have all expected pods
|
|
600
|
+
found_pod_names = {pod.metadata.name for pod in all_pods}
|
|
601
|
+
missing_pod_names = expected_pod_names - found_pod_names
|
|
602
|
+
if missing_pod_names:
|
|
603
|
+
# In _wait_for_pods_to_schedule, we already wait for all pods to go
|
|
604
|
+
# from pending to scheduled. So if a pod is missing here, it means
|
|
605
|
+
# something unusual must have happened, and so should be treated as
|
|
606
|
+
# an exception.
|
|
607
|
+
# It is also only in _wait_for_pods_to_schedule that
|
|
608
|
+
# provision_timeout is used.
|
|
609
|
+
# TODO(kevin): Should we take provision_timeout into account here,
|
|
610
|
+
# instead of hardcoding the number of retries?
|
|
611
|
+
if missing_pods_retry >= _MAX_MISSING_PODS_RETRIES:
|
|
612
|
+
for pod_name in missing_pod_names:
|
|
613
|
+
reason = _get_pod_missing_reason(context, namespace,
|
|
614
|
+
cluster_name, pod_name)
|
|
615
|
+
logger.warning(f'Pod {pod_name} missing: {reason}')
|
|
616
|
+
raise config_lib.KubernetesError(
|
|
617
|
+
f'Failed to get all pods after {missing_pods_retry} '
|
|
618
|
+
f'retries. Some pods may have been terminated or failed '
|
|
619
|
+
f'unexpectedly. Run `sky logs --provision {cluster_name}` '
|
|
620
|
+
'for more details.')
|
|
621
|
+
logger.info('Retrying running pods check: '
|
|
622
|
+
f'Missing pods: {missing_pod_names}')
|
|
623
|
+
time.sleep(0.5)
|
|
624
|
+
missing_pods_retry += 1
|
|
625
|
+
continue
|
|
626
|
+
|
|
627
|
+
pods_to_check = [
|
|
628
|
+
pod for pod in all_pods if pod.metadata.name in expected_pod_names
|
|
629
|
+
]
|
|
630
|
+
pod_statuses = subprocess_utils.run_in_parallel(_inspect_pod_status,
|
|
631
|
+
pods_to_check,
|
|
632
|
+
_NUM_THREADS)
|
|
633
|
+
|
|
634
|
+
all_pods_running = True
|
|
635
|
+
pending_reasons_count: Dict[str, int] = {}
|
|
636
|
+
for is_running, pending_reason in pod_statuses:
|
|
637
|
+
if not is_running:
|
|
638
|
+
all_pods_running = False
|
|
639
|
+
if pending_reason is not None:
|
|
640
|
+
pending_reasons_count[pending_reason] = (
|
|
641
|
+
pending_reasons_count.get(pending_reason, 0) + 1)
|
|
408
642
|
|
|
409
643
|
if all_pods_running:
|
|
410
644
|
break
|
|
411
|
-
time.sleep(1)
|
|
412
|
-
|
|
413
645
|
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
try:
|
|
429
|
-
return func()
|
|
430
|
-
except config_lib.KubernetesError:
|
|
431
|
-
if attempt < max_retries:
|
|
432
|
-
logger.warning(f'Failed to {operation_name} - '
|
|
433
|
-
f'retrying in {retry_delay} seconds.')
|
|
434
|
-
time.sleep(retry_delay)
|
|
435
|
-
else:
|
|
436
|
-
raise
|
|
646
|
+
if pending_reasons_count:
|
|
647
|
+
msg = ', '.join([
|
|
648
|
+
f'{count} pod(s) pending due to {reason}'
|
|
649
|
+
for reason, count in sorted(pending_reasons_count.items())
|
|
650
|
+
])
|
|
651
|
+
status_text = f'Launching ({msg})'
|
|
652
|
+
else:
|
|
653
|
+
status_text = 'Launching'
|
|
654
|
+
new_status_msg = ux_utils.spinner_message(status_text,
|
|
655
|
+
cluster_name=cluster_name)
|
|
656
|
+
if new_status_msg != last_status_msg:
|
|
657
|
+
rich_utils.force_update_status(new_status_msg)
|
|
658
|
+
last_status_msg = new_status_msg
|
|
659
|
+
time.sleep(1)
|
|
437
660
|
|
|
438
661
|
|
|
439
662
|
@timeline.event
|
|
@@ -670,26 +893,11 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
|
|
|
670
893
|
raise e
|
|
671
894
|
|
|
672
895
|
|
|
673
|
-
def _create_persistent_volume_claim(namespace: str, context: Optional[str],
|
|
674
|
-
pvc_spec: Dict[str, Any]) -> None:
|
|
675
|
-
"""Creates a persistent volume claim for SkyServe controller."""
|
|
676
|
-
try:
|
|
677
|
-
kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
|
|
678
|
-
name=pvc_spec['metadata']['name'], namespace=namespace)
|
|
679
|
-
return
|
|
680
|
-
except kubernetes.api_exception() as e:
|
|
681
|
-
if e.status != 404: # Not found
|
|
682
|
-
raise
|
|
683
|
-
|
|
684
|
-
kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
|
|
685
|
-
namespace=namespace, body=pvc_spec)
|
|
686
|
-
|
|
687
|
-
|
|
688
896
|
@timeline.event
|
|
689
897
|
def _wait_for_deployment_pod(context,
|
|
690
898
|
namespace,
|
|
691
899
|
deployment,
|
|
692
|
-
timeout=
|
|
900
|
+
timeout=300) -> List:
|
|
693
901
|
label_selector = ','.join([
|
|
694
902
|
f'{key}={value}'
|
|
695
903
|
for key, value in deployment.spec.selector.match_labels.items()
|
|
@@ -721,13 +929,14 @@ def _wait_for_deployment_pod(context,
|
|
|
721
929
|
|
|
722
930
|
|
|
723
931
|
@timeline.event
|
|
724
|
-
def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
932
|
+
def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
725
933
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
726
934
|
"""Create pods based on the config."""
|
|
727
935
|
provider_config = config.provider_config
|
|
728
936
|
namespace = kubernetes_utils.get_namespace_from_config(provider_config)
|
|
729
937
|
context = kubernetes_utils.get_context_from_config(provider_config)
|
|
730
938
|
pod_spec = copy.deepcopy(config.node_config)
|
|
939
|
+
create_pods_start = datetime.datetime.now(datetime.timezone.utc)
|
|
731
940
|
|
|
732
941
|
to_create_deployment = 'deployment_spec' in pod_spec
|
|
733
942
|
if to_create_deployment:
|
|
@@ -744,7 +953,26 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
744
953
|
else:
|
|
745
954
|
pod_spec['metadata']['labels'] = tags
|
|
746
955
|
pod_spec['metadata']['labels'].update(
|
|
747
|
-
{TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
|
|
956
|
+
{constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
|
|
957
|
+
|
|
958
|
+
ephemeral_volumes = provider_config.get('ephemeral_volume_infos')
|
|
959
|
+
if ephemeral_volumes:
|
|
960
|
+
for ephemeral_volume in ephemeral_volumes:
|
|
961
|
+
# Update the volumes and volume mounts in the pod spec
|
|
962
|
+
if 'volumes' not in pod_spec['spec']:
|
|
963
|
+
pod_spec['spec']['volumes'] = []
|
|
964
|
+
pod_spec['spec']['volumes'].append({
|
|
965
|
+
'name': ephemeral_volume.name,
|
|
966
|
+
'persistentVolumeClaim': {
|
|
967
|
+
'claimName': ephemeral_volume.volume_name_on_cloud,
|
|
968
|
+
},
|
|
969
|
+
})
|
|
970
|
+
if 'volumeMounts' not in pod_spec['spec']['containers'][0]:
|
|
971
|
+
pod_spec['spec']['containers'][0]['volumeMounts'] = []
|
|
972
|
+
pod_spec['spec']['containers'][0]['volumeMounts'].append({
|
|
973
|
+
'name': ephemeral_volume.name,
|
|
974
|
+
'mountPath': ephemeral_volume.path,
|
|
975
|
+
})
|
|
748
976
|
|
|
749
977
|
terminating_pods = kubernetes_utils.filter_pods(namespace, context, tags,
|
|
750
978
|
['Terminating'])
|
|
@@ -776,8 +1004,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
776
1004
|
running_pods = kubernetes_utils.filter_pods(namespace, context, tags,
|
|
777
1005
|
['Pending', 'Running'])
|
|
778
1006
|
head_pod_name = _get_head_pod_name(running_pods)
|
|
1007
|
+
running_pod_statuses = [{
|
|
1008
|
+
pod.metadata.name: pod.status.phase
|
|
1009
|
+
} for pod in running_pods.values()]
|
|
779
1010
|
logger.debug(f'Found {len(running_pods)} existing pods: '
|
|
780
|
-
f'{
|
|
1011
|
+
f'{running_pod_statuses}')
|
|
781
1012
|
|
|
782
1013
|
to_start_count = config.count - len(running_pods)
|
|
783
1014
|
if to_start_count < 0:
|
|
@@ -793,7 +1024,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
793
1024
|
nvidia_runtime_exists = False
|
|
794
1025
|
try:
|
|
795
1026
|
nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class(
|
|
796
|
-
context)
|
|
1027
|
+
context=context)
|
|
797
1028
|
except kubernetes.kubernetes.client.ApiException as e:
|
|
798
1029
|
logger.warning('run_instances: Error occurred while checking for '
|
|
799
1030
|
f'nvidia RuntimeClass - '
|
|
@@ -804,14 +1035,18 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
804
1035
|
'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html') # pylint: disable=line-too-long
|
|
805
1036
|
|
|
806
1037
|
needs_gpus = False
|
|
1038
|
+
needs_gpus_nvidia = False
|
|
807
1039
|
limits = pod_spec['spec']['containers'][0].get('resources',
|
|
808
1040
|
{}).get('limits')
|
|
809
1041
|
if limits is not None:
|
|
810
|
-
needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(),
|
|
1042
|
+
needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(context),
|
|
1043
|
+
0) > 0
|
|
1044
|
+
needs_gpus_nvidia = limits.get(
|
|
1045
|
+
kubernetes_utils.SUPPORTED_GPU_RESOURCE_KEYS['nvidia'], 0) > 0
|
|
811
1046
|
|
|
812
1047
|
# TPU pods provisioned on GKE use the default containerd runtime.
|
|
813
1048
|
# Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview # pylint: disable=line-too-long
|
|
814
|
-
if nvidia_runtime_exists and
|
|
1049
|
+
if nvidia_runtime_exists and needs_gpus_nvidia:
|
|
815
1050
|
pod_spec['spec']['runtimeClassName'] = 'nvidia'
|
|
816
1051
|
|
|
817
1052
|
logger.debug(f'run_instances: calling create_namespaced_pod '
|
|
@@ -819,19 +1054,46 @@ def _create_pods(region: str, cluster_name_on_cloud: str,

     def _create_resource_thread(i: int):
         pod_spec_copy = copy.deepcopy(pod_spec)
-
-
-
-
-
-
+        # 0 is for head pod, while 1+ is for worker pods.
+        if i == 0:
+            if head_pod_name is None:
+                # First pod should be head if no head exists
+                pod_spec_copy['metadata']['labels'].update(
+                    constants.HEAD_NODE_TAGS)
+                head_selector = _head_service_selector(cluster_name_on_cloud)
+                pod_spec_copy['metadata']['labels'].update(head_selector)
+                pod_spec_copy['metadata'][
+                    'name'] = f'{cluster_name_on_cloud}-head'
+            else:
+                # If head pod already exists, we skip creating it.
+                return
         else:
             # Worker pods
             pod_spec_copy['metadata']['labels'].update(
                 constants.WORKER_NODE_TAGS)
-
-            pod_name
-
+            pod_name = f'{cluster_name_on_cloud}-worker{i}'
+            if pod_name in running_pods:
+                # If the pod is already running, we skip creating it.
+                return
+            pod_spec_copy['metadata']['name'] = pod_name
+            pod_spec_copy['metadata']['labels']['component'] = pod_name
+
+        # We need to keep the following fields in the pod spec to be same for
+        # head and worker pods.
+        # So that Kueue can merge them into a single PodSet when creating
+        # ProvisioningRequest to trigger scale up of the cluster autoscaler,
+        # this is especially required for DWS queued provisioning mode in GKE.
+        # spec.containers[*].resources.requests
+        # spec.initContainers[*].resources.requests
+        # spec.resources
+        # spec.nodeSelector
+        # spec.tolerations
+        # spec.affinity
+        # resourceClaims
+        # Refer to the following links for more details:
+        # https://cloud.google.com/kubernetes-engine/docs/how-to/provisioningrequest#define_a_provisioningrequest_object # pylint: disable=line-too-long
+        # https://kueue.sigs.k8s.io/docs/admission-check-controllers/provisioning/#podset-merge-policy # pylint: disable=line-too-long
+        if config.count > 1:
             # For multi-node support, we put a soft-constraint to schedule
             # worker pods on different nodes than the head pod.
             # This is not set as a hard constraint because if different nodes
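The rewritten `_create_resource_thread` derives pod names purely from the index: index 0 is the head, index i maps to `<cluster>-worker<i>`, and a thread returns early when its pod already exists. A hedged sketch of that naming/skip logic is below; the label keys shown are illustrative, not SkyPilot's exact tags.

from typing import Any, Dict, Optional, Set


def name_pod(pod_spec: Dict[str, Any], cluster: str, i: int,
             head_pod_name: Optional[str],
             running_pods: Set[str]) -> Optional[str]:
    """Decide the pod name for index i, or return None to skip creation.

    Worker indices map to fixed names so re-launching after a node failure
    recreates exactly the missing workers; existing ones are skipped.
    """
    metadata = pod_spec.setdefault('metadata', {})
    labels = metadata.setdefault('labels', {})
    if i == 0:
        if head_pod_name is not None:
            return None  # head already exists
        name = f'{cluster}-head'
        labels['ray-node-type'] = 'head'  # illustrative head tag
    else:
        name = f'{cluster}-worker{i}'
        if name in running_pods:
            return None  # this worker is already up
        labels['ray-node-type'] = 'worker'  # illustrative worker tag
        labels['component'] = name
    metadata['name'] = name
    return name


if __name__ == '__main__':
    spec: Dict[str, Any] = {}
    print(name_pod(spec, 'sky-abc', 2, 'sky-abc-head', {'sky-abc-worker1'}))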
@@ -850,7 +1112,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                     'podAffinityTerm': {
                         'labelSelector': {
                             'matchExpressions': [{
-                                'key': TAG_SKYPILOT_CLUSTER_NAME,
+                                'key': constants.TAG_SKYPILOT_CLUSTER_NAME,
                                 'operator': 'In',
                                 'values': [cluster_name_on_cloud]
                             }]
@@ -883,9 +1145,25 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             pod_spec_copy['spec']['tolerations'] = existing_tolerations + [
                 tpu_toleration
             ]
+        # Add GPU toleration if GPU is requested.
+        # The nodes provisioned by DWS with flex start with queued provisioning
+        # mode have the GPU taint, so we have to add the GPU toleration.
+        # No need to check if DWS is enabled here since this has no side effect
+        # to the non-DWS case.
+        if needs_gpus:
+            gpu_toleration = {
+                'key': kubernetes_utils.get_gpu_resource_key(context),
+                'operator': 'Exists',
+                'effect': 'NoSchedule'
+            }
+            # Preserve existing tolerations if any
+            existing_tolerations = pod_spec_copy['spec'].get('tolerations', [])
+            pod_spec_copy['spec']['tolerations'] = existing_tolerations + [
+                gpu_toleration
+            ]

         if to_create_deployment:
-
+            volume.create_persistent_volume_claim(namespace, context, pvc_spec)

             # It's safe to directly modify the template spec in the deployment spec
             # because controller pod is singleton, i in [0].
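The new branch above unconditionally tolerates the GPU taint whenever GPUs are requested, because DWS flex-start nodes come up tainted; on untainted nodes the extra toleration is a no-op. A minimal sketch, assuming a fixed `nvidia.com/gpu` key (the diff resolves the key per context):

from typing import Any, Dict

GPU_RESOURCE_KEY = 'nvidia.com/gpu'  # illustrative; resolved per-context above


def add_gpu_toleration(pod_spec: Dict[str, Any]) -> None:
    """Append a NoSchedule toleration for the GPU resource key, keeping any
    tolerations that are already present on the pod spec."""
    toleration = {
        'key': GPU_RESOURCE_KEY,
        'operator': 'Exists',
        'effect': 'NoSchedule',
    }
    existing = pod_spec['spec'].get('tolerations', [])
    pod_spec['spec']['tolerations'] = existing + [toleration]


if __name__ == '__main__':
    spec = {'spec': {'containers': []}}
    add_gpu_toleration(spec)
    print(spec['spec']['tolerations'])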
@@ -893,9 +1171,12 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
             # Add the deployment name as a label to the pod spec
             deployment_name = deployment_spec['metadata']['name']
             pod_spec_copy['metadata']['labels'][
-                TAG_SKYPILOT_DEPLOYMENT_NAME] = deployment_name
+                k8s_constants.TAG_SKYPILOT_DEPLOYMENT_NAME] = deployment_name
             template_pod_spec['metadata'] = pod_spec_copy['metadata']
             template_pod_spec['spec'].update(pod_spec_copy['spec'])
+            # Propagate the labels to the deployment for identification.
+            deployment_spec['metadata']['labels'] = pod_spec_copy['metadata'][
+                'labels']
             try:
                 return kubernetes.apps_api(
                     context).create_namespaced_deployment(
@@ -904,6 +1185,10 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                 print('Deployment failed', e)
                 raise e

+        # Check if any PVCs with access mode ReadWriteOnce or ReadWriteOncePod
+        # is used by any pod in the namespace.
+        volume.check_pvc_usage_for_pod(context, namespace, pod_spec_copy)
+
         return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
                                                    context)

@@ -922,9 +1207,16 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                    'and then up the cluster again.')
         raise exceptions.InconsistentHighAvailabilityError(message)

-
-
-
+    created_resources = []
+    if to_start_count > 0:
+        # Create pods in parallel.
+        # Use `config.count` instead of `to_start_count` to keep the index of
+        # the Pods consistent especially for the case where some Pods are down
+        # due to node failure or manual termination, etc. and then launch
+        # again to create the Pods back.
+        # The existing Pods will be skipped in _create_resource_thread.
+        created_resources = subprocess_utils.run_in_parallel(
+            _create_resource_thread, list(range(config.count)), _NUM_THREADS)

     if to_create_deployment:
         deployments = copy.deepcopy(created_resources)
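The creation loop above iterates over `range(config.count)` rather than only the missing pods, so worker indices stay stable across relaunches; threads for pods that already exist simply return `None`. A small sketch of that pattern using `concurrent.futures` (the real code delegates to `subprocess_utils.run_in_parallel`):

import concurrent.futures
from typing import Callable, List, Optional, TypeVar

T = TypeVar('T')


def run_in_parallel(fn: Callable[[int], Optional[T]], indices: List[int],
                    num_threads: int) -> List[Optional[T]]:
    """Run fn over all indices; entries for skipped indices come back as None."""
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as pool:
        return list(pool.map(fn, indices))


if __name__ == '__main__':
    existing = {1}  # pretend worker 1 is already running

    def create(i: int) -> Optional[str]:
        # Mirrors the early return in _create_resource_thread for existing pods.
        return None if i in existing else f'created-pod-{i}'

    print(run_in_parallel(create, list(range(3)), num_threads=4))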
@@ -937,20 +1229,22 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
         pods = created_resources

     created_pods = {}
+    valid_pods = []
     for pod in pods:
+        # In case Pod is not created
+        if pod is None:
+            continue
+        valid_pods.append(pod)
         created_pods[pod.metadata.name] = pod
         if head_pod_name is None and _is_head(pod):
             head_pod_name = pod.metadata.name
+    pods = valid_pods
+
+    # The running_pods may include Pending Pods, so we add them to the pods
+    # list to wait for scheduling and running
+    if running_pods:
+        pods = pods + list(running_pods.values())

-    networking_mode = network_utils.get_networking_mode(
-        config.provider_config.get('networking_mode'))
-    if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-        # Adding the jump pod to the new_nodes list as well so it can be
-        # checked if it's scheduled and running along with other pods.
-        ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
-        jump_pod = kubernetes.core_api(context).read_namespaced_pod(
-            ssh_jump_pod_name, namespace)
-        pods.append(jump_pod)
     provision_timeout = provider_config['timeout']

     wait_str = ('indefinitely'
@@ -960,12 +1254,21 @@ def _create_pods(region: str, cluster_name_on_cloud: str,

     # Wait until the pods are scheduled and surface cause for error
     # if there is one
-    _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout
+    _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout,
+                               cluster_name, create_pods_start)
+    # Reset spinner message here because it might have hinted autoscaling
+    # while waiting for pods to schedule.
+    rich_utils.force_update_status(
+        ux_utils.spinner_message('Launching', cluster_name=cluster_name))
     # Wait until the pods and their containers are up and running, and
     # fail early if there is an error
-    logger.debug(f'run_instances: waiting for pods to be running
-                 f'
-    _wait_for_pods_to_run(namespace, context, pods)
+    logger.debug(f'run_instances: waiting for pods to be running: '
+                 f'{[pod.metadata.name for pod in pods]}')
+    _wait_for_pods_to_run(namespace, context, cluster_name, pods)
+    # Reset spinner message here because it might have hinted the reason
+    # pods were pending.
+    rich_utils.force_update_status(
+        ux_utils.spinner_message('Launching', cluster_name=cluster_name))
     logger.debug(f'run_instances: all pods are scheduled and running: '
                  f'{[pod.metadata.name for pod in pods]}')

@@ -981,11 +1284,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     )


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
     try:
-        return _create_pods(region, cluster_name_on_cloud, config)
+        return _create_pods(region, cluster_name, cluster_name_on_cloud, config)
     except (kubernetes.api_exception(), config_lib.KubernetesError) as e:
         e_msg = common_utils.format_exception(e).replace('\n', ' ')
         logger.warning('run_instances: Error occurred when creating pods: '
@@ -1006,42 +1309,10 @@ def stop_instances(
     raise NotImplementedError()


-def
-
-
-
-    Args:
-        delete_func: Function to call to delete the resource
-        resource_type: Type of resource being deleted (e.g. 'service'),
-            used in logging
-        resource_name: Name of the resource being deleted, used in logging
-    """
-    max_retries = 3
-    retry_delay = 5  # seconds
-
-    for attempt in range(max_retries):
-        try:
-            delete_func()
-            return
-        except kubernetes.api_exception() as e:
-            if e.status == 404:
-                logger.warning(
-                    f'terminate_instances: Tried to delete {resource_type} '
-                    f'{resource_name}, but the {resource_type} was not '
-                    'found (404).')
-                return
-            elif attempt < max_retries - 1:
-                logger.warning(f'terminate_instances: Failed to delete '
-                               f'{resource_type} {resource_name} (attempt '
-                               f'{attempt + 1}/{max_retries}). Error: {e}. '
-                               f'Retrying in {retry_delay} seconds...')
-                time.sleep(retry_delay)
-            else:
-                raise
-
-
-def _delete_services(name_prefix: str, namespace: str,
-                     context: Optional[str]) -> None:
+def _delete_services(name_prefix: str,
+                     namespace: str,
+                     context: Optional[str],
+                     skip_ssh_service: bool = False) -> None:
     """Delete services with the given name prefix.

     Args:
@@ -1050,18 +1321,21 @@ def _delete_services(name_prefix: str, namespace: str,
         context: Kubernetes context
     """
     # TODO(andy): We should use tag for the service filter.
-
+    services = ([name_prefix, f'{name_prefix}-ssh']
+                if not skip_ssh_service else [name_prefix])
+    for service_name in services:
         # Since we are not saving this lambda, it's a false positive.
         # TODO(andyl): Wait for
         # https://github.com/pylint-dev/pylint/issues/5263.
         # pylint: disable=cell-var-from-loop
-
-
-
-
-
-
-
+        kubernetes_utils.delete_k8s_resource_with_retry(
+            delete_func=lambda: kubernetes.core_api(
+                context).delete_namespaced_service(name=service_name,
+                                                   namespace=namespace,
+                                                   _request_timeout=config_lib.
+                                                   DELETION_TIMEOUT),
+            resource_type='service',
+            resource_name=service_name)


 def _terminate_node(namespace: str,
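Both hunks above replace a hand-rolled retry loop with `kubernetes_utils.delete_k8s_resource_with_retry`, whose implementation is not shown in this diff. The sketch below only illustrates the general shape such a wrapper typically has (treat 404 as success, retry transient errors); `delete_with_retry` and `NotFoundError` are hypothetical names standing in for the real helper and the Kubernetes ApiException.

import time
from typing import Callable


class NotFoundError(Exception):
    """Stand-in for a Kubernetes 404 ApiException."""


def delete_with_retry(delete_func: Callable[[], None], resource_type: str,
                      resource_name: str, max_retries: int = 3,
                      retry_delay: float = 5.0) -> None:
    """Call delete_func, tolerating 404s and retrying transient failures."""
    for attempt in range(max_retries):
        try:
            delete_func()
            return
        except NotFoundError:
            # Already gone: treat as success.
            print(f'{resource_type} {resource_name} not found (404), skipping')
            return
        except Exception as e:  # pylint: disable=broad-except
            if attempt == max_retries - 1:
                raise
            print(f'Failed to delete {resource_type} {resource_name} '
                  f'(attempt {attempt + 1}/{max_retries}): {e}; retrying')
            time.sleep(retry_delay)


if __name__ == '__main__':
    delete_with_retry(lambda: None, 'service', 'demo-svc')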
@@ -1075,13 +1349,16 @@ def _terminate_node(namespace: str,
         # Delete services for the head pod
         # services are specified in sky/templates/kubernetes-ray.yml.j2
         _delete_services(pod_name, namespace, context)
+    else:
+        # No ssh service is created for worker pods
+        _delete_services(pod_name, namespace, context, skip_ssh_service=True)

     # Note - delete pod after all other resources are deleted.
     # This is to ensure there are no leftover resources if this down is run
     # from within the pod, e.g., for autodown.
     # Note - some misbehaving pods may not terminate gracefully if they have
     # open file descriptors. We force delete pods to avoid this.
-
+    kubernetes_utils.delete_k8s_resource_with_retry(
         delete_func=lambda: kubernetes.core_api(context).delete_namespaced_pod(
             name=pod_name,
             namespace=namespace,
@@ -1099,26 +1376,28 @@ def _terminate_deployment(cluster_name: str, namespace: str,

     # Delete deployment
     deployment_name = _get_deployment_name(cluster_name)
-
-
-
-
-
-
-
+    kubernetes_utils.delete_k8s_resource_with_retry(
+        delete_func=lambda: kubernetes.apps_api(
+            context).delete_namespaced_deployment(name=deployment_name,
+                                                  namespace=namespace,
+                                                  _request_timeout=config_lib.
+                                                  DELETION_TIMEOUT),
+        resource_type='deployment',
+        resource_name=deployment_name)

     # Delete PVCs
     pvc_name = _get_pvc_name(
         cluster_name,
         kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME)
     # pylint: disable=cell-var-from-loop
-
-
-
-
-
-
-
+    kubernetes_utils.delete_k8s_resource_with_retry(
+        delete_func=lambda: kubernetes.core_api(
+            context).delete_namespaced_persistent_volume_claim(
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=config_lib.DELETION_TIMEOUT),
+        resource_type='pvc',
+        resource_name=pvc_name)


 def terminate_instances(
@@ -1133,18 +1412,6 @@ def terminate_instances(
                                          ray_tag_filter(cluster_name_on_cloud),
                                          None)

-    # Clean up the SSH jump pod if in use
-    networking_mode = network_utils.get_networking_mode(
-        provider_config.get('networking_mode'))
-    if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-        pod_name = list(pods.keys())[0]
-        try:
-            kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context,
-                                                       pod_name)
-        except Exception as e:  # pylint: disable=broad-except
-            logger.warning('terminate_instances: Error occurred when analyzing '
-                           f'SSH Jump pod: {e}')
-
     if is_high_availability_cluster_by_kubectl(cluster_name_on_cloud, context,
                                                namespace):
         # For high availability controllers, terminate the deployment
@@ -1175,16 +1442,11 @@ def get_cluster_info(

     running_pods = kubernetes_utils.filter_pods(
         namespace, context, ray_tag_filter(cluster_name_on_cloud), ['Running'])
+    logger.debug(f'Running pods: {list(running_pods.keys())}')

     pods: Dict[str, List[common.InstanceInfo]] = {}
     head_pod_name = None

-    port_forward_mode = kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD
-    network_mode_str = skypilot_config.get_nested(('kubernetes', 'networking'),
-                                                  port_forward_mode.value)
-    network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
-        network_mode_str)
-    external_ip = kubernetes_utils.get_external_ip(network_mode, context)
     port = 22
     if not provider_config.get('use_internal_ips', False):
         port = kubernetes_utils.get_head_ssh_port(cluster_name_on_cloud,
@@ -1198,10 +1460,12 @@ def get_cluster_info(
             common.InstanceInfo(
                 instance_id=pod_name,
                 internal_ip=internal_ip,
-                external_ip=
-                external_ip),
+                external_ip=None,
                 ssh_port=port,
                 tags=pod.metadata.labels,
+                # TODO(hailong): `cluster.local` may need to be configurable
+                # Service name is same as the pod name for now.
+                internal_svc=f'{pod_name}.{namespace}.svc.cluster.local',
             )
         ]
         if _is_head(pod):
@@ -1210,10 +1474,16 @@ def get_cluster_info(
     assert head_spec is not None, pod
     cpu_request = head_spec.containers[0].resources.requests['cpu']

-
+    if cpu_request is None:
+        raise RuntimeError(f'Pod {cluster_name_on_cloud}-head not found'
+                           ' or not Running, check the Pod status')

     ssh_user = 'sky'
-
+    # Use pattern matching to extract SSH user, handling MOTD contamination.
+    # Some container images (like CUDA-Q) print MOTD when login shells start,
+    # which can contaminate command output. We use a unique pattern to extract
+    # the actual username reliably.
+    get_k8s_ssh_user_cmd = 'echo "SKYPILOT_SSH_USER: $(whoami)"'
     assert head_pod_name is not None
     runner = command_runner.KubernetesCommandRunner(
         ((namespace, context), head_pod_name))
@@ -1223,10 +1493,24 @@ def get_cluster_info(
                                      stream_logs=False)
     _raise_command_running_error('get ssh user', get_k8s_ssh_user_cmd,
                                  head_pod_name, rc, stdout + stderr)
-
+
+    # Extract SSH user using pattern matching
+    ssh_user_match = _SSH_USER_PATTERN.search(stdout)
+    if ssh_user_match:
+        ssh_user = ssh_user_match.group(1)
+    else:
+        raise ValueError('Failed to find SSH user identifier: '
+                         f'{stdout + stderr}')
     logger.debug(
         f'Using ssh user {ssh_user} for cluster {cluster_name_on_cloud}')

+    # cpu_request may be a string like `100m`, need to parse and convert
+    num_cpus = kubernetes_utils.parse_cpu_or_gpu_resource_to_float(cpu_request)
+    # 'num-cpus' for ray must be an integer, but we should not set it to 0 if
+    # cpus is <1.
+    # Keep consistent with the logic in clouds/kubernetes.py
+    str_cpus = str(max(int(num_cpus), 1))
+
     return common.ClusterInfo(
         instances=pods,
         head_instance_id=head_pod_name,
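The SSH-user detection above echoes a `SKYPILOT_SSH_USER:` marker and then searches the output with `_SSH_USER_PATTERN`, so MOTD noise printed by images such as CUDA-Q cannot corrupt the result. The pattern itself is defined outside this hunk; the sketch below assumes an equivalent regex.

import re

# Illustrative pattern; the real one is defined elsewhere as _SSH_USER_PATTERN.
SSH_USER_PATTERN = re.compile(r'SKYPILOT_SSH_USER: ([^\n]+)')


def extract_ssh_user(stdout: str) -> str:
    """Pull the username out of command output that may include an MOTD."""
    match = SSH_USER_PATTERN.search(stdout)
    if match is None:
        raise ValueError(f'Failed to find SSH user identifier: {stdout!r}')
    return match.group(1).strip()


if __name__ == '__main__':
    noisy = 'Welcome to the CUDA-Q container!\nSKYPILOT_SSH_USER: sky\n'
    print(extract_ssh_user(noisy))  # sky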
@@ -1236,56 +1520,410 @@ def get_cluster_info(
         # problems for other pods.
         custom_ray_options={
             'object-store-memory': 500000000,
-            'num-cpus':
+            'num-cpus': str_cpus,
         },
         provider_name='kubernetes',
         provider_config=provider_config)


+def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
+    """Get pod termination reason and write to cluster events.
+
+    Checks both pod conditions (for preemption/disruption) and
+    container statuses (for exit codes/errors).
+    """
+    latest_timestamp = pod.status.start_time or datetime.datetime.min
+    ready_state = 'Unknown'
+    termination_reason = 'Terminated unexpectedly'
+    container_reasons = []
+
+    # Check pod status conditions for high level overview.
+    # No need to sort, as each condition.type will only appear once.
+    for condition in pod.status.conditions:
+        reason = condition.reason or 'Unknown reason'
+        message = condition.message or ''
+
+        # Get last known readiness state.
+        if condition.type == 'Ready':
+            ready_state = f'{reason} ({message})' if message else reason
+        # Kueue preemption, as defined in:
+        # https://pkg.go.dev/sigs.k8s.io/kueue/pkg/controller/jobs/pod#pkg-constants
+        elif condition.type == 'TerminationTarget':
+            termination_reason = f'Preempted by Kueue: {reason}'
+            if message:
+                termination_reason += f' ({message})'
+        # Generic disruption.
+        elif condition.type == 'DisruptionTarget':
+            termination_reason = f'Disrupted: {reason}'
+            if message:
+                termination_reason += f' ({message})'
+
+        if condition.last_transition_time is not None:
+            latest_timestamp = max(latest_timestamp,
+                                   condition.last_transition_time)
+
+    pod_reason = (f'{termination_reason}.\n'
+                  f'Last known state: {ready_state}.')
+
+    # Check container statuses for exit codes/errors
+    if pod.status and pod.status.container_statuses:
+        for container_status in pod.status.container_statuses:
+            terminated = container_status.state.terminated
+            if terminated:
+                exit_code = terminated.exit_code
+                reason = terminated.reason
+                if exit_code == 0:
+                    # skip exit 0 (non-failed) just for sanity
+                    logger.debug(f'{pod.metadata.name}/{container_status.name} '
+                                 'had exit code 0. Skipping.')
+                    continue
+                if reason is None:
+                    # just in-case reason is None, have default for debugging
+                    reason = f'exit({exit_code})'
+                container_reasons.append(reason)
+                latest_timestamp = max(latest_timestamp, terminated.finished_at)
+
+    # TODO (kyuds): later, if needed, query `last_state` too.
+
+    # Normally we will have a single container per pod for skypilot
+    # but doing this just in-case there are multiple containers.
+    if container_reasons:
+        pod_reason += f'\nContainer errors: {" | ".join(container_reasons)}'
+
+    global_user_state.add_cluster_event(
+        cluster_name,
+        None,
+        f'[kubernetes pod {pod.metadata.name} terminated] {pod_reason}',
+        global_user_state.ClusterEventType.DEBUG,
+        transitioned_at=int(latest_timestamp.timestamp()),
+    )
+    return pod_reason
+
+
+def _get_pod_events(context: Optional[str], namespace: str,
+                    pod_name: str) -> List[Any]:
+    """Get the events for a pod, sorted by timestamp, most recent first."""
+    pod_field_selector = (
+        f'involvedObject.kind=Pod,involvedObject.name={pod_name}')
+    pod_events = kubernetes.core_api(context).list_namespaced_event(
+        namespace,
+        field_selector=pod_field_selector,
+        _request_timeout=kubernetes.API_TIMEOUT).items
+    return sorted(
+        pod_events,
+        key=lambda event: event.metadata.creation_timestamp,
+        # latest event appears first
+        reverse=True)
+
+
+def _get_pod_pending_reason(context: Optional[str], namespace: str,
+                            pod_name: str) -> Optional[Tuple[str, str]]:
+    """Get the reason why a pod is pending from its events.
+
+    Returns a (reason, message) tuple about why the pod is pending (e.g.,
+    ("FailedMount", "hostPath type check failed")) or None if no reason found.
+    """
+    try:
+        pod_events = _get_pod_events(context, namespace, pod_name)
+    except Exception as e:  # pylint: disable=broad-except
+        logger.debug(f'Failed to get events for pod {pod_name}: {e}')
+        return None
+
+    if not pod_events:
+        return None
+
+    for event in pod_events:
+        # Omit common events that does not indicate a pending reason.
+        # We could also filter by event type 'Warning' or 'Error',
+        # but there might be useful 'Normal' events such as pulling
+        # image that we want to surface to the user.
+        if event.reason not in COMMON_NON_PENDING_EVENT_REASONS:
+            reason = event.reason or 'Unknown'
+            message = event.message or ''
+            return reason, message
+
+    return None
+
+
+def _get_pod_missing_reason(context: Optional[str], namespace: str,
+                            cluster_name: str, pod_name: str) -> Optional[str]:
+    """Get events for missing pod and write to cluster events."""
+    logger.debug(f'Analyzing events for pod {pod_name}')
+    pod_events = _get_pod_events(context, namespace, pod_name)
+    last_scheduled_node = None
+    insert_new_pod_event = True
+    new_event_inserted = False
+    inserted_pod_events = 0
+
+    for event in pod_events:
+        if event.reason == 'Scheduled':
+            pattern = r'Successfully assigned (\S+) to (\S+)'
+            match = re.search(pattern, event.message)
+            if match:
+                scheduled_node = match.group(2)
+                last_scheduled_node = scheduled_node
+        if insert_new_pod_event:
+            # Try inserting the latest events first. If the event is a
+            # duplicate, it means the event (and any previous events) have
+            # already been inserted - so do not insert further events.
+            try:
+                global_user_state.add_cluster_event(
+                    cluster_name,
+                    None, f'[kubernetes pod {pod_name}] '
+                    f'{event.reason} {event.message}',
+                    global_user_state.ClusterEventType.DEBUG,
+                    transitioned_at=int(
+                        event.metadata.creation_timestamp.timestamp()),
+                    expose_duplicate_error=True)
+                logger.debug(f'[pod {pod_name}] encountered new pod event: '
+                             f'{event.metadata.creation_timestamp} '
+                             f'{event.reason} {event.message}')
+            except db_utils.UniqueConstraintViolationError:
+                insert_new_pod_event = False
+            else:
+                new_event_inserted = True
+                inserted_pod_events += 1
+
+    logger.debug(f'[pod {pod_name}] processed {len(pod_events)} pod events and '
+                 f'inserted {inserted_pod_events} new pod events '
+                 'previously unseen')
+
+    if last_scheduled_node is not None:
+        node_field_selector = ('involvedObject.kind=Node,'
+                               f'involvedObject.name={last_scheduled_node}')
+        node_events = kubernetes.core_api(context).list_namespaced_event(
+            namespace,
+            field_selector=node_field_selector,
+            _request_timeout=kubernetes.API_TIMEOUT).items
+        node_events = sorted(
+            node_events,
+            key=lambda event: event.metadata.creation_timestamp,
+            # latest event appears first
+            reverse=True)
+        insert_new_node_event = True
+        inserted_node_events = 0
+        for event in node_events:
+            if insert_new_node_event:
+                # Try inserting the latest events first. If the event is a
+                # duplicate, it means the event (and any previous events) have
+                # already been inserted - so do not insert further events.
+                try:
+                    global_user_state.add_cluster_event(
+                        cluster_name,
+                        None, f'[kubernetes node {last_scheduled_node}] '
+                        f'{event.reason} {event.message}',
+                        global_user_state.ClusterEventType.DEBUG,
+                        transitioned_at=int(
+                            event.metadata.creation_timestamp.timestamp()),
+                        expose_duplicate_error=True)
+                    logger.debug(
+                        f'[pod {pod_name}] encountered new node event: '
+                        f'{event.metadata.creation_timestamp} '
+                        f'{event.reason} {event.message}')
+                except db_utils.UniqueConstraintViolationError:
+                    insert_new_node_event = False
+                else:
+                    new_event_inserted = True
+                    inserted_node_events += 1
+
+        logger.debug(f'[pod {pod_name}: node {last_scheduled_node}] '
+                     f'processed {len(node_events)} node events and '
+                     f'inserted {inserted_node_events} new node events '
+                     'previously unseen')
+    else:
+        logger.debug(f'[pod {pod_name}] could not determine the node '
+                     'the pod was scheduled to')
+
+    if not new_event_inserted:
+        # If new event is not inserted, there is no useful information to
+        # return. Return None.
+        return None
+
+    # Analyze the events for failure
+    failure_reason = None
+    failure_decisiveness = 0
+
+    def _record_failure_reason(reason: str, decisiveness: int):
+        nonlocal failure_reason, failure_decisiveness
+        if decisiveness > failure_decisiveness:
+            failure_reason = reason
+            failure_decisiveness = decisiveness
+
+    cluster_events = global_user_state.get_cluster_events(
+        cluster_name, None, global_user_state.ClusterEventType.DEBUG)
+    for event in cluster_events:
+        if event.startswith('[kubernetes pod'):
+            event = event.split(']')[1].strip()
+        elif event.startswith('[kubernetes node'):
+            event = event.split(']')[1].strip()
+
+        if event.startswith('NodeNotReady '):
+            _record_failure_reason(event[len('NodeNotReady '):], 1)
+        elif event.startswith('TaintManagerEviction '):
+            # usually the event message for TaintManagerEviction is not useful
+            # so we record a more generic message.
+            _record_failure_reason('pod was evicted by taint manager', 2)
+        elif event.startswith('DeletingNode '):
+            _record_failure_reason(event[len('DeletingNode '):], 3)
+    return failure_reason
+
+
+def list_namespaced_pod(context: Optional[str], namespace: str,
+                        cluster_name_on_cloud: str, is_ssh: bool, identity: str,
+                        label_selector: str) -> List[Any]:
+    # Get all the pods with the label skypilot-cluster-name: <cluster_name>
+    try:
+        # log the query parameters we pass to the k8s api
+        logger.debug(f'Querying k8s api for pods:\n'
+                     f'context: {context}\n'
+                     f'namespace: {namespace}\n'
+                     f'label selector:`{label_selector}`.')
+
+        response = kubernetes.core_api(context).list_namespaced_pod(
+            namespace,
+            label_selector=label_selector,
+            _request_timeout=kubernetes.API_TIMEOUT)
+
+        # log PodList response info
+        if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+            logger.debug(f'k8s api response for `{label_selector}`:\n'
+                         f'apiVersion={response.api_version}, '
+                         f'kind={response.kind},\n'
+                         f'metadata={response.metadata}')
+
+        pods = response.items
+
+        # log detailed Pod info
+        if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+            logger.debug(f'k8s api response for `{label_selector}`: '
+                         f'len(pods)={len(pods)}')
+            for pod in pods:
+                logger.debug(f'k8s pod info for `{label_selector}`: '
+                             f'pod.apiVersion={pod.api_version}, '
+                             f'pod.kind={pod.kind}, \n'
+                             f'pod.name={pod.metadata.name}, '
+                             f'pod.namespace={pod.metadata.namespace}, \n'
+                             f'pod.labels={pod.metadata.labels}, \n'
+                             f'pod.annotations={pod.metadata.annotations}, \n'
+                             'pod.creationTimestamp='
+                             f'{pod.metadata.creation_timestamp}, '
+                             'pod.deletionTimestamp='
+                             f'{pod.metadata.deletion_timestamp}, \n'
+                             f'pod.status={pod.status}')
+        return pods
+
+    except kubernetes.max_retry_error():
+        with ux_utils.print_exception_no_traceback():
+            if is_ssh:
+                node_pool = common_utils.removeprefix(context,
+                                                      'ssh-') if context else ''
+                msg = (
+                    f'Cannot connect to SSH Node Pool {node_pool}. '
+                    'Please check if the SSH Node Pool is up and accessible. '
+                    'To debug, run `sky check ssh` to check the status of '
+                    'the SSH Node Pool.')
+            else:
+                ctx = kubernetes_utils.get_current_kube_config_context_name()
+                msg = (f'Network error - check if the {identity} in '
+                       f'context {ctx} is up and accessible.')
+            raise exceptions.ClusterStatusFetchingError(
+                f'Failed to query cluster {cluster_name_on_cloud!r} status. ' +
+                msg) from None
+    except Exception as e:  # pylint: disable=broad-except
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.ClusterStatusFetchingError(
+                f'Failed to query {identity} {cluster_name_on_cloud!r} '
+                f'status: {common_utils.format_exception(e)}')
+
+
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
-    non_terminated_only: bool = True
-
+    non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+    # Mapping from pod phase to skypilot status. These are the only valid pod
+    # phases.
+    # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
     status_map = {
         'Pending': status_lib.ClusterStatus.INIT,
         'Running': status_lib.ClusterStatus.UP,
-        'Failed':
+        'Failed': status_lib.ClusterStatus.INIT,
         'Unknown': None,
         'Succeeded': None,
-        'Terminating': None,
     }

     assert provider_config is not None
     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     context = kubernetes_utils.get_context_from_config(provider_config)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    is_ssh = context.startswith('ssh-') if context else False
+    identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'
+    label_selector = (f'{constants.TAG_SKYPILOT_CLUSTER_NAME}='
+                      f'{cluster_name_on_cloud}')
+
+    attempts = 0
+    pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
+                               is_ssh, identity, label_selector)
+    # When we see no pods returned from the k8s api, we assume the pods have
+    # been terminated by the user directly and mark the cluster as terminated
+    # in the global user state.
+    # We add retry logic here as an attempt to mitigate a leak caused by the
+    # kubernetes api returning no pods despite the pods actually existing.
+    while (retry_if_missing and not pods and
+           attempts < _MAX_QUERY_INSTANCES_RETRIES):
+        logger.debug(f'Retrying to query k8s api for {cluster_name_on_cloud} '
+                     f'{attempts}/{_MAX_QUERY_INSTANCES_RETRIES} times.'
+                     f'after {_QUERY_INSTANCES_RETRY_INTERVAL} seconds.')
+        time.sleep(_QUERY_INSTANCES_RETRY_INTERVAL)
+        attempts += 1
+        pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
+                                   is_ssh, identity, label_selector)
+        if len(pods) > 0:
+            logger.info(f'Found {len(pods)} pods for {label_selector} after'
+                        f'{attempts} retries.')

     # Check if the pods are running or pending
-    cluster_status
+    cluster_status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                                    Optional[str]]] = {}
     for pod in pods:
-
+        phase = pod.status.phase
+        is_terminating = pod.metadata.deletion_timestamp is not None
+        pod_status = status_map[phase]
+        reason = None
+        if phase in ('Failed', 'Unknown') or is_terminating:
+            reason = _get_pod_termination_reason(pod, cluster_name)
+            logger.debug(f'Pod Status ({phase}) Reason(s): {reason}')
         if non_terminated_only and pod_status is None:
+            logger.debug(f'Pod {pod.metadata.name} is terminated, but '
+                         'query_instances is called with '
+                         f'non_terminated_only=True. Phase: {phase}')
             continue
-
+        pod_name = pod.metadata.name
+        reason = f'{pod_name}: {reason}' if reason is not None else None
+        cluster_status[pod_name] = (pod_status, reason)
+
+    # Find the list of pod names that should be there
+    # from k8s services. Filter duplicates as -ssh service
+    # creates a duplicate entry.
+    target_pod_names = list(
+        set([
+            service['spec']['selector']['component']
+            for service in provider_config.get('services', [])
+        ]))
+
+    for target_pod_name in target_pod_names:
+        if target_pod_name not in cluster_status:
+            # If the pod is not in the cluster_status, it means it's not
+            # running.
+            # Analyze what happened to the pod based on events.
+            reason = _get_pod_missing_reason(context, namespace, cluster_name,
+                                             target_pod_name)
+            reason = (f'{target_pod_name}: {reason}'
+                      if reason is not None else None)
+            if not non_terminated_only:
+                cluster_status[target_pod_name] = (None, reason)
+
     return cluster_status


|
|
|
1307
1945
|
|
|
1308
1946
|
# Try to get deployment name from label first
|
|
1309
1947
|
head_instance_info = instances[pod_name][0]
|
|
1310
|
-
deployment = head_instance_info.tags.get(
|
|
1948
|
+
deployment = head_instance_info.tags.get(
|
|
1949
|
+
k8s_constants.TAG_SKYPILOT_DEPLOYMENT_NAME)
|
|
1311
1950
|
|
|
1312
1951
|
node_list = [((namespace, context), pod_name)]
|
|
1313
1952
|
head_runner = command_runner.KubernetesCommandRunner(
|