skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of skypilot-nightly has been flagged by the registry as possibly problematic.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
|
@@ -1,11 +1,14 @@
|
|
|
1
1
|
"""Kubernetes instance provisioning."""
|
|
2
2
|
import copy
|
|
3
|
+
import datetime
|
|
3
4
|
import json
|
|
5
|
+
import re
|
|
6
|
+
import sys
|
|
4
7
|
import time
|
|
5
|
-
from typing import Any,
|
|
6
|
-
import uuid
|
|
8
|
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
|
7
9
|
|
|
8
10
|
from sky import exceptions
|
|
11
|
+
from sky import global_user_state
|
|
9
12
|
from sky import sky_logging
|
|
10
13
|
from sky import skypilot_config
|
|
11
14
|
from sky.adaptors import kubernetes
|
|
@@ -13,31 +16,36 @@ from sky.provision import common
|
|
|
13
16
|
from sky.provision import constants
|
|
14
17
|
from sky.provision import docker_utils
|
|
15
18
|
from sky.provision.kubernetes import config as config_lib
|
|
16
|
-
from sky.provision.kubernetes import
|
|
19
|
+
from sky.provision.kubernetes import constants as k8s_constants
|
|
17
20
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
21
|
+
from sky.provision.kubernetes import volume
|
|
18
22
|
from sky.utils import command_runner
|
|
19
23
|
from sky.utils import common_utils
|
|
20
24
|
from sky.utils import config_utils
|
|
21
25
|
from sky.utils import kubernetes_enums
|
|
26
|
+
from sky.utils import rich_utils
|
|
22
27
|
from sky.utils import status_lib
|
|
23
28
|
from sky.utils import subprocess_utils
|
|
24
29
|
from sky.utils import timeline
|
|
25
30
|
from sky.utils import ux_utils
|
|
31
|
+
from sky.utils.db import db_utils
|
|
26
32
|
|
|
27
33
|
POLL_INTERVAL = 2
|
|
28
34
|
_TIMEOUT_FOR_POD_TERMINATION = 60 # 1 minutes
|
|
29
35
|
_MAX_RETRIES = 3
|
|
36
|
+
_MAX_MISSING_PODS_RETRIES = 5
|
|
37
|
+
_MAX_QUERY_INSTANCES_RETRIES = 5
|
|
38
|
+
_QUERY_INSTANCES_RETRY_INTERVAL = .5
|
|
30
39
|
_NUM_THREADS = subprocess_utils.get_parallel_threads('kubernetes')
|
|
31
40
|
|
|
41
|
+
# Pattern to extract SSH user from command output, handling MOTD contamination
|
|
42
|
+
_SSH_USER_PATTERN = re.compile(r'SKYPILOT_SSH_USER: ([^\s\n]+)')
|
|
43
|
+
|
|
32
44
|
logger = sky_logging.init_logger(__name__)
|
|
33
|
-
TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
|
|
34
|
-
TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
|
|
35
|
-
TAG_POD_INITIALIZED = 'skypilot-initialized'
|
|
36
|
-
TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name'
|
|
37
45
|
|
|
38
46
|
|
|
39
47
|
def ray_tag_filter(cluster_name: str) -> Dict[str, str]:
|
|
40
|
-
return {TAG_RAY_CLUSTER_NAME: cluster_name}
|
|
48
|
+
return {k8s_constants.TAG_RAY_CLUSTER_NAME: cluster_name}
|
|
41
49
|
|
|
42
50
|
|
|
43
51
|
def _is_head(pod) -> bool:
|
|
@@ -67,12 +75,16 @@ def is_high_availability_cluster_by_kubectl(
|
|
|
67
75
|
namespace: Optional[str] = None) -> bool:
|
|
68
76
|
"""Check if a cluster is a high availability controller by calling
|
|
69
77
|
`kubectl get deployment`.
|
|
78
|
+
|
|
79
|
+
The deployment must have the label `skypilot-cluster-name` set to
|
|
80
|
+
`cluster_name`.
|
|
70
81
|
"""
|
|
71
82
|
try:
|
|
72
83
|
deployment_list = kubernetes.apps_api(
|
|
73
84
|
context).list_namespaced_deployment(
|
|
74
85
|
namespace,
|
|
75
|
-
label_selector=
|
|
86
|
+
label_selector=
|
|
87
|
+
f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
|
|
76
88
|
except kubernetes.api_exception():
|
|
77
89
|
return False
|
|
78
90
|
# It is a high availability cluster if there is at least one deployment
|
|
@@ -186,14 +198,20 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
|
|
|
186
198
|
break
|
|
187
199
|
if event_message is not None:
|
|
188
200
|
if pod_status == 'Pending':
|
|
189
|
-
|
|
201
|
+
out_of = {}
|
|
202
|
+
# key: resource name, value: (extra message, nice name)
|
|
190
203
|
if 'Insufficient cpu' in event_message:
|
|
191
|
-
|
|
192
|
-
|
|
204
|
+
out_of['CPU'] = (': Run \'kubectl get nodes -o '
|
|
205
|
+
'custom-columns=NAME:.metadata.name,'
|
|
206
|
+
'CPU:.status.allocatable.cpu\' to check '
|
|
207
|
+
'the available CPUs on the node.', 'CPUs')
|
|
193
208
|
if 'Insufficient memory' in event_message:
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
209
|
+
out_of['memory'] = (': Run \'kubectl get nodes -o '
|
|
210
|
+
'custom-columns=NAME:.metadata.name,'
|
|
211
|
+
'MEMORY:.status.allocatable.memory\' '
|
|
212
|
+
'to check the available memory on the '
|
|
213
|
+
'node.', 'Memory')
|
|
214
|
+
|
|
197
215
|
# TODO(aylei): after switching from smarter-device-manager to
|
|
198
216
|
# fusermount-server, we need a new way to check whether the
|
|
199
217
|
# fusermount-server daemonset is ready.
|
|
@@ -201,43 +219,79 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
|
|
|
201
219
|
key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
|
|
202
220
|
for key in lf.get_label_keys()
|
|
203
221
|
]
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
222
|
+
for label_key in gpu_lf_keys:
|
|
223
|
+
# TODO(romilb): We may have additional node
|
|
224
|
+
# affinity selectors in the future - in that
|
|
225
|
+
# case we will need to update this logic.
|
|
226
|
+
# TODO(Doyoung): Update the error message raised
|
|
227
|
+
# with the multi-host TPU support.
|
|
228
|
+
gpu_resource_key = kubernetes_utils.get_gpu_resource_key(
|
|
229
|
+
context) # pylint: disable=line-too-long
|
|
230
|
+
if ((f'Insufficient {gpu_resource_key}' in event_message) or
|
|
231
|
+
('didn\'t match Pod\'s node affinity/selector'
|
|
232
|
+
in event_message) and pod.spec.node_selector):
|
|
233
|
+
if 'gpu' in gpu_resource_key.lower():
|
|
234
|
+
info_msg = (
|
|
235
|
+
': Run \'sky show-gpus --infra kubernetes\' to '
|
|
236
|
+
'see the available GPUs.')
|
|
237
|
+
else:
|
|
238
|
+
info_msg = ': '
|
|
239
|
+
if (pod.spec.node_selector and
|
|
240
|
+
label_key in pod.spec.node_selector):
|
|
241
|
+
extra_msg = (
|
|
242
|
+
f'Verify if any node matching label '
|
|
243
|
+
f'{pod.spec.node_selector[label_key]} and '
|
|
244
|
+
f'sufficient resource {gpu_resource_key} '
|
|
245
|
+
f'is available in the cluster.')
|
|
246
|
+
extra_msg = info_msg + ' ' + extra_msg
|
|
247
|
+
else:
|
|
248
|
+
extra_msg = info_msg
|
|
249
|
+
if gpu_resource_key not in out_of or len(
|
|
250
|
+
out_of[gpu_resource_key][0]) < len(extra_msg):
|
|
251
|
+
out_of[f'{gpu_resource_key}'] = (extra_msg, 'GPUs')
|
|
252
|
+
|
|
253
|
+
if len(out_of) > 0:
|
|
254
|
+
# We are out of some resources. We should raise an error.
|
|
255
|
+
rsrc_err_msg = 'Insufficient resource capacity on the '
|
|
256
|
+
rsrc_err_msg += 'cluster:\n'
|
|
257
|
+
out_of_keys = list(out_of.keys())
|
|
258
|
+
for i in range(len(out_of_keys)):
|
|
259
|
+
rsrc = out_of_keys[i]
|
|
260
|
+
(extra_msg, nice_name) = out_of[rsrc]
|
|
261
|
+
extra_msg = extra_msg if extra_msg else ''
|
|
262
|
+
if i == len(out_of_keys) - 1:
|
|
263
|
+
indent = '└──'
|
|
264
|
+
else:
|
|
265
|
+
indent = '├──'
|
|
266
|
+
rsrc_err_msg += (f'{indent} Cluster does not have '
|
|
267
|
+
f'sufficient {nice_name} for your request'
|
|
268
|
+
f'{extra_msg}')
|
|
269
|
+
if i != len(out_of_keys) - 1:
|
|
270
|
+
rsrc_err_msg += '\n'
|
|
271
|
+
|
|
272
|
+
# Emit the error message without logging prefixes for better UX.
|
|
273
|
+
tmp_handler = sky_logging.EnvAwareHandler(sys.stdout)
|
|
274
|
+
tmp_handler.flush = sys.stdout.flush
|
|
275
|
+
tmp_handler.setFormatter(sky_logging.NO_PREFIX_FORMATTER)
|
|
276
|
+
tmp_handler.setLevel(sky_logging.ERROR)
|
|
277
|
+
prev_propagate = logger.propagate
|
|
278
|
+
try:
|
|
279
|
+
logger.addHandler(tmp_handler)
|
|
280
|
+
logger.propagate = False
|
|
281
|
+
logger.error(ux_utils.error_message(f'{rsrc_err_msg}'))
|
|
282
|
+
finally:
|
|
283
|
+
logger.removeHandler(tmp_handler)
|
|
284
|
+
logger.propagate = prev_propagate
|
|
285
|
+
nice_names = [out_of[rsrc][1] for rsrc in out_of_keys]
|
|
286
|
+
raise config_lib.KubernetesError(
|
|
287
|
+
f'{timeout_err_msg} '
|
|
288
|
+
f'Pod status: {pod_status} '
|
|
289
|
+
f'Details: \'{event_message}\' ',
|
|
290
|
+
insufficent_resources=nice_names,
|
|
291
|
+
)
|
|
292
|
+
|
|
239
293
|
raise config_lib.KubernetesError(f'{timeout_err_msg} '
|
|
240
|
-
f'Pod status: {pod_status}'
|
|
294
|
+
f'Pod status: {pod_status} '
|
|
241
295
|
f'Details: \'{event_message}\' ')
|
|
242
296
|
raise config_lib.KubernetesError(f'{timeout_err_msg}')
|
|
243
297
|
|
|
@@ -251,8 +305,89 @@ def _raise_command_running_error(message: str, command: str, pod_name: str,
|
|
|
251
305
|
f'code {rc}: {command!r}\nOutput: {stdout}.')
|
|
252
306
|
|
|
253
307
|
|
|
308
|
+
def _detect_cluster_event_reason_occurred(namespace, context, search_start,
|
|
309
|
+
reason) -> bool:
|
|
310
|
+
|
|
311
|
+
def _convert_to_utc(timestamp):
|
|
312
|
+
if timestamp.tzinfo is None:
|
|
313
|
+
return timestamp.replace(tzinfo=datetime.timezone.utc)
|
|
314
|
+
return timestamp.astimezone(datetime.timezone.utc)
|
|
315
|
+
|
|
316
|
+
def _get_event_timestamp(event):
|
|
317
|
+
if event.last_timestamp:
|
|
318
|
+
return event.last_timestamp
|
|
319
|
+
elif event.metadata.creation_timestamp:
|
|
320
|
+
return event.metadata.creation_timestamp
|
|
321
|
+
return None
|
|
322
|
+
|
|
323
|
+
events = kubernetes.core_api(context).list_namespaced_event(
|
|
324
|
+
namespace=namespace, field_selector=f'reason={reason}')
|
|
325
|
+
for event in events.items:
|
|
326
|
+
ts = _get_event_timestamp(event)
|
|
327
|
+
if ts and _convert_to_utc(ts) > search_start:
|
|
328
|
+
return True
|
|
329
|
+
return False
|
|
330
|
+
|
|
331
|
+
|
|
332
|
+
def _cluster_had_autoscale_event(namespace, context, search_start) -> bool:
|
|
333
|
+
"""Detects whether the cluster had a autoscaling event after a
|
|
334
|
+
specified datetime. This only works when using cluster-autoscaler.
|
|
335
|
+
|
|
336
|
+
Args:
|
|
337
|
+
namespace: kubernetes namespace
|
|
338
|
+
context: kubernetes context
|
|
339
|
+
search_start (datetime.datetime): filter for events that occurred
|
|
340
|
+
after search_start
|
|
341
|
+
|
|
342
|
+
Returns:
|
|
343
|
+
A boolean whether the cluster has an autoscaling event or not.
|
|
344
|
+
"""
|
|
345
|
+
assert namespace is not None
|
|
346
|
+
|
|
347
|
+
try:
|
|
348
|
+
return _detect_cluster_event_reason_occurred(namespace, context,
|
|
349
|
+
search_start,
|
|
350
|
+
'TriggeredScaleUp')
|
|
351
|
+
except Exception as e: # pylint: disable=broad-except
|
|
352
|
+
logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
|
|
353
|
+
return False
|
|
354
|
+
|
|
355
|
+
|
|
356
|
+
def _cluster_maybe_autoscaling(namespace, context, search_start) -> bool:
|
|
357
|
+
"""Detects whether a kubernetes cluster may have an autoscaling event.
|
|
358
|
+
|
|
359
|
+
This is not a definitive detection. FailedScheduling, which is an
|
|
360
|
+
event that can occur when not enough resources are present in the cluster,
|
|
361
|
+
which is a trigger for cluster autoscaling. However, FailedScheduling may
|
|
362
|
+
have occurred due to other reasons (cluster itself is abnormal).
|
|
363
|
+
|
|
364
|
+
Hence, this should only be used for autoscalers that don't emit the
|
|
365
|
+
TriggeredScaleUp event, e.g.: Karpenter.
|
|
366
|
+
|
|
367
|
+
Args:
|
|
368
|
+
namespace: kubernetes namespace
|
|
369
|
+
context: kubernetes context
|
|
370
|
+
search_start (datetime.datetime): filter for events that occurred
|
|
371
|
+
after search_start
|
|
372
|
+
|
|
373
|
+
Returns:
|
|
374
|
+
A boolean whether the cluster has an autoscaling event or not.
|
|
375
|
+
"""
|
|
376
|
+
assert namespace is not None
|
|
377
|
+
|
|
378
|
+
try:
|
|
379
|
+
return _detect_cluster_event_reason_occurred(namespace, context,
|
|
380
|
+
search_start,
|
|
381
|
+
'FailedScheduling')
|
|
382
|
+
except Exception as e: # pylint: disable=broad-except
|
|
383
|
+
logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
|
|
384
|
+
return False
|
|
385
|
+
|
|
386
|
+
|
|
254
387
|
@timeline.event
|
|
255
|
-
def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int
|
|
388
|
+
def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,
|
|
389
|
+
cluster_name: str,
|
|
390
|
+
create_pods_start: datetime.datetime):
|
|
256
391
|
"""Wait for all pods to be scheduled.
|
|
257
392
|
|
|
258
393
|
Wait for all pods including jump pod to be scheduled, and if it
|
|
@@ -261,6 +396,9 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
261
396
|
allocated and we can exit.
|
|
262
397
|
|
|
263
398
|
If timeout is set to a negative value, this method will wait indefinitely.
|
|
399
|
+
|
|
400
|
+
Will update the spinner message to indicate autoscaling if autoscaling
|
|
401
|
+
is happening.
|
|
264
402
|
"""
|
|
265
403
|
# Create a set of pod names we're waiting for
|
|
266
404
|
if not new_nodes:
|
|
@@ -268,6 +406,18 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
268
406
|
expected_pod_names = {node.metadata.name for node in new_nodes}
|
|
269
407
|
start_time = time.time()
|
|
270
408
|
|
|
409
|
+
# Variables for autoscaler detection
|
|
410
|
+
autoscaler_type = skypilot_config.get_effective_region_config(
|
|
411
|
+
cloud='kubernetes',
|
|
412
|
+
region=context,
|
|
413
|
+
keys=('autoscaler',),
|
|
414
|
+
default_value=None)
|
|
415
|
+
autoscaler_is_set = autoscaler_type is not None
|
|
416
|
+
use_heuristic_detection = (autoscaler_is_set and
|
|
417
|
+
not kubernetes_enums.KubernetesAutoscalerType(
|
|
418
|
+
autoscaler_type).emits_autoscale_event())
|
|
419
|
+
is_autoscaling = False
|
|
420
|
+
|
|
271
421
|
def _evaluate_timeout() -> bool:
|
|
272
422
|
# If timeout is negative, retry indefinitely.
|
|
273
423
|
if timeout < 0:
|
|
@@ -277,10 +427,13 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
277
427
|
while _evaluate_timeout():
|
|
278
428
|
# Get all pods in a single API call using the cluster name label
|
|
279
429
|
# which all pods in new_nodes should share
|
|
280
|
-
|
|
430
|
+
cluster_name_on_cloud = new_nodes[0].metadata.labels[
|
|
431
|
+
constants.TAG_SKYPILOT_CLUSTER_NAME]
|
|
281
432
|
pods = kubernetes.core_api(context).list_namespaced_pod(
|
|
282
433
|
namespace,
|
|
283
|
-
label_selector=
|
|
434
|
+
label_selector=
|
|
435
|
+
f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
|
|
436
|
+
).items
|
|
284
437
|
|
|
285
438
|
# Get the set of found pod names and check if we have all expected pods
|
|
286
439
|
found_pod_names = {pod.metadata.name for pod in pods}
|
|
@@ -304,6 +457,26 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
304
457
|
|
|
305
458
|
if all_scheduled:
|
|
306
459
|
return
|
|
460
|
+
|
|
461
|
+
# Check if cluster is autoscaling and update spinner message.
|
|
462
|
+
# Minor optimization to not query k8s api after autoscaling
|
|
463
|
+
# event was detected. This is useful because there isn't any
|
|
464
|
+
# autoscaling complete event.
|
|
465
|
+
if autoscaler_is_set and not is_autoscaling:
|
|
466
|
+
if use_heuristic_detection:
|
|
467
|
+
is_autoscaling = _cluster_maybe_autoscaling(
|
|
468
|
+
namespace, context, create_pods_start)
|
|
469
|
+
msg = 'Kubernetes cluster may be scaling up'
|
|
470
|
+
else:
|
|
471
|
+
is_autoscaling = _cluster_had_autoscale_event(
|
|
472
|
+
namespace, context, create_pods_start)
|
|
473
|
+
msg = 'Kubernetes cluster is autoscaling'
|
|
474
|
+
|
|
475
|
+
if is_autoscaling:
|
|
476
|
+
rich_utils.force_update_status(
|
|
477
|
+
ux_utils.spinner_message(f'Launching ({msg})',
|
|
478
|
+
cluster_name=cluster_name))
|
|
479
|
+
|
|
307
480
|
time.sleep(1)
|
|
308
481
|
|
|
309
482
|
# Handle pod scheduling errors
|
|
@@ -319,17 +492,17 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
|
|
|
319
492
|
|
|
320
493
|
|
|
321
494
|
@timeline.event
|
|
322
|
-
def _wait_for_pods_to_run(namespace, context,
|
|
495
|
+
def _wait_for_pods_to_run(namespace, context, cluster_name, new_pods):
|
|
323
496
|
"""Wait for pods and their containers to be ready.
|
|
324
497
|
|
|
325
498
|
Pods may be pulling images or may be in the process of container
|
|
326
499
|
creation.
|
|
327
500
|
"""
|
|
328
|
-
if not
|
|
501
|
+
if not new_pods:
|
|
329
502
|
return
|
|
330
503
|
|
|
331
504
|
# Create a set of pod names we're waiting for
|
|
332
|
-
expected_pod_names = {
|
|
505
|
+
expected_pod_names = {pod.metadata.name for pod in new_pods}
|
|
333
506
|
|
|
334
507
|
def _check_init_containers(pod):
|
|
335
508
|
# Check if any of the init containers failed
|
|
@@ -356,26 +529,62 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
|
|
|
356
529
|
'Failed to create init container for pod '
|
|
357
530
|
f'{pod.metadata.name}. Error details: {msg}.')
|
|
358
531
|
|
|
532
|
+
missing_pods_retry = 0
|
|
359
533
|
while True:
|
|
360
534
|
# Get all pods in a single API call
|
|
361
|
-
|
|
535
|
+
cluster_name_on_cloud = new_pods[0].metadata.labels[
|
|
536
|
+
constants.TAG_SKYPILOT_CLUSTER_NAME]
|
|
362
537
|
all_pods = kubernetes.core_api(context).list_namespaced_pod(
|
|
363
538
|
namespace,
|
|
364
|
-
label_selector=
|
|
539
|
+
label_selector=
|
|
540
|
+
f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
|
|
541
|
+
).items
|
|
365
542
|
|
|
366
543
|
# Get the set of found pod names and check if we have all expected pods
|
|
367
544
|
found_pod_names = {pod.metadata.name for pod in all_pods}
|
|
368
|
-
|
|
369
|
-
if
|
|
545
|
+
missing_pod_names = expected_pod_names - found_pod_names
|
|
546
|
+
if missing_pod_names:
|
|
547
|
+
# In _wait_for_pods_to_schedule, we already wait for all pods to go
|
|
548
|
+
# from pending to scheduled. So if a pod is missing here, it means
|
|
549
|
+
# something unusual must have happened, and so should be treated as
|
|
550
|
+
# an exception.
|
|
551
|
+
# It is also only in _wait_for_pods_to_schedule that
|
|
552
|
+
# provision_timeout is used.
|
|
553
|
+
# TODO(kevin): Should we take provision_timeout into account here,
|
|
554
|
+
# instead of hardcoding the number of retries?
|
|
555
|
+
if missing_pods_retry >= _MAX_MISSING_PODS_RETRIES:
|
|
556
|
+
for pod_name in missing_pod_names:
|
|
557
|
+
reason = _get_pod_missing_reason(context, namespace,
|
|
558
|
+
cluster_name, pod_name)
|
|
559
|
+
logger.warning(f'Pod {pod_name} missing: {reason}')
|
|
560
|
+
raise config_lib.KubernetesError(
|
|
561
|
+
f'Failed to get all pods after {missing_pods_retry} '
|
|
562
|
+
f'retries. Some pods may have been terminated or failed '
|
|
563
|
+
f'unexpectedly. Run `sky logs --provision {cluster_name}` '
|
|
564
|
+
'for more details.')
|
|
370
565
|
logger.info('Retrying running pods check: '
|
|
371
|
-
f'Missing pods: {
|
|
566
|
+
f'Missing pods: {missing_pod_names}')
|
|
372
567
|
time.sleep(0.5)
|
|
568
|
+
missing_pods_retry += 1
|
|
373
569
|
continue
|
|
374
570
|
|
|
375
571
|
all_pods_running = True
|
|
376
572
|
for pod in all_pods:
|
|
377
573
|
if pod.metadata.name not in expected_pod_names:
|
|
378
574
|
continue
|
|
575
|
+
|
|
576
|
+
# Check if pod is terminated/preempted/failed.
|
|
577
|
+
if (pod.metadata.deletion_timestamp is not None or
|
|
578
|
+
pod.status.phase == 'Failed'):
|
|
579
|
+
# Get the reason and write to cluster events before
|
|
580
|
+
# the pod gets completely deleted from the API.
|
|
581
|
+
reason = _get_pod_termination_reason(pod, cluster_name)
|
|
582
|
+
logger.warning(f'Pod {pod.metadata.name} terminated: {reason}')
|
|
583
|
+
raise config_lib.KubernetesError(
|
|
584
|
+
f'Pod {pod.metadata.name} has terminated or failed '
|
|
585
|
+
f'unexpectedly. Run `sky logs --provision {cluster_name}` '
|
|
586
|
+
'for more details.')
|
|
587
|
+
|
|
379
588
|
# Continue if pod and all the containers within the
|
|
380
589
|
# pod are successfully created and running.
|
|
381
590
|
if pod.status.phase == 'Running' and all(
|
|
@@ -411,31 +620,6 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
|
|
|
411
620
|
time.sleep(1)
|
|
412
621
|
|
|
413
622
|
|
|
414
|
-
def _run_function_with_retries(func: Callable,
|
|
415
|
-
operation_name: str,
|
|
416
|
-
max_retries: int = _MAX_RETRIES,
|
|
417
|
-
retry_delay: int = 5) -> Any:
|
|
418
|
-
"""Runs a function with retries on Kubernetes errors.
|
|
419
|
-
Args:
|
|
420
|
-
func: Function to retry
|
|
421
|
-
operation_name: Name of the operation for logging
|
|
422
|
-
max_retries: Maximum number of retry attempts
|
|
423
|
-
retry_delay: Delay between retries in seconds
|
|
424
|
-
Raises:
|
|
425
|
-
The last exception encountered if all retries fail.
|
|
426
|
-
"""
|
|
427
|
-
for attempt in range(max_retries + 1):
|
|
428
|
-
try:
|
|
429
|
-
return func()
|
|
430
|
-
except config_lib.KubernetesError:
|
|
431
|
-
if attempt < max_retries:
|
|
432
|
-
logger.warning(f'Failed to {operation_name} - '
|
|
433
|
-
f'retrying in {retry_delay} seconds.')
|
|
434
|
-
time.sleep(retry_delay)
|
|
435
|
-
else:
|
|
436
|
-
raise
|
|
437
|
-
|
|
438
|
-
|
|
439
623
|
@timeline.event
|
|
440
624
|
def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
|
|
441
625
|
"""Pre-initialization step for SkyPilot pods.
|
|
@@ -670,26 +854,11 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
|
|
|
670
854
|
raise e
|
|
671
855
|
|
|
672
856
|
|
|
673
|
-
def _create_persistent_volume_claim(namespace: str, context: Optional[str],
|
|
674
|
-
pvc_spec: Dict[str, Any]) -> None:
|
|
675
|
-
"""Creates a persistent volume claim for SkyServe controller."""
|
|
676
|
-
try:
|
|
677
|
-
kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
|
|
678
|
-
name=pvc_spec['metadata']['name'], namespace=namespace)
|
|
679
|
-
return
|
|
680
|
-
except kubernetes.api_exception() as e:
|
|
681
|
-
if e.status != 404: # Not found
|
|
682
|
-
raise
|
|
683
|
-
|
|
684
|
-
kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
|
|
685
|
-
namespace=namespace, body=pvc_spec)
|
|
686
|
-
|
|
687
|
-
|
|
688
857
|
@timeline.event
|
|
689
858
|
def _wait_for_deployment_pod(context,
|
|
690
859
|
namespace,
|
|
691
860
|
deployment,
|
|
692
|
-
timeout=
|
|
861
|
+
timeout=300) -> List:
|
|
693
862
|
label_selector = ','.join([
|
|
694
863
|
f'{key}={value}'
|
|
695
864
|
for key, value in deployment.spec.selector.match_labels.items()
|
|
@@ -721,13 +890,14 @@ def _wait_for_deployment_pod(context,
|
|
|
721
890
|
|
|
722
891
|
|
|
723
892
|
@timeline.event
|
|
724
|
-
def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
893
|
+
def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
725
894
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
726
895
|
"""Create pods based on the config."""
|
|
727
896
|
provider_config = config.provider_config
|
|
728
897
|
namespace = kubernetes_utils.get_namespace_from_config(provider_config)
|
|
729
898
|
context = kubernetes_utils.get_context_from_config(provider_config)
|
|
730
899
|
pod_spec = copy.deepcopy(config.node_config)
|
|
900
|
+
create_pods_start = datetime.datetime.now(datetime.timezone.utc)
|
|
731
901
|
|
|
732
902
|
to_create_deployment = 'deployment_spec' in pod_spec
|
|
733
903
|
if to_create_deployment:
|
|
@@ -744,7 +914,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
744
914
|
else:
|
|
745
915
|
pod_spec['metadata']['labels'] = tags
|
|
746
916
|
pod_spec['metadata']['labels'].update(
|
|
747
|
-
{TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
|
|
917
|
+
{constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
|
|
748
918
|
|
|
749
919
|
terminating_pods = kubernetes_utils.filter_pods(namespace, context, tags,
|
|
750
920
|
['Terminating'])
|
|
@@ -776,8 +946,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
776
946
|
running_pods = kubernetes_utils.filter_pods(namespace, context, tags,
|
|
777
947
|
['Pending', 'Running'])
|
|
778
948
|
head_pod_name = _get_head_pod_name(running_pods)
|
|
949
|
+
running_pod_statuses = [{
|
|
950
|
+
pod.metadata.name: pod.status.phase
|
|
951
|
+
} for pod in running_pods.values()]
|
|
779
952
|
logger.debug(f'Found {len(running_pods)} existing pods: '
|
|
780
|
-
f'{
|
|
953
|
+
f'{running_pod_statuses}')
|
|
781
954
|
|
|
782
955
|
to_start_count = config.count - len(running_pods)
|
|
783
956
|
if to_start_count < 0:
|
|
@@ -793,7 +966,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
793
966
|
nvidia_runtime_exists = False
|
|
794
967
|
try:
|
|
795
968
|
nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class(
|
|
796
|
-
context)
|
|
969
|
+
context=context)
|
|
797
970
|
except kubernetes.kubernetes.client.ApiException as e:
|
|
798
971
|
logger.warning('run_instances: Error occurred while checking for '
|
|
799
972
|
f'nvidia RuntimeClass - '
|
|
@@ -804,14 +977,18 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
804
977
|
'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html') # pylint: disable=line-too-long
|
|
805
978
|
|
|
806
979
|
needs_gpus = False
|
|
980
|
+
needs_gpus_nvidia = False
|
|
807
981
|
limits = pod_spec['spec']['containers'][0].get('resources',
|
|
808
982
|
{}).get('limits')
|
|
809
983
|
if limits is not None:
|
|
810
|
-
needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(),
|
|
984
|
+
needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(context),
|
|
985
|
+
0) > 0
|
|
986
|
+
needs_gpus_nvidia = limits.get(
|
|
987
|
+
kubernetes_utils.SUPPORTED_GPU_RESOURCE_KEYS['nvidia'], 0) > 0
|
|
811
988
|
|
|
812
989
|
# TPU pods provisioned on GKE use the default containerd runtime.
|
|
813
990
|
# Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview # pylint: disable=line-too-long
|
|
814
|
-
if nvidia_runtime_exists and
|
|
991
|
+
if nvidia_runtime_exists and needs_gpus_nvidia:
|
|
815
992
|
pod_spec['spec']['runtimeClassName'] = 'nvidia'
|
|
816
993
|
|
|
817
994
|
logger.debug(f'run_instances: calling create_namespaced_pod '
|
|
@@ -819,19 +996,46 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
819
996
|
|
|
820
997
|
def _create_resource_thread(i: int):
|
|
821
998
|
pod_spec_copy = copy.deepcopy(pod_spec)
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
999
|
+
# 0 is for head pod, while 1+ is for worker pods.
|
|
1000
|
+
if i == 0:
|
|
1001
|
+
if head_pod_name is None:
|
|
1002
|
+
# First pod should be head if no head exists
|
|
1003
|
+
pod_spec_copy['metadata']['labels'].update(
|
|
1004
|
+
constants.HEAD_NODE_TAGS)
|
|
1005
|
+
head_selector = _head_service_selector(cluster_name_on_cloud)
|
|
1006
|
+
pod_spec_copy['metadata']['labels'].update(head_selector)
|
|
1007
|
+
pod_spec_copy['metadata'][
|
|
1008
|
+
'name'] = f'{cluster_name_on_cloud}-head'
|
|
1009
|
+
else:
|
|
1010
|
+
# If head pod already exists, we skip creating it.
|
|
1011
|
+
return
|
|
828
1012
|
else:
|
|
829
1013
|
# Worker pods
|
|
830
1014
|
pod_spec_copy['metadata']['labels'].update(
|
|
831
1015
|
constants.WORKER_NODE_TAGS)
|
|
832
|
-
|
|
833
|
-
pod_name
|
|
834
|
-
|
|
1016
|
+
pod_name = f'{cluster_name_on_cloud}-worker{i}'
|
|
1017
|
+
if pod_name in running_pods:
|
|
1018
|
+
# If the pod is already running, we skip creating it.
|
|
1019
|
+
return
|
|
1020
|
+
pod_spec_copy['metadata']['name'] = pod_name
|
|
1021
|
+
pod_spec_copy['metadata']['labels']['component'] = pod_name
|
|
1022
|
+
|
|
1023
|
+
# We need to keep the following fields in the pod spec to be same for
|
|
1024
|
+
# head and worker pods.
|
|
1025
|
+
# So that Kueue can merge them into a single PodSet when creating
|
|
1026
|
+
# ProvisioningRequest to trigger scale up of the cluster autoscaler,
|
|
1027
|
+
# this is especially required for DWS queued provisioning mode in GKE.
|
|
1028
|
+
# spec.containers[*].resources.requests
|
|
1029
|
+
# spec.initContainers[*].resources.requests
|
|
1030
|
+
# spec.resources
|
|
1031
|
+
# spec.nodeSelector
|
|
1032
|
+
# spec.tolerations
|
|
1033
|
+
# spec.affinity
|
|
1034
|
+
# resourceClaims
|
|
1035
|
+
# Refer to the following links for more details:
|
|
1036
|
+
# https://cloud.google.com/kubernetes-engine/docs/how-to/provisioningrequest#define_a_provisioningrequest_object # pylint: disable=line-too-long
|
|
1037
|
+
# https://kueue.sigs.k8s.io/docs/admission-check-controllers/provisioning/#podset-merge-policy # pylint: disable=line-too-long
|
|
1038
|
+
if config.count > 1:
|
|
835
1039
|
# For multi-node support, we put a soft-constraint to schedule
|
|
836
1040
|
# worker pods on different nodes than the head pod.
|
|
837
1041
|
# This is not set as a hard constraint because if different nodes
|
|
@@ -850,7 +1054,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
850
1054
|
'podAffinityTerm': {
|
|
851
1055
|
'labelSelector': {
|
|
852
1056
|
'matchExpressions': [{
|
|
853
|
-
'key': TAG_SKYPILOT_CLUSTER_NAME,
|
|
1057
|
+
'key': constants.TAG_SKYPILOT_CLUSTER_NAME,
|
|
854
1058
|
'operator': 'In',
|
|
855
1059
|
'values': [cluster_name_on_cloud]
|
|
856
1060
|
}]
|
|
@@ -883,9 +1087,25 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
883
1087
|
pod_spec_copy['spec']['tolerations'] = existing_tolerations + [
|
|
884
1088
|
tpu_toleration
|
|
885
1089
|
]
|
|
1090
|
+
# Add GPU toleration if GPU is requested.
|
|
1091
|
+
# The nodes provisioned by DWS with flex start with queued provisioning
|
|
1092
|
+
# mode have the GPU taint, so we have to add the GPU toleration.
|
|
1093
|
+
# No need to check if DWS is enabled here since this has no side effect
|
|
1094
|
+
# to the non-DWS case.
|
|
1095
|
+
if needs_gpus:
|
|
1096
|
+
gpu_toleration = {
|
|
1097
|
+
'key': kubernetes_utils.get_gpu_resource_key(context),
|
|
1098
|
+
'operator': 'Exists',
|
|
1099
|
+
'effect': 'NoSchedule'
|
|
1100
|
+
}
|
|
1101
|
+
# Preserve existing tolerations if any
|
|
1102
|
+
existing_tolerations = pod_spec_copy['spec'].get('tolerations', [])
|
|
1103
|
+
pod_spec_copy['spec']['tolerations'] = existing_tolerations + [
|
|
1104
|
+
gpu_toleration
|
|
1105
|
+
]
|
|
886
1106
|
|
|
887
1107
|
if to_create_deployment:
|
|
888
|
-
|
|
1108
|
+
volume.create_persistent_volume_claim(namespace, context, pvc_spec)
|
|
889
1109
|
|
|
890
1110
|
# It's safe to directly modify the template spec in the deployment spec
|
|
891
1111
|
# because controller pod is singleton, i in [0].
|
|
@@ -893,9 +1113,12 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
|
|
|
893
1113
|
# Add the deployment name as a label to the pod spec
|
|
894
1114
|
deployment_name = deployment_spec['metadata']['name']
|
|
895
1115
|
pod_spec_copy['metadata']['labels'][
|
|
896
|
-
TAG_SKYPILOT_DEPLOYMENT_NAME] = deployment_name
|
|
1116
|
+
k8s_constants.TAG_SKYPILOT_DEPLOYMENT_NAME] = deployment_name
|
|
897
1117
|
template_pod_spec['metadata'] = pod_spec_copy['metadata']
|
|
898
1118
|
template_pod_spec['spec'].update(pod_spec_copy['spec'])
|
|
1119
|
+
# Propagate the labels to the deployment for identification.
|
|
1120
|
+
deployment_spec['metadata']['labels'] = pod_spec_copy['metadata'][
|
|
1121
|
+
'labels']
|
|
899
1122
|
try:
|
|
900
1123
|
return kubernetes.apps_api(
|
|
901
1124
|
context).create_namespaced_deployment(
|
|
@@ -904,6 +1127,10 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                 print('Deployment failed', e)
                 raise e

+        # Check if any PVCs with access mode ReadWriteOnce or ReadWriteOncePod
+        # is used by any pod in the namespace.
+        volume.check_pvc_usage_for_pod(context, namespace, pod_spec_copy)
+
        return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
                                                   context)

@@ -922,9 +1149,16 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
                 'and then up the cluster again.')
             raise exceptions.InconsistentHighAvailabilityError(message)

-
-
-
+    created_resources = []
+    if to_start_count > 0:
+        # Create pods in parallel.
+        # Use `config.count` instead of `to_start_count` to keep the index of
+        # the Pods consistent especially for the case where some Pods are down
+        # due to node failure or manual termination, etc. and then launch
+        # again to create the Pods back.
+        # The existing Pods will be skipped in _create_resource_thread.
+        created_resources = subprocess_utils.run_in_parallel(
+            _create_resource_thread, list(range(config.count)), _NUM_THREADS)

     if to_create_deployment:
         deployments = copy.deepcopy(created_resources)
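The new block above launches one creation task per index in `range(config.count)` instead of one per missing pod, so a relaunch after a partial failure reuses the original pod indices, and pods that still exist are skipped inside the worker. A rough standalone sketch of that pattern; the `existing` set, the naming scheme, and `create_pod` are illustrative stand-ins, not SkyPilot code:

```python
# Sketch: create pods by stable index, skipping ones that already exist.
from concurrent.futures import ThreadPoolExecutor
from typing import Optional

existing = {'mycluster-head'}  # pods that survived a previous launch


def create_pod(index: int) -> Optional[str]:
    name = 'mycluster-head' if index == 0 else f'mycluster-worker{index}'
    if name in existing:
        return None  # keep the index reserved, but do not recreate the pod
    # ... call the Kubernetes API here ...
    return name


with ThreadPoolExecutor(max_workers=4) as pool:
    created = list(pool.map(create_pod, range(3)))

print([name for name in created if name is not None])
```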
@@ -937,20 +1171,22 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
         pods = created_resources

     created_pods = {}
+    valid_pods = []
     for pod in pods:
+        # In case Pod is not created
+        if pod is None:
+            continue
+        valid_pods.append(pod)
         created_pods[pod.metadata.name] = pod
         if head_pod_name is None and _is_head(pod):
             head_pod_name = pod.metadata.name
+    pods = valid_pods
+
+    # The running_pods may include Pending Pods, so we add them to the pods
+    # list to wait for scheduling and running
+    if running_pods:
+        pods = pods + list(running_pods.values())

-    networking_mode = network_utils.get_networking_mode(
-        config.provider_config.get('networking_mode'))
-    if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-        # Adding the jump pod to the new_nodes list as well so it can be
-        # checked if it's scheduled and running along with other pods.
-        ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
-        jump_pod = kubernetes.core_api(context).read_namespaced_pod(
-            ssh_jump_pod_name, namespace)
-        pods.append(jump_pod)
     provision_timeout = provider_config['timeout']

     wait_str = ('indefinitely'
@@ -960,12 +1196,17 @@ def _create_pods(region: str, cluster_name_on_cloud: str,

     # Wait until the pods are scheduled and surface cause for error
     # if there is one
-    _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout
+    _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout,
+                               cluster_name, create_pods_start)
+    # Reset spinner message here because it might have hinted autoscaling
+    # while waiting for pods to schedule.
+    rich_utils.force_update_status(
+        ux_utils.spinner_message('Launching', cluster_name=cluster_name))
     # Wait until the pods and their containers are up and running, and
     # fail early if there is an error
     logger.debug(f'run_instances: waiting for pods to be running (pulling '
                  f'images): {[pod.metadata.name for pod in pods]}')
-    _wait_for_pods_to_run(namespace, context, pods)
+    _wait_for_pods_to_run(namespace, context, cluster_name, pods)
     logger.debug(f'run_instances: all pods are scheduled and running: '
                  f'{[pod.metadata.name for pod in pods]}')

@@ -981,11 +1222,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
     )


-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
     try:
-        return _create_pods(region, cluster_name_on_cloud, config)
+        return _create_pods(region, cluster_name, cluster_name_on_cloud, config)
     except (kubernetes.api_exception(), config_lib.KubernetesError) as e:
         e_msg = common_utils.format_exception(e).replace('\n', ' ')
         logger.warning('run_instances: Error occurred when creating pods: '
@@ -1006,42 +1247,10 @@ def stop_instances(
     raise NotImplementedError()


-def
-
-
-
-    Args:
-        delete_func: Function to call to delete the resource
-        resource_type: Type of resource being deleted (e.g. 'service'),
-            used in logging
-        resource_name: Name of the resource being deleted, used in logging
-    """
-    max_retries = 3
-    retry_delay = 5  # seconds
-
-    for attempt in range(max_retries):
-        try:
-            delete_func()
-            return
-        except kubernetes.api_exception() as e:
-            if e.status == 404:
-                logger.warning(
-                    f'terminate_instances: Tried to delete {resource_type} '
-                    f'{resource_name}, but the {resource_type} was not '
-                    'found (404).')
-                return
-            elif attempt < max_retries - 1:
-                logger.warning(f'terminate_instances: Failed to delete '
-                               f'{resource_type} {resource_name} (attempt '
-                               f'{attempt + 1}/{max_retries}). Error: {e}. '
-                               f'Retrying in {retry_delay} seconds...')
-                time.sleep(retry_delay)
-            else:
-                raise
-
-
-def _delete_services(name_prefix: str, namespace: str,
-                     context: Optional[str]) -> None:
+def _delete_services(name_prefix: str,
+                     namespace: str,
+                     context: Optional[str],
+                     skip_ssh_service: bool = False) -> None:
     """Delete services with the given name prefix.

     Args:
@@ -1050,18 +1259,21 @@ def _delete_services(name_prefix: str, namespace: str,
         context: Kubernetes context
     """
     # TODO(andy): We should use tag for the service filter.
-
+    services = ([name_prefix, f'{name_prefix}-ssh']
+                if not skip_ssh_service else [name_prefix])
+    for service_name in services:
         # Since we are not saving this lambda, it's a false positive.
         # TODO(andyl): Wait for
         # https://github.com/pylint-dev/pylint/issues/5263.
         # pylint: disable=cell-var-from-loop
-
-
-
-
-
-
-
+        kubernetes_utils.delete_k8s_resource_with_retry(
+            delete_func=lambda: kubernetes.core_api(
+                context).delete_namespaced_service(name=service_name,
+                                                   namespace=namespace,
+                                                   _request_timeout=config_lib.
+                                                   DELETION_TIMEOUT),
+            resource_type='service',
+            resource_name=service_name)


 def _terminate_node(namespace: str,
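The retry helper removed above is not dropped: the new service, pod, deployment, and PVC cleanup paths all call it as `kubernetes_utils.delete_k8s_resource_with_retry`, so it has effectively moved into a shared utility. A self-contained sketch of the same retry-on-delete idea, treating a 404 as "already gone"; the `ApiError` class below is a stand-in assumption for the Kubernetes client's `ApiException`:

```python
# Sketch of a delete-with-retry helper: 404 means already deleted, other
# errors are retried a few times before re-raising. Names are illustrative.
import logging
import time
from typing import Callable

logger = logging.getLogger(__name__)


class ApiError(Exception):
    """Stand-in for kubernetes.client.exceptions.ApiException."""

    def __init__(self, status: int):
        super().__init__(f'status={status}')
        self.status = status


def delete_with_retry(delete_func: Callable[[], None],
                      resource_type: str,
                      resource_name: str,
                      max_retries: int = 3,
                      retry_delay: float = 5.0) -> None:
    for attempt in range(max_retries):
        try:
            delete_func()
            return
        except ApiError as e:
            if e.status == 404:
                # The resource is already gone; nothing left to delete.
                logger.warning('%s %s not found (404).', resource_type,
                               resource_name)
                return
            if attempt < max_retries - 1:
                logger.warning('Failed to delete %s %s (attempt %d/%d): %s. '
                               'Retrying in %.0f seconds...', resource_type,
                               resource_name, attempt + 1, max_retries, e,
                               retry_delay)
                time.sleep(retry_delay)
            else:
                raise
```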
@@ -1075,13 +1287,16 @@ def _terminate_node(namespace: str,
         # Delete services for the head pod
         # services are specified in sky/templates/kubernetes-ray.yml.j2
         _delete_services(pod_name, namespace, context)
+    else:
+        # No ssh service is created for worker pods
+        _delete_services(pod_name, namespace, context, skip_ssh_service=True)

     # Note - delete pod after all other resources are deleted.
     # This is to ensure there are no leftover resources if this down is run
     # from within the pod, e.g., for autodown.
     # Note - some misbehaving pods may not terminate gracefully if they have
     # open file descriptors. We force delete pods to avoid this.
-
+    kubernetes_utils.delete_k8s_resource_with_retry(
        delete_func=lambda: kubernetes.core_api(context).delete_namespaced_pod(
            name=pod_name,
            namespace=namespace,
@@ -1099,26 +1314,28 @@ def _terminate_deployment(cluster_name: str, namespace: str,

     # Delete deployment
     deployment_name = _get_deployment_name(cluster_name)
-
-
-
-
-
-
-
+    kubernetes_utils.delete_k8s_resource_with_retry(
+        delete_func=lambda: kubernetes.apps_api(
+            context).delete_namespaced_deployment(name=deployment_name,
+                                                  namespace=namespace,
+                                                  _request_timeout=config_lib.
+                                                  DELETION_TIMEOUT),
+        resource_type='deployment',
+        resource_name=deployment_name)

     # Delete PVCs
     pvc_name = _get_pvc_name(
         cluster_name,
         kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME)
     # pylint: disable=cell-var-from-loop
-
-
-
-
-
-
-
+    kubernetes_utils.delete_k8s_resource_with_retry(
+        delete_func=lambda: kubernetes.core_api(
+            context).delete_namespaced_persistent_volume_claim(
+                name=pvc_name,
+                namespace=namespace,
+                _request_timeout=config_lib.DELETION_TIMEOUT),
+        resource_type='pvc',
+        resource_name=pvc_name)


 def terminate_instances(
@@ -1133,18 +1350,6 @@ def terminate_instances(
                                        ray_tag_filter(cluster_name_on_cloud),
                                        None)

-    # Clean up the SSH jump pod if in use
-    networking_mode = network_utils.get_networking_mode(
-        provider_config.get('networking_mode'))
-    if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-        pod_name = list(pods.keys())[0]
-        try:
-            kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context,
-                                                       pod_name)
-        except Exception as e:  # pylint: disable=broad-except
-            logger.warning('terminate_instances: Error occurred when analyzing '
-                           f'SSH Jump pod: {e}')
-
     if is_high_availability_cluster_by_kubectl(cluster_name_on_cloud, context,
                                                namespace):
         # For high availability controllers, terminate the deployment
@@ -1175,16 +1380,11 @@ def get_cluster_info(

     running_pods = kubernetes_utils.filter_pods(
         namespace, context, ray_tag_filter(cluster_name_on_cloud), ['Running'])
+    logger.debug(f'Running pods: {list(running_pods.keys())}')

     pods: Dict[str, List[common.InstanceInfo]] = {}
     head_pod_name = None

-    port_forward_mode = kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD
-    network_mode_str = skypilot_config.get_nested(('kubernetes', 'networking'),
-                                                  port_forward_mode.value)
-    network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
-        network_mode_str)
-    external_ip = kubernetes_utils.get_external_ip(network_mode, context)
     port = 22
     if not provider_config.get('use_internal_ips', False):
         port = kubernetes_utils.get_head_ssh_port(cluster_name_on_cloud,
@@ -1198,10 +1398,12 @@ def get_cluster_info(
             common.InstanceInfo(
                 instance_id=pod_name,
                 internal_ip=internal_ip,
-                external_ip=
-                external_ip),
+                external_ip=None,
                 ssh_port=port,
                 tags=pod.metadata.labels,
+                # TODO(hailong): `cluster.local` may need to be configurable
+                # Service name is same as the pod name for now.
+                internal_svc=f'{pod_name}.{namespace}.svc.cluster.local',
             )
         ]
         if _is_head(pod):
@@ -1210,10 +1412,16 @@ def get_cluster_info(
             assert head_spec is not None, pod
             cpu_request = head_spec.containers[0].resources.requests['cpu']

-
+    if cpu_request is None:
+        raise RuntimeError(f'Pod {cluster_name_on_cloud}-head not found'
+                           ' or not Running, check the Pod status')

     ssh_user = 'sky'
-
+    # Use pattern matching to extract SSH user, handling MOTD contamination.
+    # Some container images (like CUDA-Q) print MOTD when login shells start,
+    # which can contaminate command output. We use a unique pattern to extract
+    # the actual username reliably.
+    get_k8s_ssh_user_cmd = 'echo "SKYPILOT_SSH_USER: $(whoami)"'
     assert head_pod_name is not None
     runner = command_runner.KubernetesCommandRunner(
         ((namespace, context), head_pod_name))
@@ -1223,10 +1431,24 @@ def get_cluster_info(
                                              stream_logs=False)
     _raise_command_running_error('get ssh user', get_k8s_ssh_user_cmd,
                                  head_pod_name, rc, stdout + stderr)
-
+
+    # Extract SSH user using pattern matching
+    ssh_user_match = _SSH_USER_PATTERN.search(stdout)
+    if ssh_user_match:
+        ssh_user = ssh_user_match.group(1)
+    else:
+        raise ValueError('Failed to find SSH user identifier: '
+                         f'{stdout + stderr}')
     logger.debug(
         f'Using ssh user {ssh_user} for cluster {cluster_name_on_cloud}')

+    # cpu_request may be a string like `100m`, need to parse and convert
+    num_cpus = kubernetes_utils.parse_cpu_or_gpu_resource_to_float(cpu_request)
+    # 'num-cpus' for ray must be an integer, but we should not set it to 0 if
+    # cpus is <1.
+    # Keep consistent with the logic in clouds/kubernetes.py
+    str_cpus = str(max(int(num_cpus), 1))
+
     return common.ClusterInfo(
         instances=pods,
         head_instance_id=head_pod_name,
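Two small conversions added above are easy to exercise in isolation: pulling the username out of possibly MOTD-contaminated shell output via the `SKYPILOT_SSH_USER:` marker, and turning a Kubernetes CPU request such as `100m` into a float before rounding it up to at least one Ray CPU. A sketch under the assumption that the marker regex looks roughly like the one below (the actual `_SSH_USER_PATTERN` definition is not shown in this hunk):

```python
# Sketch: extract the SSH user from noisy output and parse a CPU request.
import re

_SSH_USER_PATTERN = re.compile(r'SKYPILOT_SSH_USER: (\S+)')  # assumed form


def extract_ssh_user(output: str) -> str:
    match = _SSH_USER_PATTERN.search(output)
    if match is None:
        raise ValueError(f'Failed to find SSH user identifier: {output!r}')
    return match.group(1)


def parse_cpu_request(cpu_request: str) -> float:
    """Parse '100m' -> 0.1 and '2' -> 2.0 (millicpu suffix handling)."""
    if cpu_request.endswith('m'):
        return float(cpu_request[:-1]) / 1000.0
    return float(cpu_request)


motd_output = 'Welcome to CUDA-Q!\nSKYPILOT_SSH_USER: sky\n'
assert extract_ssh_user(motd_output) == 'sky'
# Ray's num-cpus must be an integer >= 1, even for fractional requests.
assert str(max(int(parse_cpu_request('100m')), 1)) == '1'
```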
@@ -1236,56 +1458,375 @@ def get_cluster_info(
         # problems for other pods.
         custom_ray_options={
             'object-store-memory': 500000000,
-            'num-cpus':
+            'num-cpus': str_cpus,
         },
         provider_name='kubernetes',
         provider_config=provider_config)


+def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
+    """Get pod termination reason and write to cluster events.
+
+    Checks both pod conditions (for preemption/disruption) and
+    container statuses (for exit codes/errors).
+    """
+    latest_timestamp = pod.status.start_time or datetime.datetime.min
+    ready_state = 'Unknown'
+    termination_reason = 'Terminated unexpectedly'
+    container_reasons = []
+
+    # Check pod status conditions for high level overview.
+    # No need to sort, as each condition.type will only appear once.
+    for condition in pod.status.conditions:
+        reason = condition.reason or 'Unknown reason'
+        message = condition.message or ''
+
+        # Get last known readiness state.
+        if condition.type == 'Ready':
+            ready_state = f'{reason} ({message})' if message else reason
+        # Kueue preemption, as defined in:
+        # https://pkg.go.dev/sigs.k8s.io/kueue/pkg/controller/jobs/pod#pkg-constants
+        elif condition.type == 'TerminationTarget':
+            termination_reason = f'Preempted by Kueue: {reason}'
+            if message:
+                termination_reason += f' ({message})'
+        # Generic disruption.
+        elif condition.type == 'DisruptionTarget':
+            termination_reason = f'Disrupted: {reason}'
+            if message:
+                termination_reason += f' ({message})'
+
+        if condition.last_transition_time is not None:
+            latest_timestamp = max(latest_timestamp,
+                                   condition.last_transition_time)
+
+    pod_reason = (f'{termination_reason}.\n'
+                  f'Last known state: {ready_state}.')
+
+    # Check container statuses for exit codes/errors
+    if pod.status and pod.status.container_statuses:
+        for container_status in pod.status.container_statuses:
+            terminated = container_status.state.terminated
+            if terminated:
+                exit_code = terminated.exit_code
+                reason = terminated.reason
+                if exit_code == 0:
+                    # skip exit 0 (non-failed) just for sanity
+                    logger.debug(f'{pod.metadata.name}/{container_status.name} '
+                                 'had exit code 0. Skipping.')
+                    continue
+                if reason is None:
+                    # just in-case reason is None, have default for debugging
+                    reason = f'exit({exit_code})'
+                container_reasons.append(reason)
+                latest_timestamp = max(latest_timestamp, terminated.finished_at)
+
+    # TODO (kyuds): later, if needed, query `last_state` too.
+
+    # Normally we will have a single container per pod for skypilot
+    # but doing this just in-case there are multiple containers.
+    if container_reasons:
+        pod_reason += f'\nContainer errors: {" | ".join(container_reasons)}'
+
+    global_user_state.add_cluster_event(
+        cluster_name,
+        None,
+        f'[kubernetes pod {pod.metadata.name} terminated] {pod_reason}',
+        global_user_state.ClusterEventType.DEBUG,
+        transitioned_at=int(latest_timestamp.timestamp()),
+    )
+    return pod_reason
+
+
+def _get_pod_missing_reason(context: Optional[str], namespace: str,
+                            cluster_name: str, pod_name: str) -> Optional[str]:
+    """Get events for missing pod and write to cluster events."""
+    logger.debug(f'Analyzing events for pod {pod_name}')
+    pod_field_selector = (
+        f'involvedObject.kind=Pod,involvedObject.name={pod_name}')
+    pod_events = kubernetes.core_api(context).list_namespaced_event(
+        namespace,
+        field_selector=pod_field_selector,
+        _request_timeout=kubernetes.API_TIMEOUT).items
+    pod_events = sorted(
+        pod_events,
+        key=lambda event: event.metadata.creation_timestamp,
+        # latest event appears first
+        reverse=True)
+    last_scheduled_node = None
+    insert_new_pod_event = True
+    new_event_inserted = False
+    inserted_pod_events = 0
+
+    for event in pod_events:
+        if event.reason == 'Scheduled':
+            pattern = r'Successfully assigned (\S+) to (\S+)'
+            match = re.search(pattern, event.message)
+            if match:
+                scheduled_node = match.group(2)
+                last_scheduled_node = scheduled_node
+        if insert_new_pod_event:
+            # Try inserting the latest events first. If the event is a
+            # duplicate, it means the event (and any previous events) have
+            # already been inserted - so do not insert further events.
+            try:
+                global_user_state.add_cluster_event(
+                    cluster_name,
+                    None, f'[kubernetes pod {pod_name}] '
+                    f'{event.reason} {event.message}',
+                    global_user_state.ClusterEventType.DEBUG,
+                    transitioned_at=int(
+                        event.metadata.creation_timestamp.timestamp()),
+                    expose_duplicate_error=True)
+                logger.debug(f'[pod {pod_name}] encountered new pod event: '
+                             f'{event.metadata.creation_timestamp} '
+                             f'{event.reason} {event.message}')
+            except db_utils.UniqueConstraintViolationError:
+                insert_new_pod_event = False
+            else:
+                new_event_inserted = True
+                inserted_pod_events += 1
+
+    logger.debug(f'[pod {pod_name}] processed {len(pod_events)} pod events and '
+                 f'inserted {inserted_pod_events} new pod events '
+                 'previously unseen')
+
+    if last_scheduled_node is not None:
+        node_field_selector = ('involvedObject.kind=Node,'
+                               f'involvedObject.name={last_scheduled_node}')
+        node_events = kubernetes.core_api(context).list_namespaced_event(
+            namespace,
+            field_selector=node_field_selector,
+            _request_timeout=kubernetes.API_TIMEOUT).items
+        node_events = sorted(
+            node_events,
+            key=lambda event: event.metadata.creation_timestamp,
+            # latest event appears first
+            reverse=True)
+        insert_new_node_event = True
+        inserted_node_events = 0
+        for event in node_events:
+            if insert_new_node_event:
+                # Try inserting the latest events first. If the event is a
+                # duplicate, it means the event (and any previous events) have
+                # already been inserted - so do not insert further events.
+                try:
+                    global_user_state.add_cluster_event(
+                        cluster_name,
+                        None, f'[kubernetes node {last_scheduled_node}] '
+                        f'{event.reason} {event.message}',
+                        global_user_state.ClusterEventType.DEBUG,
+                        transitioned_at=int(
+                            event.metadata.creation_timestamp.timestamp()),
+                        expose_duplicate_error=True)
+                    logger.debug(
+                        f'[pod {pod_name}] encountered new node event: '
+                        f'{event.metadata.creation_timestamp} '
+                        f'{event.reason} {event.message}')
+                except db_utils.UniqueConstraintViolationError:
+                    insert_new_node_event = False
+                else:
+                    new_event_inserted = True
+                    inserted_node_events += 1
+
+        logger.debug(f'[pod {pod_name}: node {last_scheduled_node}] '
+                     f'processed {len(node_events)} node events and '
+                     f'inserted {inserted_node_events} new node events '
+                     'previously unseen')
+    else:
+        logger.debug(f'[pod {pod_name}] could not determine the node '
+                     'the pod was scheduled to')
+
+    if not new_event_inserted:
+        # If new event is not inserted, there is no useful information to
+        # return. Return None.
+        return None
+
+    # Analyze the events for failure
+    failure_reason = None
+    failure_decisiveness = 0
+
+    def _record_failure_reason(reason: str, decisiveness: int):
+        nonlocal failure_reason, failure_decisiveness
+        if decisiveness > failure_decisiveness:
+            failure_reason = reason
+            failure_decisiveness = decisiveness
+
+    cluster_events = global_user_state.get_cluster_events(
+        cluster_name, None, global_user_state.ClusterEventType.DEBUG)
+    for event in cluster_events:
+        if event.startswith('[kubernetes pod'):
+            event = event.split(']')[1].strip()
+        elif event.startswith('[kubernetes node'):
+            event = event.split(']')[1].strip()
+
+        if event.startswith('NodeNotReady '):
+            _record_failure_reason(event[len('NodeNotReady '):], 1)
+        elif event.startswith('TaintManagerEviction '):
+            # usually the event message for TaintManagerEviction is not useful
+            # so we record a more generic message.
+            _record_failure_reason('pod was evicted by taint manager', 2)
+        elif event.startswith('DeletingNode '):
+            _record_failure_reason(event[len('DeletingNode '):], 3)
+    return failure_reason
+
+
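Both event loops above walk Kubernetes events newest-first and stop inserting as soon as a duplicate is hit, on the reasoning that anything older was already recorded during a previous poll. A generic sketch of that dedup pattern; the in-memory `seen` set and `DuplicateError` stand in for the cluster-events table and its unique constraint:

```python
# Sketch: insert events newest-first; the first duplicate means everything
# older is already stored, so stop early. `seen` mimics a DB unique key.
from typing import List, Set, Tuple

Event = Tuple[int, str]  # (timestamp, message)


class DuplicateError(Exception):
    """Stand-in for a unique-constraint violation from the events table."""


def insert(seen: Set[Event], event: Event) -> None:
    if event in seen:
        raise DuplicateError(event)
    seen.add(event)


def record_new_events(seen: Set[Event], events: List[Event]) -> int:
    inserted = 0
    for event in sorted(events, key=lambda e: e[0], reverse=True):
        try:
            insert(seen, event)
        except DuplicateError:
            break  # older events were inserted on a previous poll
        inserted += 1
    return inserted


store: Set[Event] = {(1, 'Scheduled'), (2, 'Pulled')}
print(record_new_events(store, [(1, 'Scheduled'), (2, 'Pulled'),
                                (3, 'NodeNotReady')]))  # -> 1
```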
+def list_namespaced_pod(context: Optional[str], namespace: str,
+                        cluster_name_on_cloud: str, is_ssh: bool, identity: str,
+                        label_selector: str) -> List[Any]:
+    # Get all the pods with the label skypilot-cluster-name: <cluster_name>
+    try:
+        # log the query parameters we pass to the k8s api
+        logger.debug(f'Querying k8s api for pods:\n'
+                     f'context: {context}\n'
+                     f'namespace: {namespace}\n'
+                     f'label selector:`{label_selector}`.')
+
+        response = kubernetes.core_api(context).list_namespaced_pod(
+            namespace,
+            label_selector=label_selector,
+            _request_timeout=kubernetes.API_TIMEOUT)
+
+        # log PodList response info
+        if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+            logger.debug(f'k8s api response for `{label_selector}`:\n'
+                         f'apiVersion={response.api_version}, '
+                         f'kind={response.kind},\n'
+                         f'metadata={response.metadata}')
+
+        pods = response.items
+
+        # log detailed Pod info
+        if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+            logger.debug(f'k8s api response for `{label_selector}`: '
+                         f'len(pods)={len(pods)}')
+            for pod in pods:
+                logger.debug(f'k8s pod info for `{label_selector}`: '
+                             f'pod.apiVersion={pod.api_version}, '
+                             f'pod.kind={pod.kind}, \n'
+                             f'pod.name={pod.metadata.name}, '
+                             f'pod.namespace={pod.metadata.namespace}, \n'
+                             f'pod.labels={pod.metadata.labels}, \n'
+                             f'pod.annotations={pod.metadata.annotations}, \n'
+                             'pod.creationTimestamp='
+                             f'{pod.metadata.creation_timestamp}, '
+                             'pod.deletionTimestamp='
+                             f'{pod.metadata.deletion_timestamp}, \n'
+                             f'pod.status={pod.status}')
+        return pods
+
+    except kubernetes.max_retry_error():
+        with ux_utils.print_exception_no_traceback():
+            if is_ssh:
+                node_pool = common_utils.removeprefix(context,
+                                                      'ssh-') if context else ''
+                msg = (
+                    f'Cannot connect to SSH Node Pool {node_pool}. '
+                    'Please check if the SSH Node Pool is up and accessible. '
+                    'To debug, run `sky check ssh` to check the status of '
+                    'the SSH Node Pool.')
+            else:
+                ctx = kubernetes_utils.get_current_kube_config_context_name()
+                msg = (f'Network error - check if the {identity} in '
+                       f'context {ctx} is up and accessible.')
+            raise exceptions.ClusterStatusFetchingError(
+                f'Failed to query cluster {cluster_name_on_cloud!r} status. ' +
+                msg) from None
+    except Exception as e:  # pylint: disable=broad-except
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.ClusterStatusFetchingError(
+                f'Failed to query {identity} {cluster_name_on_cloud!r} '
+                f'status: {common_utils.format_exception(e)}')
+
+
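The helper above is essentially a logged, error-wrapped call to `list_namespaced_pod` with a per-cluster label selector. A bare-bones sketch with the official `kubernetes` Python client; the label key shown and the kubeconfig loading are assumptions for illustration, not the package's exact constants:

```python
# Sketch: list the pods of one cluster via a label selector.
# Assumes `pip install kubernetes` and a reachable kubeconfig context.
from typing import Optional

from kubernetes import client, config


def list_cluster_pods(namespace: str, cluster_name_on_cloud: str,
                      context: Optional[str] = None):
    config.load_kube_config(context=context)
    core = client.CoreV1Api()
    selector = f'skypilot-cluster={cluster_name_on_cloud}'  # illustrative key
    pods = core.list_namespaced_pod(namespace,
                                    label_selector=selector,
                                    _request_timeout=30).items
    for pod in pods:
        # Phase plus deletion timestamp is enough to tell running from
        # terminating pods, mirroring the status logic in this module.
        print(pod.metadata.name, pod.status.phase,
              pod.metadata.deletion_timestamp)
    return pods
```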
 def query_instances(
+    cluster_name: str,
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
-    non_terminated_only: bool = True
-
+    non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
+) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+    # Mapping from pod phase to skypilot status. These are the only valid pod
+    # phases.
+    # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
     status_map = {
         'Pending': status_lib.ClusterStatus.INIT,
         'Running': status_lib.ClusterStatus.UP,
-        'Failed':
+        'Failed': status_lib.ClusterStatus.INIT,
         'Unknown': None,
         'Succeeded': None,
-        'Terminating': None,
     }

     assert provider_config is not None
     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
     context = kubernetes_utils.get_context_from_config(provider_config)
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    is_ssh = context.startswith('ssh-') if context else False
+    identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'
+    label_selector = (f'{constants.TAG_SKYPILOT_CLUSTER_NAME}='
+                      f'{cluster_name_on_cloud}')
+
+    attempts = 0
+    pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
+                               is_ssh, identity, label_selector)
+    # When we see no pods returned from the k8s api, we assume the pods have
+    # been terminated by the user directly and mark the cluster as terminated
+    # in the global user state.
+    # We add retry logic here as an attempt to mitigate a leak caused by the
+    # kubernetes api returning no pods despite the pods actually existing.
+    while (retry_if_missing and not pods and
+           attempts < _MAX_QUERY_INSTANCES_RETRIES):
+        logger.debug(f'Retrying to query k8s api for {cluster_name_on_cloud} '
+                     f'{attempts}/{_MAX_QUERY_INSTANCES_RETRIES} times.'
+                     f'after {_QUERY_INSTANCES_RETRY_INTERVAL} seconds.')
+        time.sleep(_QUERY_INSTANCES_RETRY_INTERVAL)
+        attempts += 1
+        pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
+                                   is_ssh, identity, label_selector)
+        if len(pods) > 0:
+            logger.info(f'Found {len(pods)} pods for {label_selector} after'
+                        f'{attempts} retries.')

     # Check if the pods are running or pending
-    cluster_status
+    cluster_status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
+                                    Optional[str]]] = {}
     for pod in pods:
-
+        phase = pod.status.phase
+        is_terminating = pod.metadata.deletion_timestamp is not None
+        pod_status = status_map[phase]
+        reason = None
+        if phase in ('Failed', 'Unknown') or is_terminating:
+            reason = _get_pod_termination_reason(pod, cluster_name)
+            logger.debug(f'Pod Status ({phase}) Reason(s): {reason}')
         if non_terminated_only and pod_status is None:
+            logger.debug(f'Pod {pod.metadata.name} is terminated, but '
+                         'query_instances is called with '
+                         f'non_terminated_only=True. Phase: {phase}')
             continue
-
+        pod_name = pod.metadata.name
+        reason = f'{pod_name}: {reason}' if reason is not None else None
+        cluster_status[pod_name] = (pod_status, reason)
+
+    # Find the list of pod names that should be there
+    # from k8s services. Filter duplicates as -ssh service
+    # creates a duplicate entry.
+    target_pod_names = list(
+        set([
+            service['spec']['selector']['component']
+            for service in provider_config.get('services', [])
+        ]))
+
+    for target_pod_name in target_pod_names:
+        if target_pod_name not in cluster_status:
+            # If the pod is not in the cluster_status, it means it's not
+            # running.
+            # Analyze what happened to the pod based on events.
+            reason = _get_pod_missing_reason(context, namespace, cluster_name,
+                                             target_pod_name)
+            reason = (f'{target_pod_name}: {reason}'
+                      if reason is not None else None)
+            if not non_terminated_only:
+                cluster_status[target_pod_name] = (None, reason)
+
     return cluster_status


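The new `retry_if_missing` path above treats an empty pod list with suspicion: it polls the API a few more times before concluding the pods were deleted out of band, to avoid marking a live cluster as terminated on a transiently empty response. A generic sketch of that bounded-retry pattern; the constants and the flaky listing function are illustrative only:

```python
# Sketch: bounded retry when a listing unexpectedly comes back empty,
# before concluding the resources are really gone.
import time
from typing import Callable, List

MAX_RETRIES = 3
RETRY_INTERVAL_SECONDS = 2.0


def list_with_retry(list_fn: Callable[[], List[str]]) -> List[str]:
    items = list_fn()
    attempts = 0
    while not items and attempts < MAX_RETRIES:
        time.sleep(RETRY_INTERVAL_SECONDS)
        attempts += 1
        items = list_fn()  # re-query; a transient empty response may recover
    return items


# A flaky listing that returns nothing on the first call only.
calls = {'n': 0}


def flaky_list() -> List[str]:
    calls['n'] += 1
    return [] if calls['n'] == 1 else ['mycluster-head']


print(list_with_retry(flaky_list))  # ['mycluster-head'] after one retry
```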
@@ -1307,7 +1848,8 @@ def get_command_runners(

     # Try to get deployment name from label first
     head_instance_info = instances[pod_name][0]
-    deployment = head_instance_info.tags.get(
+    deployment = head_instance_info.tags.get(
+        k8s_constants.TAG_SKYPILOT_DEPLOYMENT_NAME)

     node_list = [((namespace, context), pod_name)]
     head_runner = command_runner.KubernetesCommandRunner(