skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
Potentially problematic release.
This version of skypilot-nightly might be problematic.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/backends/backend_utils.py
CHANGED
@@ -1,4 +1,5 @@
 """Util constants/functions for the backends."""
+import asyncio
 from datetime import datetime
 import enum
 import fnmatch
@@ -6,18 +7,23 @@ import hashlib
 import os
 import pathlib
 import pprint
+import queue as queue_lib
 import re
 import shlex
 import subprocess
 import sys
 import tempfile
+import threading
 import time
 import typing
-from typing import Any, Dict, List, Optional, Sequence,
+from typing import (Any, Callable, Dict, Iterator, List, Optional, Sequence,
+                    Set, Tuple, TypeVar, Union)
 import uuid
 
+import aiohttp
+from aiohttp import ClientTimeout
+from aiohttp import TCPConnector
 import colorama
-import filelock
 from packaging import version
 from typing_extensions import Literal
 
@@ -28,30 +34,44 @@ from sky import check as sky_check
 from sky import clouds
 from sky import exceptions
 from sky import global_user_state
+from sky import logs
 from sky import provision as provision_lib
 from sky import sky_logging
 from sky import skypilot_config
 from sky.adaptors import common as adaptors_common
+from sky.jobs import utils as managed_job_utils
+from sky.provision import common as provision_common
 from sky.provision import instance_setup
 from sky.provision.kubernetes import utils as kubernetes_utils
+from sky.serve import serve_utils
+from sky.server.requests import requests as requests_lib
+from sky.skylet import autostop_lib
 from sky.skylet import constants
 from sky.usage import usage_lib
+from sky.utils import auth_utils
 from sky.utils import cluster_utils
 from sky.utils import command_runner
 from sky.utils import common
 from sky.utils import common_utils
+from sky.utils import context as context_lib
+from sky.utils import context_utils
 from sky.utils import controller_utils
 from sky.utils import env_options
+from sky.utils import locks
 from sky.utils import registry
 from sky.utils import resources_utils
 from sky.utils import rich_utils
 from sky.utils import schemas
 from sky.utils import status_lib
 from sky.utils import subprocess_utils
+from sky.utils import tempstore
 from sky.utils import timeline
 from sky.utils import ux_utils
+from sky.utils import yaml_utils
+from sky.workspaces import core as workspaces_core
 
 if typing.TYPE_CHECKING:
+    import grpc
     import requests
     from requests import adapters
     from requests.packages.urllib3.util import retry as retry_lib
@@ -62,6 +82,7 @@ if typing.TYPE_CHECKING:
     from sky import task as task_lib
     from sky.backends import cloud_vm_ray_backend
     from sky.backends import local_docker_backend
+    from sky.utils import volume as volume_lib
 else:
     yaml = adaptors_common.LazyImport('yaml')
     requests = adaptors_common.LazyImport('requests')
@@ -69,6 +90,8 @@ else:
     adapters = adaptors_common.LazyImport('requests.adapters')
     retry_lib = adaptors_common.LazyImport(
         'requests.packages.urllib3.util.retry')
+    # To avoid requiring grpcio to be installed on the client side.
+    grpc = adaptors_common.LazyImport('grpc')
 
 logger = sky_logging.init_logger(__name__)
 
@@ -91,6 +114,13 @@ _LAUNCHED_RESERVED_WORKER_PATTERN = re.compile(
 # 10.133.0.5: ray.worker.default,
 _LAUNCHING_IP_PATTERN = re.compile(
     r'({}): ray[._]worker[._](?:default|reserved)'.format(IP_ADDR_REGEX))
+SSH_CONNECTION_ERROR_PATTERN = re.compile(
+    r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
+_SSH_CONNECTION_TIMED_OUT_PATTERN = re.compile(r'^ssh:.*timed out$',
+                                               re.IGNORECASE)
+K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
+                                        re.IGNORECASE)
+_RAY_CLUSTER_NOT_FOUND_MESSAGE = 'Ray cluster is not found'
 WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
 
 # We check network connection by going through _TEST_IP_LIST. We may need to
@@ -98,24 +128,21 @@ WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
 # Fixed IP addresses are used to avoid DNS lookup blocking the check, for
 # machine with no internet connection.
 # Refer to: https://stackoverflow.com/questions/3764291/how-can-i-see-if-theres-an-available-and-active-network-connection-in-python # pylint: disable=line-too-long
-_TEST_IP_LIST = ['https://
+_TEST_IP_LIST = ['https://8.8.8.8', 'https://1.1.1.1']
 
 # Allow each CPU thread take 2 tasks.
 # Note: This value cannot be too small, otherwise OOM issue may occur.
 DEFAULT_TASK_CPU_DEMAND = 0.5
 
-# Filelocks for the cluster status change.
-CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock')
 CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
 
 # Time that must elapse since the last status check before we should re-check if
 # the cluster has been terminated or autostopped.
 _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
 
-# Filelocks for updating cluster's file_mounts.
-CLUSTER_FILE_MOUNTS_LOCK_PATH = os.path.expanduser(
-    '~/.sky/.{}_file_mounts.lock')
 CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
+WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
+CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10
 
 # Remote dir that holds our runtime files.
 _REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
@@ -124,7 +151,7 @@ _ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, '
                             'please retry after a while.')
 
 # If a cluster is less than LAUNCH_DOUBLE_CHECK_WINDOW seconds old, and we don't
-# see any instances in the cloud, the instances might be in the
+# see any instances in the cloud, the instances might be in the process of
 # being created. We will wait LAUNCH_DOUBLE_CHECK_DELAY seconds and then double
 # check to make sure there are still no instances. LAUNCH_DOUBLE_CHECK_DELAY
 # should be set longer than the delay between (sending the create instance
@@ -194,6 +221,9 @@ _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
     ('provider', 'availability_zone'),
 ]
 
+_ACK_MESSAGE = 'ack'
+_FORWARDING_FROM_MESSAGE = 'Forwarding from'
+
 
 def is_ip(s: str) -> bool:
     """Returns whether this string matches IP_ADDR_REGEX."""
@@ -212,7 +242,7 @@ def _get_yaml_path_from_cluster_name(cluster_name: str,
 # Add retry for the file mounts optimization, as the underlying cp command may
 # experience transient errors, #4758.
 @common_utils.retry
-def _optimize_file_mounts(
+def _optimize_file_mounts(tmp_yaml_path: str) -> None:
     """Optimize file mounts in the given ray yaml file.
 
     Runtime files handling:
@@ -226,7 +256,7 @@ def _optimize_file_mounts(yaml_path: str) -> None:
         subprocess.CalledProcessError: If the file mounts are failed to be
             copied.
     """
-    yaml_config =
+    yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
 
     file_mounts = yaml_config.get('file_mounts', {})
     # Remove the file mounts added by the newline.
@@ -242,7 +272,7 @@ def _optimize_file_mounts(yaml_path: str) -> None:
     # - use a remote command to move all runtime files to their right places.
 
     # Local tmp dir holding runtime files.
-    local_runtime_files_dir =
+    local_runtime_files_dir = tempstore.mkdtemp()
     new_file_mounts = {_REMOTE_RUNTIME_FILES_DIR: local_runtime_files_dir}
 
     # Generate local_src -> unique_name.
@@ -310,7 +340,7 @@ def _optimize_file_mounts(yaml_path: str) -> None:
         shell=True,
         check=True)
 
-
+    yaml_utils.dump_yaml(tmp_yaml_path, yaml_config)
 
 
 def path_size_megabytes(path: str) -> int:
@@ -339,7 +369,13 @@ def path_size_megabytes(path: str) -> int:
                      f'{git_exclude_filter} --dry-run {path!r}')
     rsync_output = ''
     try:
-
+        # rsync sometimes fails `--dry-run` for MacOS' rsync build, however this function is only used to display
+        # a warning message to the user if the size of a file/directory is too
+        # large, so we can safely ignore the error.
+        rsync_output = str(
+            subprocess.check_output(rsync_command,
+                                    shell=True,
+                                    stderr=subprocess.DEVNULL))
     except subprocess.CalledProcessError:
         logger.debug('Command failed, proceeding without estimating size: '
                      f'{rsync_command}')
@@ -464,8 +500,8 @@ def _replace_yaml_dicts(
         if key in old_block:
             _restore_block(value, old_block[key])
 
-    new_config =
-    old_config =
+    new_config = yaml_utils.safe_load(new_yaml)
+    old_config = yaml_utils.safe_load(old_yaml)
     excluded_results = {}
     # Find all key values excluded from restore
    for exclude_restore_key_name_list in restore_key_names_exceptions:
@@ -489,7 +525,7 @@ def _replace_yaml_dicts(
         for key in exclude_restore_key_name[:-1]:
             curr = curr[key]
         curr[exclude_restore_key_name[-1]] = value
-    return
+    return yaml_utils.dump_yaml_str(new_config)
 
 
 def get_expirable_clouds(
@@ -509,11 +545,55 @@ def get_expirable_clouds(
     expirable_clouds = []
     local_credentials_value = schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value
     for cloud in enabled_clouds:
-
-
-
-
-
+        # Kubernetes config might have context-specific properties
+        if isinstance(cloud, clouds.Kubernetes):
+            # get all custom contexts
+            contexts = kubernetes_utils.get_custom_config_k8s_contexts()
+            # add remote_identity of each context if it exists
+            remote_identities: Optional[Union[str, List[Dict[str, str]]]] = None
+            for context in contexts:
+                context_remote_identity = skypilot_config.get_effective_region_config(
+                    cloud='kubernetes',
+                    region=context,
+                    keys=('remote_identity',),
+                    default_value=None)
+                if context_remote_identity is not None:
+                    if remote_identities is None:
+                        remote_identities = []
+                    if isinstance(context_remote_identity, str):
+                        assert isinstance(remote_identities, list)
+                        remote_identities.append(
+                            {context: context_remote_identity})
+                    elif isinstance(context_remote_identity, list):
+                        assert isinstance(remote_identities, list)
+                        remote_identities.extend(context_remote_identity)
+            # add global kubernetes remote identity if it exists, if not, add default
+            global_remote_identity = skypilot_config.get_effective_region_config(
+                cloud='kubernetes',
+                region=None,
+                keys=('remote_identity',),
+                default_value=None)
+            if global_remote_identity is not None:
+                if remote_identities is None:
+                    remote_identities = []
+                if isinstance(global_remote_identity, str):
+                    assert isinstance(remote_identities, list)
+                    remote_identities.append({'*': global_remote_identity})
+                elif isinstance(global_remote_identity, list):
+                    assert isinstance(remote_identities, list)
+                    remote_identities.extend(global_remote_identity)
+            if remote_identities is None:
+                remote_identities = schemas.get_default_remote_identity(
+                    str(cloud).lower())
+        else:
+            remote_identities = skypilot_config.get_effective_region_config(
+                cloud=str(cloud).lower(),
+                region=None,
+                keys=('remote_identity',),
+                default_value=None)
+            if remote_identities is None:
+                remote_identities = schemas.get_default_remote_identity(
+                    str(cloud).lower())
 
         local_credential_expiring = cloud.can_credential_expire()
         if isinstance(remote_identities, str):
@@ -531,16 +611,18 @@ def get_expirable_clouds(
 # TODO: too many things happening here - leaky abstraction. Refactor.
 @timeline.event
 def write_cluster_config(
-
-
-
-
-
-
-
-
-
-
+        to_provision: 'resources_lib.Resources',
+        num_nodes: int,
+        cluster_config_template: str,
+        cluster_name: str,
+        local_wheel_path: pathlib.Path,
+        wheel_hash: str,
+        region: clouds.Region,
+        zones: Optional[List[clouds.Zone]] = None,
+        dryrun: bool = False,
+        keep_launch_fields_in_existing_config: bool = True,
+        volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
+) -> Dict[str, str]:
     """Fills in cluster configuration templates and writes them out.
 
     Returns:
@@ -588,12 +670,15 @@ def write_cluster_config(
         resources_utils.ClusterName(
             cluster_name,
             cluster_name_on_cloud,
-        ), region, zones, num_nodes, dryrun)
+        ), region, zones, num_nodes, dryrun, volume_mounts)
     config_dict = {}
 
     specific_reservations = set(
-        skypilot_config.
-
+        skypilot_config.get_effective_region_config(
+            cloud=str(to_provision.cloud).lower(),
+            region=to_provision.region,
+            keys=('specific_reservations',),
+            default_value=set()))
 
     # Remote identity handling can have 4 cases:
     # 1. LOCAL_CREDENTIALS (default for most clouds): Upload local credentials
@@ -605,9 +690,12 @@ def write_cluster_config(
     # other cases, we exclude the cloud from credential file uploads after
     # running required checks.
     assert cluster_name is not None
-    excluded_clouds = set()
-    remote_identity_config = skypilot_config.
-
+    excluded_clouds: Set[clouds.Cloud] = set()
+    remote_identity_config = skypilot_config.get_effective_region_config(
+        cloud=str(cloud).lower(),
+        region=region.name,
+        keys=('remote_identity',),
+        default_value=None)
     remote_identity = schemas.get_default_remote_identity(str(cloud).lower())
     if isinstance(remote_identity_config, str):
         remote_identity = remote_identity_config
@@ -636,15 +724,25 @@ def write_cluster_config(
             'is not supported by this cloud. Remove the config or set: '
             '`remote_identity: LOCAL_CREDENTIALS`.')
     if isinstance(cloud, clouds.Kubernetes):
-
-
+        allowed_contexts = skypilot_config.get_workspace_cloud(
+            'kubernetes').get('allowed_contexts', None)
+        if allowed_contexts is None:
+            allowed_contexts = skypilot_config.get_effective_region_config(
+                cloud='kubernetes',
+                region=None,
+                keys=('allowed_contexts',),
+                default_value=None)
+        if allowed_contexts is None:
             excluded_clouds.add(cloud)
     else:
         excluded_clouds.add(cloud)
 
     for cloud_str, cloud_obj in registry.CLOUD_REGISTRY.items():
-        remote_identity_config = skypilot_config.
-
+        remote_identity_config = skypilot_config.get_effective_region_config(
+            cloud=cloud_str.lower(),
+            region=region.name,
+            keys=('remote_identity',),
+            default_value=None)
         if remote_identity_config:
             if (remote_identity_config ==
                     schemas.RemoteIdentityOptions.NO_UPLOAD.value):
@@ -652,15 +750,24 @@ def write_cluster_config(
 
     credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)
 
-
+    logging_agent = logs.get_logging_agent()
+    if logging_agent:
+        for k, v in logging_agent.get_credential_file_mounts().items():
+            assert k not in credentials, f'{k} already in credentials'
+            credentials[k] = v
+
+    private_key_path, _ = auth_utils.get_or_generate_keys()
     auth_config = {'ssh_private_key': private_key_path}
     region_name = resources_vars.get('region')
 
     yaml_path = _get_yaml_path_from_cluster_name(cluster_name)
 
     # Retrieve the ssh_proxy_command for the given cloud / region.
-    ssh_proxy_command_config = skypilot_config.
-
+    ssh_proxy_command_config = skypilot_config.get_effective_region_config(
+        cloud=str(cloud).lower(),
+        region=None,
+        keys=('ssh_proxy_command',),
+        default_value=None)
     if (isinstance(ssh_proxy_command_config, str) or
             ssh_proxy_command_config is None):
         ssh_proxy_command = ssh_proxy_command_config
@@ -683,10 +790,63 @@ def write_cluster_config(
         assert region_name in ssh_proxy_command_config, (
             region_name, ssh_proxy_command_config)
         ssh_proxy_command = ssh_proxy_command_config[region_name]
+
+    use_internal_ips = skypilot_config.get_effective_region_config(
+        cloud=str(cloud).lower(),
+        region=region.name,
+        keys=('use_internal_ips',),
+        default_value=False)
+    if isinstance(cloud, clouds.AWS):
+        # If the use_ssm flag is set to true, we use the ssm proxy command.
+        use_ssm = skypilot_config.get_effective_region_config(
+            cloud=str(cloud).lower(),
+            region=region.name,
+            keys=('use_ssm',),
+            default_value=None)
+
+        if use_ssm and ssh_proxy_command is not None:
+            raise exceptions.InvalidCloudConfigs(
+                'use_ssm is set to true, but ssh_proxy_command '
+                f'is already set to {ssh_proxy_command!r}. Please remove '
+                'ssh_proxy_command or set use_ssm to false.')
+
+        if use_internal_ips and ssh_proxy_command is None:
+            # Only if use_ssm is explicitly not set, we default to using SSM.
+            if use_ssm is None:
+                logger.warning(
+                    f'{colorama.Fore.YELLOW}'
+                    'use_internal_ips is set to true, '
+                    'but ssh_proxy_command is not set. Defaulting to '
+                    'using SSM. Specify ssh_proxy_command to use a different '
+                    'https://docs.skypilot.co/en/latest/reference/config.html#'
+                    f'aws.ssh_proxy_command.{colorama.Style.RESET_ALL}')
+                use_ssm = True
+
+        if use_ssm:
+            aws_profile = os.environ.get('AWS_PROFILE', None)
+            profile_str = f'--profile {aws_profile}' if aws_profile else ''
+            ip_address_filter = ('Name=private-ip-address,Values=%h'
+                                 if use_internal_ips else
+                                 'Name=ip-address,Values=%h')
+            get_instance_id_command = 'aws ec2 describe-instances ' + \
+                f'--region {region_name} --filters {ip_address_filter} ' + \
+                '--query \"Reservations[].Instances[].InstanceId\" ' + \
+                f'{profile_str} --output text'
+            ssm_proxy_command = 'aws ssm start-session --target ' + \
+                f'\"$({get_instance_id_command})\" ' + \
+                f'--region {region_name} {profile_str} ' + \
+                '--document-name AWS-StartSSHSession ' + \
+                '--parameters portNumber=%p'
+            ssh_proxy_command = ssm_proxy_command
+            region_name = 'ssm-session'
     logger.debug(f'Using ssh_proxy_command: {ssh_proxy_command!r}')
 
     # User-supplied global instance tags from ~/.sky/config.yaml.
-    labels = skypilot_config.
+    labels = skypilot_config.get_effective_region_config(
+        cloud=str(cloud).lower(),
+        region=region.name,
+        keys=('labels',),
+        default_value={})
     # labels is a dict, which is guaranteed by the type check in
     # schemas.py
     assert isinstance(labels, dict), labels
@@ -715,6 +875,22 @@ def write_cluster_config(
     high_availability_specified = controller_utils.high_availability_specified(
         cluster_name)
 
+    volume_mount_vars = []
+    if volume_mounts is not None:
+        for vol in volume_mounts:
+            volume_mount_vars.append({
+                'name': vol.volume_name,
+                'path': vol.path,
+                'volume_name_on_cloud': vol.volume_config.name_on_cloud,
+                'volume_id_on_cloud': vol.volume_config.id_on_cloud,
+            })
+
+    runcmd = skypilot_config.get_effective_region_config(
+        cloud=str(to_provision.cloud).lower(),
+        region=to_provision.region,
+        keys=('post_provision_runcmd',),
+        default_value=None)
+
     # Use a tmp file path to avoid incomplete YAML file being re-used in the
     # future.
     tmp_yaml_path = yaml_path + '.tmp'
@@ -734,18 +910,23 @@ def write_cluster_config(
                     os.environ.get(constants.USER_ENV_VAR, '')),
 
                 # Networking configs
-                'use_internal_ips': skypilot_config.
-
+                'use_internal_ips': skypilot_config.get_effective_region_config(
+                    cloud=str(cloud).lower(),
+                    region=region.name,
+                    keys=('use_internal_ips',),
+                    default_value=False),
                 'ssh_proxy_command': ssh_proxy_command,
-                'vpc_name': skypilot_config.
-
-
+                'vpc_name': skypilot_config.get_effective_region_config(
+                    cloud=str(cloud).lower(),
+                    region=region.name,
+                    keys=('vpc_name',),
+                    default_value=None),
                 # User-supplied labels.
                 'labels': labels,
                 # User-supplied remote_identity
                 'remote_identity': remote_identity,
                 # The reservation pools that specified by the user. This is
-                # currently only used by GCP.
+                # currently only used by AWS and GCP.
                 'specific_reservations': specific_reservations,
 
                 # Conda setup
@@ -805,6 +986,13 @@ def write_cluster_config(
 
                 # High availability
                 'high_availability': high_availability_specified,
+
+                # Volume mounts
+                'volume_mounts': volume_mount_vars,
+
+                # runcmd to append to the cloud-init cloud config passed to the
+                # machine's UserData. This is currently only used by AWS.
+                'runcmd': runcmd,
             }),
         output_path=tmp_yaml_path)
     config_dict['cluster_name'] = cluster_name
@@ -812,14 +1000,20 @@ def write_cluster_config(
 
     # Add kubernetes config fields from ~/.sky/config
     if isinstance(cloud, clouds.Kubernetes):
-
-
-
-
-
-
+        cluster_config_overrides = to_provision.cluster_config_overrides
+        with open(tmp_yaml_path, 'r', encoding='utf-8') as f:
+            tmp_yaml_str = f.read()
+        cluster_yaml_obj = yaml_utils.safe_load(tmp_yaml_str)
+        combined_yaml_obj = kubernetes_utils.combine_pod_config_fields_and_metadata(
+            cluster_yaml_obj,
+            cluster_config_overrides=cluster_config_overrides,
+            cloud=cloud,
+            context=region.name)
+        # Write the updated YAML back to the file
+        yaml_utils.dump_yaml(tmp_yaml_path, combined_yaml_obj)
+
+        pod_config: Dict[str, Any] = combined_yaml_obj['available_node_types'][
             'ray_head_default']['node_config']
-
         # Check pod spec only. For high availability controllers, we deploy pvc & deployment for the controller. Read kubernetes-ray.yml.j2 for more details.
         pod_config.pop('deployment_spec', None)
         pod_config.pop('pvc_spec', None)
@@ -841,9 +1035,8 @@ def write_cluster_config(
     _add_auth_to_cluster_config(cloud, tmp_yaml_path)
 
     # Restore the old yaml content for backward compatibility.
-
-
-            old_yaml_content = f.read()
+    old_yaml_content = global_user_state.get_cluster_yaml_str(yaml_path)
+    if old_yaml_content is not None and keep_launch_fields_in_existing_config:
         with open(tmp_yaml_path, 'r', encoding='utf-8') as f:
             new_yaml_content = f.read()
         restored_yaml_content = _replace_yaml_dicts(
@@ -856,7 +1049,7 @@ def write_cluster_config(
     # Read the cluster name from the tmp yaml file, to take the backward
     # compatbility restortion above into account.
     # TODO: remove this after 2 minor releases, 0.10.0.
-    yaml_config =
+    yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
    config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
 
     # Make sure to do this before we optimize file mounts. Optimization is
@@ -880,18 +1073,29 @@ def write_cluster_config(
     # compatibility should go before this call.
     _optimize_file_mounts(tmp_yaml_path)
 
-    #
-
-
+    # commit the final yaml to the database
+    global_user_state.set_cluster_yaml(
+        cluster_name,
+        open(tmp_yaml_path, 'r', encoding='utf-8').read())
+
+    usage_lib.messages.usage.update_ray_yaml(tmp_yaml_path)
+
+    # Remove the tmp file.
+    if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+        debug_yaml_path = yaml_path + '.debug'
+        os.rename(tmp_yaml_path, debug_yaml_path)
+    else:
+        os.remove(tmp_yaml_path)
+
     return config_dict
 
 
-def _add_auth_to_cluster_config(cloud: clouds.Cloud,
+def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
     """Adds SSH key info to the cluster config.
 
     This function's output removes comments included in the jinja2 template.
     """
-    config =
+    config = yaml_utils.read_yaml(tmp_yaml_path)
     # Check the availability of the cloud type.
     if isinstance(cloud, (
             clouds.AWS,
@@ -919,9 +1123,17 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
         config = auth.setup_vast_authentication(config)
     elif isinstance(cloud, clouds.Fluidstack):
         config = auth.setup_fluidstack_authentication(config)
+    elif isinstance(cloud, clouds.Hyperbolic):
+        config = auth.setup_hyperbolic_authentication(config)
+    elif isinstance(cloud, clouds.Shadeform):
+        config = auth.setup_shadeform_authentication(config)
+    elif isinstance(cloud, clouds.PrimeIntellect):
+        config = auth.setup_primeintellect_authentication(config)
+    elif isinstance(cloud, clouds.Seeweb):
+        config = auth.setup_seeweb_authentication(config)
     else:
         assert False, cloud
-
+    yaml_utils.dump_yaml(tmp_yaml_path, config)
 
 
 def get_timestamp_from_run_timestamp(run_timestamp: str) -> float:
@@ -979,7 +1191,7 @@ def _count_healthy_nodes_from_ray(output: str,
 
 
 @timeline.event
-def _deterministic_cluster_yaml_hash(
+def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
     """Hash the cluster yaml and contents of file mounts to a unique string.
 
     Two invocations of this function should return the same string if and only
@@ -1021,9 +1233,8 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
     Rather than constructing the whole byte sequence, which may be quite large,
     we construct it incrementally by using hash.update() to add new bytes.
     """
-
     # Load the yaml contents so that we can directly remove keys.
-    yaml_config =
+    yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
     for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
         dict_to_remove_from = yaml_config
         found_key = True
@@ -1042,7 +1253,7 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
     config_hash = hashlib.sha256()
 
     yaml_hash = hashlib.sha256(
-
+        yaml_utils.dump_yaml_str(yaml_config).encode('utf-8'))
     config_hash.update(yaml_hash.digest())
 
     file_mounts = yaml_config.get('file_mounts', {})
@@ -1052,7 +1263,7 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
         file_mounts.pop('')
 
     for dst, src in sorted(file_mounts.items()):
-        if src ==
+        if src == tmp_yaml_path:
             # Skip the yaml file itself. We have already hashed a modified
             # version of it. The file may include fields we don't want to hash.
             continue
@@ -1147,7 +1358,7 @@ def wait_until_ray_cluster_ready(
             logger.error(common_utils.format_exception(e))
             return False, None  # failed
 
-    config =
+    config = global_user_state.get_cluster_yaml_dict(cluster_config_file)
 
     docker_user = None
     if 'docker' in config:
@@ -1247,11 +1458,11 @@ def ssh_credential_from_yaml(
     """
     if cluster_yaml is None:
         return dict()
-    config =
+    config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
     auth_section = config['auth']
     if ssh_user is None:
         ssh_user = auth_section['ssh_user'].strip()
-
+    ssh_private_key_path = auth_section.get('ssh_private_key')
     ssh_control_name = config.get('cluster_name', '__default__')
     ssh_proxy_command = auth_section.get('ssh_proxy_command')
 
@@ -1260,9 +1471,10 @@ def ssh_credential_from_yaml(
             constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
         ssh_proxy_command = ssh_proxy_command.replace(
             constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
+
     credentials = {
         'ssh_user': ssh_user,
-        'ssh_private_key':
+        'ssh_private_key': ssh_private_key_path,
         'ssh_control_name': ssh_control_name,
         'ssh_proxy_command': ssh_proxy_command,
     }
@@ -1275,6 +1487,62 @@ def ssh_credential_from_yaml(
     return credentials
 
 
+def ssh_credentials_from_handles(
+        handles: List['cloud_vm_ray_backend.CloudVmRayResourceHandle'],
+) -> List[Dict[str, Any]]:
+    """Returns ssh_user, ssh_private_key and ssh_control name.
+    """
+    non_empty_cluster_yaml_paths = [
+        handle.cluster_yaml
+        for handle in handles
+        if handle.cluster_yaml is not None
+    ]
+    cluster_yaml_dicts = global_user_state.get_cluster_yaml_dict_multiple(
+        non_empty_cluster_yaml_paths)
+    cluster_yaml_dicts_to_index = {
+        cluster_yaml_path: cluster_yaml_dict
+        for cluster_yaml_path, cluster_yaml_dict in zip(
+            non_empty_cluster_yaml_paths, cluster_yaml_dicts)
+    }
+
+    credentials_to_return: List[Dict[str, Any]] = []
+    for handle in handles:
+        if handle.cluster_yaml is None:
+            credentials_to_return.append(dict())
+            continue
+        ssh_user = handle.ssh_user
+        docker_user = handle.docker_user
+        config = cluster_yaml_dicts_to_index[handle.cluster_yaml]
+        auth_section = config['auth']
+        if ssh_user is None:
+            ssh_user = auth_section['ssh_user'].strip()
+        ssh_private_key_path = auth_section.get('ssh_private_key')
+        ssh_control_name = config.get('cluster_name', '__default__')
+        ssh_proxy_command = auth_section.get('ssh_proxy_command')
+
+        # Update the ssh_user placeholder in proxy command, if required
+        if (ssh_proxy_command is not None and
+                constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
+            ssh_proxy_command = ssh_proxy_command.replace(
+                constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
+
+        credentials = {
+            'ssh_user': ssh_user,
+            'ssh_private_key': ssh_private_key_path,
+            'ssh_control_name': ssh_control_name,
+            'ssh_proxy_command': ssh_proxy_command,
+        }
+        if docker_user is not None:
|
|
1536
|
+
credentials['docker_user'] = docker_user
|
|
1537
|
+
ssh_provider_module = config['provider']['module']
|
|
1538
|
+
# If we are running ssh command on kubernetes node.
|
|
1539
|
+
if 'kubernetes' in ssh_provider_module:
|
|
1540
|
+
credentials['disable_control_master'] = True
|
|
1541
|
+
credentials_to_return.append(credentials)
|
|
1542
|
+
|
|
1543
|
+
return credentials_to_return
|
|
1544
|
+
|
|
1545
|
+
|
|
1278
1546
|
def parallel_data_transfer_to_nodes(
|
|
1279
1547
|
runners: List[command_runner.CommandRunner],
|
|
1280
1548
|
source: Optional[str],
|
|
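
The new `ssh_credentials_from_handles` above avoids one YAML lookup per cluster by fetching all cluster YAMLs in a single batched call and indexing the results by path. A generic sketch of that batch-then-index pattern follows; `load_many` is a hypothetical stand-in for `global_user_state.get_cluster_yaml_dict_multiple`.

```python
from typing import Dict, List, Optional


def load_many(paths: List[str]) -> List[Dict]:
    """Hypothetical batch loader: one call returns one dict per path."""
    return [{'path': p} for p in paths]


def credentials_for(paths: List[Optional[str]]) -> List[Dict]:
    # Collect the non-empty paths once, preserving order.
    present = [p for p in paths if p is not None]
    # One batched fetch instead of len(paths) individual reads.
    loaded = dict(zip(present, load_many(present)))
    # Map the results back, keeping an empty dict for missing entries.
    return [loaded[p] if p is not None else {} for p in paths]


if __name__ == '__main__':
    print(credentials_for(['a.yaml', None, 'b.yaml']))
```
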
@@ -1435,7 +1703,7 @@ def get_node_ips(cluster_yaml: str,
|
|
|
1435
1703
|
exceptions.FetchClusterInfoError: if we failed to get the IPs. e.reason is
|
|
1436
1704
|
HEAD or WORKER.
|
|
1437
1705
|
"""
|
|
1438
|
-
ray_config =
|
|
1706
|
+
ray_config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
|
|
1439
1707
|
# Use the new provisioner for AWS.
|
|
1440
1708
|
provider_name = cluster_utils.get_provider_name(ray_config)
|
|
1441
1709
|
cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
|
|
@@ -1523,18 +1791,54 @@ def get_node_ips(cluster_yaml: str,
|
|
|
1523
1791
|
|
|
1524
1792
|
def check_network_connection():
|
|
1525
1793
|
# Tolerate 3 retries as it is observed that connections can fail.
|
|
1526
|
-
adapter = adapters.HTTPAdapter(max_retries=retry_lib.Retry(total=3))
|
|
1527
1794
|
http = requests.Session()
|
|
1528
|
-
http.mount('https://',
|
|
1529
|
-
http.mount('http://',
|
|
1530-1537 | - (removed lines not rendered in this diff view)
1795
|
+
http.mount('https://', adapters.HTTPAdapter())
|
|
1796
|
+
http.mount('http://', adapters.HTTPAdapter())
|
|
1797
|
+
|
|
1798
|
+
# Alternate between IPs on each retry
|
|
1799
|
+
max_retries = 3
|
|
1800
|
+
timeout = 0.5
|
|
1801
|
+
|
|
1802
|
+
for _ in range(max_retries):
|
|
1803
|
+
for ip in _TEST_IP_LIST:
|
|
1804
|
+
try:
|
|
1805
|
+
http.head(ip, timeout=timeout)
|
|
1806
|
+
return
|
|
1807
|
+
except (requests.Timeout, requests.exceptions.ConnectionError):
|
|
1808
|
+
continue
|
|
1809
|
+
|
|
1810
|
+
timeout *= 2 # Double the timeout for next retry
|
|
1811
|
+
|
|
1812
|
+
# If we get here, all IPs failed
|
|
1813
|
+
# Assume network connection is down
|
|
1814
|
+
raise exceptions.NetworkError('Could not refresh the cluster. '
|
|
1815
|
+
'Network seems down.')
|
|
1816
|
+
|
|
1817
|
+
|
|
1818
|
+
async def async_check_network_connection():
|
|
1819
|
+
"""Check if the network connection is available.
|
|
1820
|
+
|
|
1821
|
+
Tolerates 3 retries as it is observed that connections can fail.
|
|
1822
|
+
Uses aiohttp for async HTTP requests.
|
|
1823
|
+
"""
|
|
1824
|
+
# Create a session with retry logic
|
|
1825
|
+
timeout = ClientTimeout(total=15)
|
|
1826
|
+
connector = TCPConnector(limit=1) # Limit to 1 connection at a time
|
|
1827
|
+
|
|
1828
|
+
async with aiohttp.ClientSession(timeout=timeout,
|
|
1829
|
+
connector=connector) as session:
|
|
1830
|
+
for i, ip in enumerate(_TEST_IP_LIST):
|
|
1831
|
+
try:
|
|
1832
|
+
async with session.head(ip) as response:
|
|
1833
|
+
if response.status < 400: # Any 2xx or 3xx status is good
|
|
1834
|
+
return
|
|
1835
|
+
except (aiohttp.ClientError, asyncio.TimeoutError) as e:
|
|
1836
|
+
if i == len(_TEST_IP_LIST) - 1:
|
|
1837
|
+
raise exceptions.NetworkError(
|
|
1838
|
+
'Could not refresh the cluster. '
|
|
1839
|
+
'Network seems down.') from e
|
|
1840
|
+
# If not the last IP, continue to try the next one
|
|
1841
|
+
continue
|
|
1538
1842
|
|
|
1539
1843
|
|
|
1540
1844
|
@timeline.event
|
|
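
`check_network_connection` above now alternates across a list of test endpoints and doubles a short timeout on each pass, instead of relying on urllib3-level retries. A minimal standalone version of that loop is shown below; the endpoint URLs are placeholders and `NetworkError` stands in for `exceptions.NetworkError`.

```python
import requests

TEST_ENDPOINTS = ['https://example.com', 'https://example.org']  # placeholders


class NetworkError(Exception):
    pass


def check_network_connection(max_retries: int = 3) -> None:
    session = requests.Session()
    timeout = 0.5
    for _ in range(max_retries):
        for url in TEST_ENDPOINTS:
            try:
                # A HEAD request is enough to prove basic reachability.
                session.head(url, timeout=timeout)
                return
            except (requests.Timeout, requests.exceptions.ConnectionError):
                continue
        timeout *= 2  # Back off before trying the endpoints again.
    raise NetworkError('Network seems down.')
```
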
@@ -1549,14 +1853,34 @@ def check_owner_identity(cluster_name: str) -> None:
|
|
|
1549
1853
|
"""
|
|
1550
1854
|
if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
|
|
1551
1855
|
return
|
|
1552
|
-
record = global_user_state.get_cluster_from_name(cluster_name
|
|
1856
|
+
record = global_user_state.get_cluster_from_name(cluster_name,
|
|
1857
|
+
include_user_info=False,
|
|
1858
|
+
summary_response=True)
|
|
1553
1859
|
if record is None:
|
|
1554
1860
|
return
|
|
1861
|
+
_check_owner_identity_with_record(cluster_name, record)
|
|
1862
|
+
|
|
1863
|
+
|
|
1864
|
+
def _check_owner_identity_with_record(cluster_name: str,
|
|
1865
|
+
record: Dict[str, Any]) -> None:
|
|
1866
|
+
if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
|
|
1867
|
+
return
|
|
1555
1868
|
handle = record['handle']
|
|
1556
1869
|
if not isinstance(handle, backends.CloudVmRayResourceHandle):
|
|
1557
1870
|
return
|
|
1871
|
+
active_workspace = skypilot_config.get_active_workspace()
|
|
1872
|
+
cluster_workspace = record.get('workspace',
|
|
1873
|
+
constants.SKYPILOT_DEFAULT_WORKSPACE)
|
|
1874
|
+
if active_workspace != cluster_workspace:
|
|
1875
|
+
with ux_utils.print_exception_no_traceback():
|
|
1876
|
+
raise exceptions.ClusterOwnerIdentityMismatchError(
|
|
1877
|
+
f'{colorama.Fore.YELLOW}'
|
|
1878
|
+
f'The cluster {cluster_name!r} is in workspace '
|
|
1879
|
+
f'{cluster_workspace!r}, but the active workspace is '
|
|
1880
|
+
f'{active_workspace!r}.{colorama.Fore.RESET}')
|
|
1558
1881
|
|
|
1559
|
-
|
|
1882
|
+
launched_resources = handle.launched_resources.assert_launchable()
|
|
1883
|
+
cloud = launched_resources.cloud
|
|
1560
1884
|
user_identities = cloud.get_user_identities()
|
|
1561
1885
|
owner_identity = record['owner']
|
|
1562
1886
|
if user_identities is None:
|
|
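
The new `_check_owner_identity_with_record` above rejects a cluster whose recorded workspace differs from the active workspace before comparing cloud identities. A tiny illustration of that guard, with a hypothetical record shape and exception type:

```python
DEFAULT_WORKSPACE = 'default'  # assumed default workspace name


class WorkspaceMismatchError(Exception):
    pass


def check_workspace(cluster_name: str, record: dict,
                    active_workspace: str) -> None:
    cluster_workspace = record.get('workspace', DEFAULT_WORKSPACE)
    if active_workspace != cluster_workspace:
        raise WorkspaceMismatchError(
            f'Cluster {cluster_name!r} is in workspace {cluster_workspace!r}, '
            f'but the active workspace is {active_workspace!r}.')
```
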
@@ -1625,22 +1949,26 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:
|
|
|
1625
1949
|
}
|
|
1626
1950
|
|
|
1627
1951
|
|
|
1952
|
+
@context_utils.cancellation_guard
|
|
1628
1953
|
def _query_cluster_status_via_cloud_api(
|
|
1629
|
-
handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
|
|
1630
|
-
|
|
1631
|
-
|
|
1954
|
+
handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
|
|
1955
|
+
retry_if_missing: bool,
|
|
1956
|
+
) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
|
|
1957
|
+
"""Returns the status of the cluster as a list of tuples corresponding
|
|
1958
|
+
to the node status and an optional reason string for said status.
|
|
1632
1959
|
|
|
1633
1960
|
Raises:
|
|
1634
1961
|
exceptions.ClusterStatusFetchingError: the cluster status cannot be
|
|
1635
1962
|
fetched from the cloud provider.
|
|
1636
1963
|
"""
|
|
1964
|
+
cluster_name = handle.cluster_name
|
|
1637
1965
|
cluster_name_on_cloud = handle.cluster_name_on_cloud
|
|
1638
1966
|
cluster_name_in_hint = common_utils.cluster_name_in_hint(
|
|
1639
1967
|
handle.cluster_name, cluster_name_on_cloud)
|
|
1640
1968
|
# Use region and zone from the cluster config, instead of the
|
|
1641
1969
|
# handle.launched_resources, because the latter may not be set
|
|
1642
1970
|
# correctly yet.
|
|
1643
|
-
ray_config =
|
|
1971
|
+
ray_config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
|
|
1644
1972
|
provider_config = ray_config['provider']
|
|
1645
1973
|
|
|
1646
1974
|
# Query the cloud provider.
|
|
@@ -1651,7 +1979,11 @@ def _query_cluster_status_via_cloud_api(
|
|
|
1651
1979
|
cloud_name = repr(handle.launched_resources.cloud)
|
|
1652
1980
|
try:
|
|
1653
1981
|
node_status_dict = provision_lib.query_instances(
|
|
1654
|
-
cloud_name,
|
|
1982
|
+
cloud_name,
|
|
1983
|
+
cluster_name,
|
|
1984
|
+
cluster_name_on_cloud,
|
|
1985
|
+
provider_config,
|
|
1986
|
+
retry_if_missing=retry_if_missing)
|
|
1655
1987
|
logger.debug(f'Querying {cloud_name} cluster '
|
|
1656
1988
|
f'{cluster_name_in_hint} '
|
|
1657
1989
|
f'status:\n{pprint.pformat(node_status_dict)}')
|
|
@@ -1667,12 +1999,55 @@ def _query_cluster_status_via_cloud_api(
|
|
|
1667
1999
|
region = provider_config.get('region') or provider_config.get(
|
|
1668
2000
|
'location')
|
|
1669
2001
|
zone = ray_config['provider'].get('availability_zone')
|
|
2002
|
+
# TODO (kyuds): refactor cloud.query_status api to include reason.
|
|
2003
|
+
# Currently not refactoring as this API is actually supposed to be
|
|
2004
|
+
# deprecated soon.
|
|
1670
2005
|
node_statuses = cloud.query_status(
|
|
1671
2006
|
cluster_name_on_cloud,
|
|
1672
2007
|
tag_filter_for_cluster(cluster_name_on_cloud), region, zone)
|
|
2008
|
+
node_statuses = [(status, None) for status in node_statuses]
|
|
1673
2009
|
return node_statuses
|
|
1674
2010
|
|
|
1675
2011
|
|
|
2012
|
+
def _query_cluster_info_via_cloud_api(
|
|
2013
|
+
handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
|
|
2014
|
+
) -> provision_common.ClusterInfo:
|
|
2015
|
+
"""Returns the cluster info.
|
|
2016
|
+
|
|
2017
|
+
Raises:
|
|
2018
|
+
exceptions.NotSupportedError: the cloud does not support the new provisioner.
|
|
2019
|
+
exceptions.FetchClusterInfoError: the cluster info cannot be
|
|
2020
|
+
fetched from the cloud provider.
|
|
2021
|
+
"""
|
|
2022
|
+
cloud = handle.launched_resources.cloud
|
|
2023
|
+
assert cloud is not None, handle
|
|
2024
|
+
if cloud.STATUS_VERSION >= clouds.StatusVersion.SKYPILOT:
|
|
2025
|
+
try:
|
|
2026
|
+
cloud_name = repr(cloud)
|
|
2027
|
+
ray_config = global_user_state.get_cluster_yaml_dict(
|
|
2028
|
+
handle.cluster_yaml)
|
|
2029
|
+
provider_config = ray_config['provider']
|
|
2030
|
+
region = provider_config.get('region') or provider_config.get(
|
|
2031
|
+
'location')
|
|
2032
|
+
cluster_info = provision_lib.get_cluster_info(
|
|
2033
|
+
cloud_name, region, handle.cluster_name_on_cloud,
|
|
2034
|
+
provider_config)
|
|
2035
|
+
logger.debug(
|
|
2036
|
+
f'Querying {cloud_name} cluster '
|
|
2037
|
+
f'{handle.cluster_name_on_cloud} '
|
|
2038
|
+
f'head instance:\n{cluster_info.get_head_instance()}\n'
|
|
2039
|
+
f'worker instances:\n{cluster_info.get_worker_instances()}')
|
|
2040
|
+
return cluster_info
|
|
2041
|
+
except Exception as e: # pylint: disable=broad-except
|
|
2042
|
+
with ux_utils.print_exception_no_traceback():
|
|
2043
|
+
raise exceptions.FetchClusterInfoError(
|
|
2044
|
+
reason=exceptions.FetchClusterInfoError.Reason.UNKNOWN
|
|
2045
|
+
) from e
|
|
2046
|
+
else:
|
|
2047
|
+
raise exceptions.NotSupportedError(
|
|
2048
|
+
f'The cloud {cloud} does not support the SkyPilot provisioner.')
|
|
2049
|
+
|
|
2050
|
+
|
|
1676
2051
|
def check_can_clone_disk_and_override_task(
|
|
1677
2052
|
cluster_name: str, target_cluster_name: Optional[str], task: 'task_lib.Task'
|
|
1678
2053
|
) -> Tuple['task_lib.Task', 'cloud_vm_ray_backend.CloudVmRayResourceHandle']:
|
|
@@ -1720,12 +2095,12 @@ def check_can_clone_disk_and_override_task(
|
|
|
1720
2095
|
'a new target cluster name.')
|
|
1721
2096
|
|
|
1722
2097
|
new_task_resources = []
|
|
1723
|
-
|
|
2098
|
+
launched_resources = handle.launched_resources.assert_launchable()
|
|
2099
|
+
original_cloud = launched_resources.cloud
|
|
1724
2100
|
original_cloud.check_features_are_supported(
|
|
1725
|
-
|
|
2101
|
+
launched_resources,
|
|
1726
2102
|
{clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER})
|
|
1727
2103
|
|
|
1728
|
-
assert original_cloud is not None, handle.launched_resources
|
|
1729
2104
|
has_override = False
|
|
1730
2105
|
has_disk_size_met = False
|
|
1731
2106
|
has_cloud_met = False
|
|
@@ -1739,7 +2114,7 @@ def check_can_clone_disk_and_override_task(
|
|
|
1739
2114
|
continue
|
|
1740
2115
|
has_cloud_met = True
|
|
1741
2116
|
|
|
1742
|
-
override_param = {}
|
|
2117
|
+
override_param: Dict[str, Any] = {}
|
|
1743
2118
|
if task_resources.cloud is None:
|
|
1744
2119
|
override_param['cloud'] = original_cloud
|
|
1745
2120
|
if task_resources.region is None:
|
|
@@ -1786,7 +2161,12 @@ def check_can_clone_disk_and_override_task(
|
|
|
1786
2161
|
return task, handle
|
|
1787
2162
|
|
|
1788
2163
|
|
|
1789
|
-
def _update_cluster_status(
|
|
2164
|
+
def _update_cluster_status(
|
|
2165
|
+
cluster_name: str,
|
|
2166
|
+
record: Dict[str, Any],
|
|
2167
|
+
retry_if_missing: bool,
|
|
2168
|
+
include_user_info: bool = True,
|
|
2169
|
+
summary_response: bool = False) -> Optional[Dict[str, Any]]:
|
|
1790
2170
|
"""Update the cluster status.
|
|
1791
2171
|
|
|
1792
2172
|
The cluster status is updated by checking ray cluster and real status from
|
|
@@ -1813,13 +2193,16 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
1813
2193
|
fetched from the cloud provider or there are leaked nodes causing
|
|
1814
2194
|
the node number larger than expected.
|
|
1815
2195
|
"""
|
|
1816
|
-
record = global_user_state.get_cluster_from_name(cluster_name)
|
|
1817
|
-
if record is None:
|
|
1818
|
-
return None
|
|
1819
2196
|
handle = record['handle']
|
|
1820
2197
|
if handle.cluster_yaml is None:
|
|
1821
2198
|
# Remove cluster from db since this cluster does not have a config file
|
|
1822
2199
|
# or any other ongoing requests
|
|
2200
|
+
global_user_state.add_cluster_event(
|
|
2201
|
+
cluster_name,
|
|
2202
|
+
None,
|
|
2203
|
+
'Cluster has no YAML file. Removing the cluster from cache.',
|
|
2204
|
+
global_user_state.ClusterEventType.STATUS_CHANGE,
|
|
2205
|
+
nop_if_duplicate=True)
|
|
1823
2206
|
global_user_state.remove_cluster(cluster_name, terminate=True)
|
|
1824
2207
|
logger.debug(f'Cluster {cluster_name!r} has no YAML file. '
|
|
1825
2208
|
'Removing the cluster from cache.')
|
|
@@ -1828,10 +2211,11 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
1828
2211
|
return record
|
|
1829
2212
|
cluster_name = handle.cluster_name
|
|
1830
2213
|
|
|
1831
|
-
node_statuses = _query_cluster_status_via_cloud_api(
|
|
2214
|
+
node_statuses = _query_cluster_status_via_cloud_api(
|
|
2215
|
+
handle, retry_if_missing=retry_if_missing)
|
|
1832
2216
|
|
|
1833
|
-
all_nodes_up = (all(
|
|
1834
|
-
|
|
2217
|
+
all_nodes_up = (all(status[0] == status_lib.ClusterStatus.UP
|
|
2218
|
+
for status in node_statuses) and
|
|
1835
2219
|
len(node_statuses) == handle.launched_nodes)
|
|
1836
2220
|
|
|
1837
2221
|
def get_node_counts_from_ray_status(
|
|
@@ -1842,14 +2226,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
1842
2226
|
require_outputs=True,
|
|
1843
2227
|
separate_stderr=True)
|
|
1844
2228
|
if rc:
|
|
1845
|
-
raise
|
|
1846
|
-
|
|
1847
|
-
f'ray cluster\'s healthiness
|
|
1848
|
-
|
|
1849
|
-
f'
|
|
2229
|
+
raise exceptions.CommandError(
|
|
2230
|
+
rc, instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
|
|
2231
|
+
f'Failed to check ray cluster\'s healthiness.\n'
|
|
2232
|
+
'-- stdout --\n'
|
|
2233
|
+
f'{output}\n', stderr)
|
|
1850
2234
|
return (*_count_healthy_nodes_from_ray(output), output, stderr)
|
|
1851
2235
|
|
|
2236
|
+
ray_status_details: Optional[str] = None
|
|
2237
|
+
|
|
1852
2238
|
def run_ray_status_to_check_ray_cluster_healthy() -> bool:
|
|
2239
|
+
nonlocal ray_status_details
|
|
1853
2240
|
try:
|
|
1854
2241
|
# NOTE: fetching the IPs is very slow as it calls into
|
|
1855
2242
|
# `ray get head-ip/worker-ips`. Using cached IPs is safe because
|
|
@@ -1872,9 +2259,44 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
1872
2259
|
|
|
1873
2260
|
total_nodes = handle.launched_nodes * handle.num_ips_per_node
|
|
1874
2261
|
|
|
2262
|
+
cloud_name = repr(handle.launched_resources.cloud).lower()
|
|
1875
2263
|
for i in range(5):
|
|
1876
|
-
|
|
1877
|
-
|
|
2264
|
+
try:
|
|
2265
|
+
ready_head, ready_workers, output, stderr = (
|
|
2266
|
+
get_node_counts_from_ray_status(head_runner))
|
|
2267
|
+
except exceptions.CommandError as e:
|
|
2268
|
+
logger.debug(f'Refreshing status ({cluster_name!r}) attempt'
|
|
2269
|
+
f' {i}: {common_utils.format_exception(e)}')
|
|
2270
|
+
if cloud_name != 'kubernetes':
|
|
2271
|
+
# Non-k8s clusters can be manually restarted and:
|
|
2272
|
+
# 1. Get new IP addresses, or
|
|
2273
|
+
# 2. Not have the SkyPilot runtime setup
|
|
2274
|
+
#
|
|
2275
|
+
# So we should surface a message to the user to
|
|
2276
|
+
# help them recover from this inconsistent state.
|
|
2277
|
+
has_new_ip_addr = (
|
|
2278
|
+
e.detailed_reason is not None and
|
|
2279
|
+
_SSH_CONNECTION_TIMED_OUT_PATTERN.search(
|
|
2280
|
+
e.detailed_reason.strip()) is not None)
|
|
2281
|
+
runtime_not_setup = (_RAY_CLUSTER_NOT_FOUND_MESSAGE
|
|
2282
|
+
in e.error_msg)
|
|
2283
|
+
if has_new_ip_addr or runtime_not_setup:
|
|
2284
|
+
yellow = colorama.Fore.YELLOW
|
|
2285
|
+
bright = colorama.Style.BRIGHT
|
|
2286
|
+
reset = colorama.Style.RESET_ALL
|
|
2287
|
+
ux_utils.console_newline()
|
|
2288
|
+
logger.warning(
|
|
2289
|
+
f'{yellow}Failed getting cluster status despite all nodes '
|
|
2290
|
+
f'being up ({cluster_name!r}). '
|
|
2291
|
+
f'If the cluster was restarted manually, try running: '
|
|
2292
|
+
f'{reset}{bright}sky start {cluster_name}{reset} '
|
|
2293
|
+
f'{yellow}to recover from INIT status.{reset}')
|
|
2294
|
+
return False
|
|
2295
|
+
raise e
|
|
2296
|
+
# We retry for kubernetes because coreweave can have a
|
|
2297
|
+
# transient network issue.
|
|
2298
|
+
time.sleep(1)
|
|
2299
|
+
continue
|
|
1878
2300
|
if ready_head + ready_workers == total_nodes:
|
|
1879
2301
|
return True
|
|
1880
2302
|
logger.debug(f'Refreshing status ({cluster_name!r}) attempt '
|
|
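
The retry loop added above separates transient failures (worth retrying, e.g. on Kubernetes-backed clusters) from errors that suggest the cluster was manually restarted, in which case it warns the user and gives up. A schematic version of that classify-then-retry shape follows; the exception class and `looks_transient` predicate are made up for the example and are not the package's implementation.

```python
import time


class CommandError(Exception):
    """Stand-in for the backend's command-failure exception."""


def looks_transient(err: CommandError) -> bool:
    # Hypothetical classification: treat timeouts as retryable.
    return 'timeout' in str(err).lower()


def check_with_retries(run_check, attempts: int = 5) -> bool:
    for _ in range(attempts):
        try:
            if run_check():
                return True
        except CommandError as e:
            if not looks_transient(e):
                # Permanent-looking failure: surface it to the caller.
                raise
        # Transient or not-yet-ready: wait briefly and try again.
        time.sleep(1)
    return False
```
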
@@ -1892,19 +2314,25 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
1892
2314
|
# showing up
|
|
1893
2315
|
time.sleep(1)
|
|
1894
2316
|
|
|
2317
|
+
ray_status_details = (
|
|
2318
|
+
f'{ready_head + ready_workers}/{total_nodes} ready')
|
|
1895
2319
|
raise RuntimeError(
|
|
1896
2320
|
f'Refreshing status ({cluster_name!r}): ray status not showing '
|
|
1897
2321
|
f'all nodes ({ready_head + ready_workers}/'
|
|
1898
2322
|
f'{total_nodes});\noutput:\n{output}\nstderr:\n{stderr}')
|
|
1899
2323
|
|
|
1900
2324
|
except exceptions.FetchClusterInfoError:
|
|
2325
|
+
ray_status_details = 'failed to get IPs'
|
|
1901
2326
|
logger.debug(
|
|
1902
2327
|
f'Refreshing status ({cluster_name!r}) failed to get IPs.')
|
|
1903
2328
|
except RuntimeError as e:
|
|
2329
|
+
if ray_status_details is None:
|
|
2330
|
+
ray_status_details = str(e)
|
|
1904
2331
|
logger.debug(common_utils.format_exception(e))
|
|
1905
2332
|
except Exception as e: # pylint: disable=broad-except
|
|
1906
2333
|
# This can be raised by `external_ssh_ports()`, due to the
|
|
1907
2334
|
# underlying call to kubernetes API.
|
|
2335
|
+
ray_status_details = str(e)
|
|
1908
2336
|
logger.debug(f'Refreshing status ({cluster_name!r}) failed: ',
|
|
1909
2337
|
exc_info=e)
|
|
1910
2338
|
return False
|
|
@@ -1925,16 +2353,28 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
1925
2353
|
# run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
|
|
1926
2354
|
# head-ip/worker-ips`.
|
|
1927
2355
|
record['status'] = status_lib.ClusterStatus.UP
|
|
1928-1933 | - (removed lines not rendered in this diff view)
2356
|
+
# Add cluster event for instance status check.
|
|
2357
|
+
global_user_state.add_cluster_event(
|
|
2358
|
+
cluster_name,
|
|
2359
|
+
status_lib.ClusterStatus.UP,
|
|
2360
|
+
'All nodes up; SkyPilot runtime healthy.',
|
|
2361
|
+
global_user_state.ClusterEventType.STATUS_CHANGE,
|
|
2362
|
+
nop_if_duplicate=True)
|
|
2363
|
+
global_user_state.add_or_update_cluster(
|
|
2364
|
+
cluster_name,
|
|
2365
|
+
handle,
|
|
2366
|
+
requested_resources=None,
|
|
2367
|
+
ready=True,
|
|
2368
|
+
is_launch=False,
|
|
2369
|
+
existing_cluster_hash=record['cluster_hash'])
|
|
2370
|
+
return global_user_state.get_cluster_from_name(
|
|
2371
|
+
cluster_name,
|
|
2372
|
+
include_user_info=include_user_info,
|
|
2373
|
+
summary_response=summary_response)
|
|
1934
2374
|
|
|
1935
2375
|
# All cases below are transitioning the cluster to non-UP states.
|
|
1936
|
-
|
|
1937
|
-
if (not node_statuses and
|
|
2376
|
+
launched_resources = handle.launched_resources.assert_launchable()
|
|
2377
|
+
if (not node_statuses and launched_resources.cloud.STATUS_VERSION >=
|
|
1938
2378
|
clouds.StatusVersion.SKYPILOT):
|
|
1939
2379
|
# Note: launched_at is set during sky launch, even on an existing
|
|
1940
2380
|
# cluster. This will catch the case where the cluster was terminated on
|
|
@@ -1947,7 +2387,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
1947
2387
|
# and check again. This is a best-effort leak prevention check.
|
|
1948
2388
|
# See https://github.com/skypilot-org/skypilot/issues/4431.
|
|
1949
2389
|
time.sleep(_LAUNCH_DOUBLE_CHECK_DELAY)
|
|
1950
|
-
node_statuses = _query_cluster_status_via_cloud_api(
|
|
2390
|
+
node_statuses = _query_cluster_status_via_cloud_api(
|
|
2391
|
+
handle, retry_if_missing=False)
|
|
1951
2392
|
# Note: even if all the node_statuses are UP now, we will still
|
|
1952
2393
|
# consider this cluster abnormal, and its status will be INIT.
|
|
1953
2394
|
|
|
@@ -2002,85 +2443,168 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2002
2443
|
# * The cluster is partially or completely in the INIT state, which means
|
|
2003
2444
|
# that provisioning was interrupted. This is considered abnormal.
|
|
2004
2445
|
#
|
|
2005
|
-
# An abnormal cluster will transition to INIT and
|
|
2006
|
-
#
|
|
2007
|
-
|
|
2008
|
-
|
|
2446
|
+
# An abnormal cluster will transition to INIT, and one of the following will happen:
|
|
2447
|
+
# (1) If the SkyPilot provisioner is used AND the head node is alive, we
|
|
2448
|
+
# will not reset the autostop setting. Because autostop is handled by
|
|
2449
|
+
# the skylet through the cloud APIs, and will continue to function
|
|
2450
|
+
# regardless of the ray cluster's health.
|
|
2451
|
+
# (2) Otherwise, we will reset the autostop setting, unless the cluster is
|
|
2452
|
+
# autostopping/autodowning.
|
|
2453
|
+
some_nodes_terminated = 0 < len(node_statuses) < handle.launched_nodes
|
|
2454
|
+
# If all nodes are up and ray cluster is health, we would have returned
|
|
2455
|
+
# earlier. So if all_nodes_up is True and we are here, it means the ray
|
|
2456
|
+
# cluster must have been unhealthy.
|
|
2457
|
+
ray_cluster_unhealthy = all_nodes_up
|
|
2458
|
+
some_nodes_not_stopped = any(status[0] != status_lib.ClusterStatus.STOPPED
|
|
2459
|
+
for status in node_statuses)
|
|
2460
|
+
is_abnormal = (some_nodes_terminated or some_nodes_not_stopped)
|
|
2461
|
+
|
|
2009
2462
|
if is_abnormal:
|
|
2463
|
+
status_reason = ', '.join(
|
|
2464
|
+
[status[1] for status in node_statuses if status[1] is not None])
|
|
2465
|
+
|
|
2466
|
+
if some_nodes_terminated:
|
|
2467
|
+
init_reason = 'one or more nodes terminated'
|
|
2468
|
+
elif ray_cluster_unhealthy:
|
|
2469
|
+
init_reason = f'ray cluster is unhealthy ({ray_status_details})'
|
|
2470
|
+
elif some_nodes_not_stopped:
|
|
2471
|
+
init_reason = 'some but not all nodes are stopped'
|
|
2010
2472
|
logger.debug('The cluster is abnormal. Setting to INIT status. '
|
|
2011
2473
|
f'node_statuses: {node_statuses}')
|
|
2012-2015 | - (removed lines not rendered in this diff view)
2016
|
-
stream_logs=False):
|
|
2017
|
-
# Friendly hint.
|
|
2018
|
-
autostop = record['autostop']
|
|
2019
|
-
maybe_down_str = ' --down' if record['to_down'] else ''
|
|
2020
|
-
noun = 'autodown' if record['to_down'] else 'autostop'
|
|
2021
|
-
|
|
2022
|
-
# Reset the autostopping as the cluster is abnormal, and may
|
|
2023
|
-
# not correctly autostop. Resetting the autostop will let
|
|
2024
|
-
# the user know that the autostop may not happen to avoid
|
|
2025
|
-
# leakages from the assumption that the cluster will autostop.
|
|
2026
|
-
success = True
|
|
2027
|
-
reset_local_autostop = True
|
|
2474
|
+
if record['autostop'] >= 0:
|
|
2475
|
+
is_head_node_alive = False
|
|
2476
|
+
if launched_resources.cloud.PROVISIONER_VERSION >= clouds.ProvisionerVersion.SKYPILOT:
|
|
2477
|
+
# Check if the head node is alive
|
|
2028
2478
|
try:
|
|
2029-2046 | - (removed lines not rendered in this diff view)
2479
|
+
cluster_info = _query_cluster_info_via_cloud_api(handle)
|
|
2480
|
+
is_head_node_alive = cluster_info.get_head_instance(
|
|
2481
|
+
) is not None
|
|
2482
|
+
except Exception as e: # pylint: disable=broad-except
|
|
2483
|
+
logger.debug(
|
|
2484
|
+
f'Failed to get cluster info for {cluster_name!r}: '
|
|
2485
|
+
f'{common_utils.format_exception(e)}')
|
|
2486
|
+
|
|
2487
|
+
backend = get_backend_from_handle(handle)
|
|
2488
|
+
if isinstance(backend, backends.CloudVmRayBackend):
|
|
2489
|
+
if is_head_node_alive:
|
|
2490
|
+
logger.debug(
|
|
2491
|
+
f'Skipping autostop reset for cluster {cluster_name!r} '
|
|
2492
|
+
'because the head node is alive.')
|
|
2493
|
+
elif not backend.is_definitely_autostopping(handle,
|
|
2494
|
+
stream_logs=False):
|
|
2495
|
+
# Friendly hint.
|
|
2496
|
+
autostop = record['autostop']
|
|
2497
|
+
maybe_down_str = ' --down' if record['to_down'] else ''
|
|
2498
|
+
noun = 'autodown' if record['to_down'] else 'autostop'
|
|
2499
|
+
|
|
2500
|
+
# Reset the autostopping as the cluster is abnormal, and may
|
|
2501
|
+
# not correctly autostop. Resetting the autostop will let
|
|
2502
|
+
# the user know that the autostop may not happen to avoid
|
|
2503
|
+
# leakages from the assumption that the cluster will autostop.
|
|
2504
|
+
success = True
|
|
2505
|
+
reset_local_autostop = True
|
|
2506
|
+
try:
|
|
2507
|
+
backend.set_autostop(
|
|
2508
|
+
handle,
|
|
2509
|
+
-1,
|
|
2510
|
+
autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR,
|
|
2511
|
+
stream_logs=False)
|
|
2512
|
+
except (exceptions.CommandError,
|
|
2513
|
+
grpc.FutureTimeoutError) as e:
|
|
2514
|
+
success = False
|
|
2515
|
+
if isinstance(e, grpc.FutureTimeoutError) or (
|
|
2516
|
+
isinstance(e, exceptions.CommandError) and
|
|
2517
|
+
e.returncode == 255):
|
|
2518
|
+
word = 'autostopped' if noun == 'autostop' else 'autodowned'
|
|
2519
|
+
logger.debug(f'The cluster is likely {word}.')
|
|
2520
|
+
reset_local_autostop = False
|
|
2521
|
+
except (Exception, SystemExit) as e: # pylint: disable=broad-except
|
|
2522
|
+
success = False
|
|
2523
|
+
logger.debug(f'Failed to reset autostop. Due to '
|
|
2524
|
+
f'{common_utils.format_exception(e)}')
|
|
2525
|
+
if reset_local_autostop:
|
|
2526
|
+
global_user_state.set_cluster_autostop_value(
|
|
2527
|
+
handle.cluster_name, -1, to_down=False)
|
|
2528
|
+
|
|
2529
|
+
if success:
|
|
2530
|
+
operation_str = (f'Canceled {noun} on the cluster '
|
|
2531
|
+
f'{cluster_name!r}')
|
|
2532
|
+
else:
|
|
2533
|
+
operation_str = (
|
|
2534
|
+
f'Attempted to cancel {noun} on the '
|
|
2535
|
+
f'cluster {cluster_name!r} with best effort')
|
|
2536
|
+
yellow = colorama.Fore.YELLOW
|
|
2537
|
+
bright = colorama.Style.BRIGHT
|
|
2538
|
+
reset = colorama.Style.RESET_ALL
|
|
2539
|
+
ux_utils.console_newline()
|
|
2540
|
+
logger.warning(
|
|
2541
|
+
f'{yellow}{operation_str}, since it is found to be in an '
|
|
2542
|
+
f'abnormal state. To fix, try running: {reset}{bright}sky '
|
|
2543
|
+
f'start -f -i {autostop}{maybe_down_str} {cluster_name}'
|
|
2544
|
+
f'{reset}')
|
|
2047
2545
|
else:
|
|
2048-2053 | - (removed lines not rendered in this diff view)
2054
|
-
ux_utils.console_newline()
|
|
2055
|
-
logger.warning(
|
|
2056
|
-
f'{yellow}{operation_str}, since it is found to be in an '
|
|
2057
|
-
f'abnormal state. To fix, try running: {reset}{bright}sky '
|
|
2058
|
-
f'start -f -i {autostop}{maybe_down_str} {cluster_name}'
|
|
2059
|
-
f'{reset}')
|
|
2060
|
-
else:
|
|
2061
|
-
ux_utils.console_newline()
|
|
2062
|
-
operation_str = 'autodowning' if record[
|
|
2063
|
-
'to_down'] else 'autostopping'
|
|
2064
|
-
logger.info(
|
|
2065
|
-
f'Cluster {cluster_name!r} is {operation_str}. Setting to '
|
|
2066
|
-
'INIT status; try refresh again in a while.')
|
|
2546
|
+
ux_utils.console_newline()
|
|
2547
|
+
operation_str = 'autodowning' if record[
|
|
2548
|
+
'to_down'] else 'autostopping'
|
|
2549
|
+
logger.info(
|
|
2550
|
+
f'Cluster {cluster_name!r} is {operation_str}. Setting to '
|
|
2551
|
+
'INIT status; try refresh again in a while.')
|
|
2067
2552
|
|
|
2068
2553
|
# If the user starts part of a STOPPED cluster, we still need a status
|
|
2069
2554
|
# to represent the abnormal status. For spot cluster, it can also
|
|
2070
2555
|
# represent that the cluster is partially preempted.
|
|
2071
2556
|
# TODO(zhwu): the definition of INIT should be audited/changed.
|
|
2072
2557
|
# Adding a new status UNHEALTHY for abnormal status can be a choice.
|
|
2073-2078 | - (removed lines not rendered in this diff view)
2558
|
+
init_reason_regex = None
|
|
2559
|
+
if not status_reason:
|
|
2560
|
+
# If there is not a status reason, don't re-add (and overwrite) the
|
|
2561
|
+
# event if there is already an event with the same reason which may
|
|
2562
|
+
# have a status reason.
|
|
2563
|
+
# Some status reason clears after a certain time (e.g. k8s events
|
|
2564
|
+
# are only stored for an hour by default), so it is possible that
|
|
2565
|
+
# the previous event has a status reason, but now it does not.
|
|
2566
|
+
init_reason_regex = (f'^Cluster is abnormal because '
|
|
2567
|
+
f'{re.escape(init_reason)}.*')
|
|
2568
|
+
log_message = f'Cluster is abnormal because {init_reason}'
|
|
2569
|
+
if status_reason:
|
|
2570
|
+
log_message += f' ({status_reason})'
|
|
2571
|
+
log_message += '. Transitioned to INIT.'
|
|
2572
|
+
global_user_state.add_cluster_event(
|
|
2573
|
+
cluster_name,
|
|
2574
|
+
status_lib.ClusterStatus.INIT,
|
|
2575
|
+
log_message,
|
|
2576
|
+
global_user_state.ClusterEventType.STATUS_CHANGE,
|
|
2577
|
+
nop_if_duplicate=True,
|
|
2578
|
+
duplicate_regex=init_reason_regex)
|
|
2579
|
+
global_user_state.add_or_update_cluster(
|
|
2580
|
+
cluster_name,
|
|
2581
|
+
handle,
|
|
2582
|
+
requested_resources=None,
|
|
2583
|
+
ready=False,
|
|
2584
|
+
is_launch=False,
|
|
2585
|
+
existing_cluster_hash=record['cluster_hash'])
|
|
2586
|
+
return global_user_state.get_cluster_from_name(
|
|
2587
|
+
cluster_name,
|
|
2588
|
+
include_user_info=include_user_info,
|
|
2589
|
+
summary_response=summary_response)
|
|
2079
2590
|
# Now is_abnormal is False: either node_statuses is empty or all nodes are
|
|
2080
2591
|
# STOPPED.
|
|
2592
|
+
verb = 'terminated' if to_terminate else 'stopped'
|
|
2081
2593
|
backend = backends.CloudVmRayBackend()
|
|
2594
|
+
global_user_state.add_cluster_event(
|
|
2595
|
+
cluster_name,
|
|
2596
|
+
None,
|
|
2597
|
+
f'All nodes {verb}, cleaning up the cluster.',
|
|
2598
|
+
global_user_state.ClusterEventType.STATUS_CHANGE,
|
|
2599
|
+
# This won't do anything for a terminated cluster, but it's needed for a
|
|
2600
|
+
# stopped cluster.
|
|
2601
|
+
nop_if_duplicate=True,
|
|
2602
|
+
)
|
|
2082
2603
|
backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
|
|
2083
|
-
return global_user_state.get_cluster_from_name(
|
|
2604
|
+
return global_user_state.get_cluster_from_name(
|
|
2605
|
+
cluster_name,
|
|
2606
|
+
include_user_info=include_user_info,
|
|
2607
|
+
summary_response=summary_response)
|
|
2084
2608
|
|
|
2085
2609
|
|
|
2086
2610
|
def _must_refresh_cluster_status(
|
|
@@ -2102,12 +2626,14 @@ def _must_refresh_cluster_status(
|
|
|
2102
2626
|
|
|
2103
2627
|
|
|
2104
2628
|
def refresh_cluster_record(
|
|
2105-2110 | - (removed lines not rendered in this diff view)
2629
|
+
cluster_name: str,
|
|
2630
|
+
*,
|
|
2631
|
+
force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
|
|
2632
|
+
cluster_lock_already_held: bool = False,
|
|
2633
|
+
cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
|
|
2634
|
+
include_user_info: bool = True,
|
|
2635
|
+
summary_response: bool = False,
|
|
2636
|
+
retry_if_missing: bool = True) -> Optional[Dict[str, Any]]:
|
|
2111
2637
|
"""Refresh the cluster, and return the possibly updated record.
|
|
2112
2638
|
|
|
2113
2639
|
The function will update the cached cluster status in the global state. For
|
|
@@ -2124,14 +2650,20 @@ def refresh_cluster_record(
|
|
|
2124
2650
|
_CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
|
|
2125
2651
|
1. the cluster is a spot cluster, or
|
|
2126
2652
|
2. cluster autostop is set and the cluster is not STOPPED.
|
|
2127-2129 | - (removed lines not rendered in this diff view)
2653
|
+
cluster_lock_already_held: Whether the caller is already holding the
|
|
2654
|
+
per-cluster lock. You MUST NOT set this to True if the caller does not
|
|
2655
|
+
already hold the lock. If True, we will not acquire the lock before
|
|
2656
|
+
updating the status. Failing to hold the lock while updating the
|
|
2657
|
+
status can lead to correctness issues - e.g. an launch in-progress may
|
|
2658
|
+
appear to be DOWN incorrectly. Even if this is set to False, the lock
|
|
2659
|
+
may not be acquired if the status does not need to be refreshed.
|
|
2130
2660
|
cluster_status_lock_timeout: The timeout to acquire the per-cluster
|
|
2131
2661
|
lock. If timeout, the function will use the cached status. If the
|
|
2132
2662
|
value is <0, do not timeout (wait for the lock indefinitely). By
|
|
2133
2663
|
default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS. Warning:
|
|
2134
2664
|
if correctness is required, you must set this to -1.
|
|
2665
|
+
retry_if_missing: Whether to retry the call to the cloud api if the
|
|
2666
|
+
cluster is not found when querying the live status on the cloud.
|
|
2135
2667
|
|
|
2136
2668
|
Returns:
|
|
2137
2669
|
If the cluster is terminated or does not exist, return None.
|
|
@@ -2147,69 +2679,95 @@ def refresh_cluster_record(
|
|
|
2147
2679
|
the node number larger than expected.
|
|
2148
2680
|
"""
|
|
2149
2681
|
|
|
2150
|
-
|
|
2682
|
+
ctx = context_lib.get()
|
|
2683
|
+
record = global_user_state.get_cluster_from_name(
|
|
2684
|
+
cluster_name,
|
|
2685
|
+
include_user_info=include_user_info,
|
|
2686
|
+
summary_response=summary_response)
|
|
2151
2687
|
if record is None:
|
|
2152
2688
|
return None
|
|
2153-2203 | - (removed lines not rendered in this diff view)
|
|
2689
|
+
# TODO(zhwu, 05/20): switch to the specific workspace to make sure we are
|
|
2690
|
+
# using the correct cloud credentials.
|
|
2691
|
+
workspace = record.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
|
|
2692
|
+
with skypilot_config.local_active_workspace_ctx(workspace):
|
|
2693
|
+
# check_owner_identity returns if the record handle is
|
|
2694
|
+
# not a CloudVmRayResourceHandle
|
|
2695
|
+
_check_owner_identity_with_record(cluster_name, record)
|
|
2696
|
+
|
|
2697
|
+
# The loop logic allows us to notice if the status was updated in the
|
|
2698
|
+
# global_user_state by another process and stop trying to get the lock.
|
|
2699
|
+
lock = locks.get_lock(cluster_status_lock_id(cluster_name))
|
|
2700
|
+
start_time = time.perf_counter()
|
|
2701
|
+
|
|
2702
|
+
# Loop until we have an up-to-date status or until we acquire the lock.
|
|
2703
|
+
while True:
|
|
2704
|
+
# Check if the context is canceled.
|
|
2705
|
+
if ctx is not None and ctx.is_canceled():
|
|
2706
|
+
raise asyncio.CancelledError()
|
|
2707
|
+
# Check to see if we can return the cached status.
|
|
2708
|
+
if not _must_refresh_cluster_status(record, force_refresh_statuses):
|
|
2709
|
+
return record
|
|
2710
|
+
|
|
2711
|
+
if cluster_lock_already_held:
|
|
2712
|
+
return _update_cluster_status(cluster_name, record,
|
|
2713
|
+
retry_if_missing,
|
|
2714
|
+
include_user_info,
|
|
2715
|
+
summary_response)
|
|
2716
|
+
|
|
2717
|
+
# Try to acquire the lock so we can fetch the status.
|
|
2718
|
+
try:
|
|
2719
|
+
with lock.acquire(blocking=False):
|
|
2720
|
+
# Check the cluster status again, since it could have been
|
|
2721
|
+
# updated between our last check and acquiring the lock.
|
|
2722
|
+
record = global_user_state.get_cluster_from_name(
|
|
2723
|
+
cluster_name,
|
|
2724
|
+
include_user_info=include_user_info,
|
|
2725
|
+
summary_response=summary_response)
|
|
2726
|
+
if record is None or not _must_refresh_cluster_status(
|
|
2727
|
+
record, force_refresh_statuses):
|
|
2728
|
+
return record
|
|
2729
|
+
# Update and return the cluster status.
|
|
2730
|
+
return _update_cluster_status(cluster_name, record,
|
|
2731
|
+
retry_if_missing,
|
|
2732
|
+
include_user_info,
|
|
2733
|
+
summary_response)
|
|
2734
|
+
|
|
2735
|
+
except locks.LockTimeout:
|
|
2736
|
+
# lock.acquire() will throw a Timeout exception if the lock is not
|
|
2737
|
+
# available and we have blocking=False.
|
|
2738
|
+
pass
|
|
2739
|
+
|
|
2740
|
+
# Logic adapted from FileLock.acquire().
|
|
2741
|
+
# If cluster_status_lock_time is <0, we will never hit this. No timeout.
|
|
2742
|
+
# Otherwise, if we have timed out, return the cached status. This has
|
|
2743
|
+
# the potential to cause correctness issues, but if so it is the
|
|
2744
|
+
# caller's responsibility to set the timeout to -1.
|
|
2745
|
+
if 0 <= cluster_status_lock_timeout < time.perf_counter(
|
|
2746
|
+
) - start_time:
|
|
2747
|
+
logger.debug(
|
|
2748
|
+
'Refreshing status: Failed get the lock for cluster '
|
|
2749
|
+
f'{cluster_name!r}. Using the cached status.')
|
|
2750
|
+
return record
|
|
2751
|
+
time.sleep(lock.poll_interval)
|
|
2752
|
+
|
|
2753
|
+
# Refresh for next loop iteration.
|
|
2754
|
+
record = global_user_state.get_cluster_from_name(
|
|
2755
|
+
cluster_name,
|
|
2756
|
+
include_user_info=include_user_info,
|
|
2757
|
+
summary_response=summary_response)
|
|
2758
|
+
if record is None:
|
|
2759
|
+
return None
|
|
2204
2760
|
|
|
2205
2761
|
|
|
2206
2762
|
@timeline.event
|
|
2763
|
+
@context_utils.cancellation_guard
|
|
2207
2764
|
def refresh_cluster_status_handle(
|
|
2208
2765
|
cluster_name: str,
|
|
2209
2766
|
*,
|
|
2210
2767
|
force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
|
|
2211
|
-
|
|
2212
|
-
cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
|
|
2768
|
+
cluster_lock_already_held: bool = False,
|
|
2769
|
+
cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
|
|
2770
|
+
retry_if_missing: bool = True,
|
|
2213
2771
|
) -> Tuple[Optional[status_lib.ClusterStatus],
|
|
2214
2772
|
Optional[backends.ResourceHandle]]:
|
|
2215
2773
|
"""Refresh the cluster, and return the possibly updated status and handle.
|
|
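
The rewritten `refresh_cluster_record` above loops: return the cached record if it is still fresh, otherwise try a non-blocking lock, re-check freshness under the lock, and fall back to the cached record once the timeout elapses. Below is a compact sketch of that pattern using a plain `threading.Lock` in place of the package's per-cluster lock helper; `load_record`, `is_fresh`, and `do_refresh` are hypothetical callables.

```python
import threading
import time
from typing import Callable, Optional


def refresh_with_lock(load_record: Callable[[], Optional[dict]],
                      is_fresh: Callable[[dict], bool],
                      do_refresh: Callable[[dict], dict],
                      lock: threading.Lock,
                      timeout: float = 5.0,
                      poll_interval: float = 0.1) -> Optional[dict]:
    start = time.perf_counter()
    record = load_record()
    while record is not None:
        if is_fresh(record):
            return record            # Cached status is good enough.
        if lock.acquire(blocking=False):
            try:
                record = load_record()        # Re-check under the lock.
                if record is None or is_fresh(record):
                    return record
                return do_refresh(record)     # We own the refresh.
            finally:
                lock.release()
        if time.perf_counter() - start >= timeout:
            return record            # Give up and use the cached record.
        time.sleep(poll_interval)
        record = load_record()       # Another process may have refreshed it.
    return None
```
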
@@ -2221,8 +2779,11 @@ def refresh_cluster_status_handle(
|
|
|
2221
2779
|
record = refresh_cluster_record(
|
|
2222
2780
|
cluster_name,
|
|
2223
2781
|
force_refresh_statuses=force_refresh_statuses,
|
|
2224
|
-
|
|
2225
|
-
cluster_status_lock_timeout=cluster_status_lock_timeout
|
|
2782
|
+
cluster_lock_already_held=cluster_lock_already_held,
|
|
2783
|
+
cluster_status_lock_timeout=cluster_status_lock_timeout,
|
|
2784
|
+
include_user_info=False,
|
|
2785
|
+
summary_response=True,
|
|
2786
|
+
retry_if_missing=retry_if_missing)
|
|
2226
2787
|
if record is None:
|
|
2227
2788
|
return None, None
|
|
2228
2789
|
return record['status'], record['handle']
|
|
@@ -2253,6 +2814,7 @@ def check_cluster_available(
|
|
|
2253
2814
|
...
|
|
2254
2815
|
|
|
2255
2816
|
|
|
2817
|
+
@context_utils.cancellation_guard
|
|
2256
2818
|
def check_cluster_available(
|
|
2257
2819
|
cluster_name: str,
|
|
2258
2820
|
*,
|
|
@@ -2272,7 +2834,9 @@ def check_cluster_available(
|
|
|
2272
2834
|
exceptions.CloudUserIdentityError: if we fail to get the current user
|
|
2273
2835
|
identity.
|
|
2274
2836
|
"""
|
|
2275
|
-
record = global_user_state.get_cluster_from_name(cluster_name
|
|
2837
|
+
record = global_user_state.get_cluster_from_name(cluster_name,
|
|
2838
|
+
include_user_info=False,
|
|
2839
|
+
summary_response=True)
|
|
2276
2840
|
if dryrun:
|
|
2277
2841
|
assert record is not None, cluster_name
|
|
2278
2842
|
return record['handle']
|
|
@@ -2404,6 +2968,19 @@ def is_controller_accessible(
|
|
|
2404
2968
|
exceptions.ClusterNotUpError: if the controller is not accessible, or
|
|
2405
2969
|
failed to be connected.
|
|
2406
2970
|
"""
|
|
2971
|
+
if (managed_job_utils.is_consolidation_mode() and
|
|
2972
|
+
controller == controller_utils.Controllers.JOBS_CONTROLLER
|
|
2973
|
+
) or (serve_utils.is_consolidation_mode() and
|
|
2974
|
+
controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER):
|
|
2975
|
+
cn = 'local-controller-consolidation'
|
|
2976
|
+
return backends.LocalResourcesHandle(
|
|
2977
|
+
cluster_name=cn,
|
|
2978
|
+
cluster_name_on_cloud=cn,
|
|
2979
|
+
cluster_yaml=None,
|
|
2980
|
+
launched_nodes=1,
|
|
2981
|
+
launched_resources=sky.Resources(cloud=clouds.Cloud(),
|
|
2982
|
+
instance_type=cn),
|
|
2983
|
+
)
|
|
2407
2984
|
if non_existent_message is None:
|
|
2408
2985
|
non_existent_message = controller.value.default_hint_if_non_existent
|
|
2409
2986
|
cluster_name = controller.value.cluster_name
|
|
@@ -2446,7 +3023,8 @@ def is_controller_accessible(
|
|
|
2446
3023
|
f'fatal, but {controller_name} commands/calls may hang or return '
|
|
2447
3024
|
'stale information, when the controller is not up.\n'
|
|
2448
3025
|
f' Details: {common_utils.format_exception(e, use_bracket=True)}')
|
|
2449
|
-
record = global_user_state.get_cluster_from_name(
|
|
3026
|
+
record = global_user_state.get_cluster_from_name(
|
|
3027
|
+
cluster_name, include_user_info=False, summary_response=True)
|
|
2450
3028
|
if record is not None:
|
|
2451
3029
|
controller_status, handle = record['status'], record['handle']
|
|
2452
3030
|
# We check the connection even if the cluster has a cached status UP
|
|
@@ -2467,7 +3045,7 @@ def is_controller_accessible(
|
|
|
2467
3045
|
need_connection_check):
|
|
2468
3046
|
# Check ssh connection if (1) controller is in INIT state, or (2) we failed to fetch the
|
|
2469
3047
|
# status, both of which can happen when controller's status lock is held by another `sky jobs launch` or
|
|
2470
|
-
# `sky serve up`. If we have
|
|
3048
|
+
# `sky serve up`. If we have controller's head_ip available and it is ssh-reachable,
|
|
2471
3049
|
# we can allow access to the controller.
|
|
2472
3050
|
ssh_credentials = ssh_credential_from_yaml(handle.cluster_yaml,
|
|
2473
3051
|
handle.docker_user,
|
|
@@ -2503,21 +3081,100 @@ class CloudFilter(enum.Enum):
|
|
|
2503
3081
|
LOCAL = 'local'
|
|
2504
3082
|
|
|
2505
3083
|
|
|
2506
|
-
def _get_glob_clusters(
|
|
3084
|
+
def _get_glob_clusters(
|
|
3085
|
+
clusters: List[str],
|
|
3086
|
+
silent: bool = False,
|
|
3087
|
+
workspaces_filter: Optional[Dict[str, Any]] = None) -> List[str]:
|
|
2507
3088
|
"""Returns a list of clusters that match the glob pattern."""
|
|
2508
3089
|
glob_clusters = []
|
|
2509
3090
|
for cluster in clusters:
|
|
2510
|
-
glob_cluster = global_user_state.get_glob_cluster_names(
|
|
3091
|
+
glob_cluster = global_user_state.get_glob_cluster_names(
|
|
3092
|
+
cluster, workspaces_filter=workspaces_filter)
|
|
2511
3093
|
if len(glob_cluster) == 0 and not silent:
|
|
2512
3094
|
logger.info(f'Cluster {cluster} not found.')
|
|
2513
3095
|
glob_clusters.extend(glob_cluster)
|
|
2514
3096
|
return list(set(glob_clusters))
|
|
2515
3097
|
|
|
2516
3098
|
|
|
3099
|
+
def _refresh_cluster(
|
|
3100
|
+
cluster_name: str,
|
|
3101
|
+
force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]],
|
|
3102
|
+
include_user_info: bool = True,
|
|
3103
|
+
summary_response: bool = False) -> Optional[Dict[str, Any]]:
|
|
3104
|
+
try:
|
|
3105
|
+
record = refresh_cluster_record(
|
|
3106
|
+
cluster_name,
|
|
3107
|
+
force_refresh_statuses=force_refresh_statuses,
|
|
3108
|
+
cluster_lock_already_held=False,
|
|
3109
|
+
include_user_info=include_user_info,
|
|
3110
|
+
summary_response=summary_response)
|
|
3111
|
+
except (exceptions.ClusterStatusFetchingError,
|
|
3112
|
+
exceptions.CloudUserIdentityError,
|
|
3113
|
+
exceptions.ClusterOwnerIdentityMismatchError) as e:
|
|
3114
|
+
# Do not fail the entire refresh process. The caller will
|
|
3115
|
+
# handle the 'UNKNOWN' status, and collect the errors into
|
|
3116
|
+
# a table.
|
|
3117
|
+
record = {'status': 'UNKNOWN', 'error': e}
|
|
3118
|
+
return record
|
|
3119
|
+
|
|
3120
|
+
|
|
3121
|
+
def refresh_cluster_records() -> None:
|
|
3122
|
+
"""Refreshes the status of all clusters, except managed clusters.
|
|
3123
|
+
|
|
3124
|
+
Used by the background status refresh daemon.
|
|
3125
|
+
This function is a stripped-down version of get_clusters, with only the
|
|
3126
|
+
bare bones refresh logic.
|
|
3127
|
+
|
|
3128
|
+
Returns:
|
|
3129
|
+
None
|
|
3130
|
+
|
|
3131
|
+
Raises:
|
|
3132
|
+
None
|
|
3133
|
+
"""
|
|
3134
|
+
exclude_managed_clusters = True
|
|
3135
|
+
if env_options.Options.SHOW_DEBUG_INFO.get():
|
|
3136
|
+
exclude_managed_clusters = False
|
|
3137
|
+
cluster_names = set(
|
|
3138
|
+
global_user_state.get_cluster_names(
|
|
3139
|
+
exclude_managed_clusters=exclude_managed_clusters,))
|
|
3140
|
+
|
|
3141
|
+
# TODO(syang): we should try not to leak
|
|
3142
|
+
# request info in backend_utils.py.
|
|
3143
|
+
# Refactor this to use some other info to
|
|
3144
|
+
# determine if a launch is in progress.
|
|
3145
|
+
cluster_names_with_launch_request = {
|
|
3146
|
+
request.cluster_name for request in requests_lib.get_request_tasks(
|
|
3147
|
+
req_filter=requests_lib.RequestTaskFilter(
|
|
3148
|
+
status=[requests_lib.RequestStatus.RUNNING],
|
|
3149
|
+
include_request_names=['sky.launch'],
|
|
3150
|
+
fields=['cluster_name']))
|
|
3151
|
+
}
|
|
3152
|
+
cluster_names_without_launch_request = (cluster_names -
|
|
3153
|
+
cluster_names_with_launch_request)
|
|
3154
|
+
|
|
3155
|
+
def _refresh_cluster_record(cluster_name):
|
|
3156
|
+
return _refresh_cluster(cluster_name,
|
|
3157
|
+
force_refresh_statuses=set(
|
|
3158
|
+
status_lib.ClusterStatus),
|
|
3159
|
+
include_user_info=False,
|
|
3160
|
+
summary_response=True)
|
|
3161
|
+
|
|
3162
|
+
if len(cluster_names_without_launch_request) > 0:
|
|
3163
|
+
# Do not refresh the clusters that have an active launch request.
|
|
3164
|
+
subprocess_utils.run_in_parallel(_refresh_cluster_record,
|
|
3165
|
+
cluster_names_without_launch_request)
|
|
3166
|
+
|
|
3167
|
+
|
|
2517
3168
|
def get_clusters(
|
|
2518
3169
|
refresh: common.StatusRefreshMode,
|
|
2519
3170
|
cluster_names: Optional[Union[str, List[str]]] = None,
|
|
2520
3171
|
all_users: bool = True,
|
|
3172
|
+
include_credentials: bool = False,
|
|
3173
|
+
summary_response: bool = False,
|
|
3174
|
+
include_handle: bool = True,
|
|
3175
|
+
# Internal only:
|
|
3176
|
+
# pylint: disable=invalid-name
|
|
3177
|
+
_include_is_managed: bool = False,
|
|
2521
3178
|
) -> List[Dict[str, Any]]:
|
|
2522
3179
|
"""Returns a list of cached or optionally refreshed cluster records.
|
|
2523
3180
|
|
|
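
`refresh_cluster_records` above refreshes every non-managed cluster in parallel, but first subtracts the clusters that currently have a running `sky.launch` request so it never races an in-flight launch. A rough equivalent using a thread pool in place of `subprocess_utils.run_in_parallel`; the two input sets and `refresh_one` are assumptions for the example.

```python
from concurrent.futures import ThreadPoolExecutor
from typing import Set


def refresh_one(name: str) -> str:
    """Hypothetical per-cluster refresh; returns the cluster name."""
    return name


def refresh_all(all_clusters: Set[str], launching: Set[str]) -> None:
    # Skip clusters with an active launch: refreshing them could race.
    to_refresh = sorted(all_clusters - launching)
    if not to_refresh:
        return
    with ThreadPoolExecutor(max_workers=8) as pool:
        # list() drains the iterator so all refreshes finish before exit.
        list(pool.map(refresh_one, to_refresh))


if __name__ == '__main__':
    refresh_all({'dev', 'train', 'eval'}, launching={'train'})
```
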
@@ -2527,114 +3184,159 @@ def get_clusters(
|
|
|
2527
3184
|
of the clusters.
|
|
2528
3185
|
|
|
2529
3186
|
Args:
|
|
2530
|
-
include_controller: Whether to include controllers, e.g. jobs controller
|
|
2531
|
-
or sky serve controller.
|
|
2532
3187
|
refresh: Whether to refresh the status of the clusters. (Refreshing will
|
|
2533
3188
|
set the status to STOPPED if the cluster cannot be pinged.)
|
|
2534
|
-
cloud_filter: Sets which clouds to filer through from the global user
|
|
2535
|
-
state. Supports three values, 'all' for all clouds, 'public' for
|
|
2536
|
-
public clouds only, and 'local' for only local clouds.
|
|
2537
3189
|
cluster_names: If provided, only return records for the given cluster
|
|
2538
3190
|
names.
|
|
3191
|
+
all_users: If True, return clusters from all users. If False, only
|
|
3192
|
+
return clusters from the current user.
|
|
3193
|
+
include_credentials: If True, include cluster ssh credentials in the
|
|
3194
|
+
return value.
|
|
3195
|
+
_include_is_managed: Whether to force include clusters created by the
|
|
3196
|
+
controller.
|
|
2539
3197
|
|
|
2540
3198
|
Returns:
|
|
2541
3199
|
A list of cluster records. If the cluster does not exist or has been
|
|
2542
3200
|
terminated, the record will be omitted from the returned list.
|
|
2543
3201
|
"""
|
|
2544
|
-
|
|
3202
|
+
accessible_workspaces = workspaces_core.get_workspaces()
|
|
3203
|
+
if cluster_names is not None:
|
|
3204
|
+
if isinstance(cluster_names, str):
|
|
3205
|
+
cluster_names = [cluster_names]
|
|
3206
|
+
non_glob_cluster_names = []
|
|
3207
|
+
glob_cluster_names = []
|
|
3208
|
+
for cluster_name in cluster_names:
|
|
3209
|
+
if ux_utils.is_glob_pattern(cluster_name):
|
|
3210
|
+
glob_cluster_names.append(cluster_name)
|
|
3211
|
+
else:
|
|
3212
|
+
non_glob_cluster_names.append(cluster_name)
|
|
3213
|
+
cluster_names = non_glob_cluster_names
|
|
3214
|
+
if glob_cluster_names:
|
|
3215
|
+
cluster_names += _get_glob_clusters(
|
|
3216
|
+
glob_cluster_names,
|
|
3217
|
+
silent=True,
|
|
3218
|
+
workspaces_filter=accessible_workspaces)
|
|
3219
|
+
|
|
3220
|
+
exclude_managed_clusters = False
|
|
3221
|
+
if not (_include_is_managed or env_options.Options.SHOW_DEBUG_INFO.get()):
|
|
3222
|
+
exclude_managed_clusters = True
|
|
3223
|
+
user_hashes_filter = None
|
|
2545
3224
|
if not all_users:
|
|
2546
|
-
|
|
2547
|
-
|
|
2548
|
-
|
|
2549
|
-
|
|
2550
|
-
|
|
3225
|
+
user_hashes_filter = {common_utils.get_current_user().id}
|
|
3226
|
+
records = global_user_state.get_clusters(
|
|
3227
|
+
exclude_managed_clusters=exclude_managed_clusters,
|
|
3228
|
+
user_hashes_filter=user_hashes_filter,
|
|
3229
|
+
workspaces_filter=accessible_workspaces,
|
|
3230
|
+
cluster_names=cluster_names,
|
|
3231
|
+
summary_response=summary_response)
|
|
2551
3232
|
|
|
2552
3233
|
yellow = colorama.Fore.YELLOW
|
|
2553
3234
|
bright = colorama.Style.BRIGHT
|
|
2554
3235
|
reset = colorama.Style.RESET_ALL
|
|
2555
3236
|
|
|
2556
|
-
|
|
2557
|
-
|
|
3237
|
+
if cluster_names is not None:
|
|
3238
|
+
record_names = {record['name'] for record in records}
|
|
3239
|
+
not_found_clusters = ux_utils.get_non_matched_query(
|
|
3240
|
+
cluster_names, record_names)
|
|
3241
|
+
if not_found_clusters:
|
|
3242
|
+
clusters_str = ', '.join(not_found_clusters)
|
|
3243
|
+
logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
|
|
3244
|
+
|
|
3245
|
+
def _get_records_with_handle(
|
|
3246
|
+
records: List[Optional[Dict[str, Any]]]) -> List[Dict[str, Any]]:
|
|
3247
|
+
"""Filter for records that have a handle"""
|
|
3248
|
+
return [
|
|
3249
|
+
record for record in records
|
|
3250
|
+
if record is not None and record['handle'] is not None
|
|
3251
|
+
]
|
|
3252
|
+
|
|
3253
|
+
def _update_records_with_handle_info(
|
|
3254
|
+
records: List[Optional[Dict[str, Any]]]) -> None:
|
|
3255
|
+
"""Add resource str to record"""
|
|
3256
|
+
for record in _get_records_with_handle(records):
|
|
3257
|
+
handle = record['handle']
|
|
3258
|
+
resource_str_simple, resource_str_full = (
|
|
3259
|
+
resources_utils.get_readable_resources_repr(
|
|
3260
|
+
handle, simplified_only=False))
|
|
3261
|
+
record['resources_str'] = resource_str_simple
|
|
3262
|
+
record['resources_str_full'] = resource_str_full
|
|
3263
|
+
if not summary_response:
|
|
3264
|
+
record['cluster_name_on_cloud'] = handle.cluster_name_on_cloud
|
|
3265
|
+
|
|
3266
|
+
def _update_records_with_credentials(
|
|
3267
|
+
records: List[Optional[Dict[str, Any]]]) -> None:
|
|
2558
3268
|
"""Add the credentials to the record.
|
|
2559
3269
|
|
|
2560
3270
|
This is useful for the client side to setup the ssh config of the
|
|
2561
3271
|
cluster.
|
|
2562
3272
|
"""
|
|
2563
|
-
|
|
2564
|
-
|
|
2565
|
-
handle = record['handle']
|
|
2566
|
-
if handle is None:
|
|
3273
|
+
records_with_handle = _get_records_with_handle(records)
|
|
3274
|
+
if len(records_with_handle) == 0:
|
|
2567
3275
|
return
|
|
2568
|
-
record['resources_str'] = resources_utils.get_readable_resources_repr(
|
|
2569
|
-
handle)
|
|
2570
|
-
credentials = ssh_credential_from_yaml(handle.cluster_yaml,
|
|
2571
|
-
handle.docker_user,
|
|
2572
|
-
handle.ssh_user)
|
|
2573
|
-
|
|
2574
|
-
if not credentials:
|
|
2575
|
-
return
|
|
2576
|
-
ssh_private_key_path = credentials.get('ssh_private_key', None)
|
|
2577
|
-
if ssh_private_key_path is not None:
|
|
2578
|
-
with open(os.path.expanduser(ssh_private_key_path),
|
|
2579
|
-
'r',
|
|
2580
|
-
encoding='utf-8') as f:
|
|
2581
|
-
credentials['ssh_private_key_content'] = f.read()
|
|
2582
|
-
else:
|
|
2583
|
-
private_key_path, _ = auth.get_or_generate_keys()
|
|
2584
|
-
with open(os.path.expanduser(private_key_path),
|
|
2585
|
-
'r',
|
|
2586
|
-
encoding='utf-8') as f:
|
|
2587
|
-
credentials['ssh_private_key_content'] = f.read()
|
|
2588
|
-
record['credentials'] = credentials
|
|
2589
3276
|
|
|
2590
|
-
|
|
2591
|
-
|
|
2592
|
-
|
|
2593
|
-
|
|
2594
|
-
|
|
2595
|
-
|
|
2596
|
-
|
|
2597
|
-
|
|
2598
|
-
|
|
2599
|
-
|
|
2600
|
-
|
|
3277
|
+
handles = [record['handle'] for record in records_with_handle]
|
|
3278
|
+
credentials = ssh_credentials_from_handles(handles)
|
|
3279
|
+
cached_private_keys: Dict[str, str] = {}
|
|
3280
|
+
for record, credential in zip(records_with_handle, credentials):
|
|
3281
|
+
if not credential:
|
|
3282
|
+
continue
|
|
3283
|
+
ssh_private_key_path = credential.get('ssh_private_key', None)
|
|
3284
|
+
if ssh_private_key_path is not None:
|
|
3285
|
+
expanded_private_key_path = os.path.expanduser(
|
|
3286
|
+
ssh_private_key_path)
|
|
3287
|
+
if not os.path.exists(expanded_private_key_path):
|
|
3288
|
+
success = auth_utils.create_ssh_key_files_from_db(
|
|
3289
|
+
ssh_private_key_path)
|
|
3290
|
+
if not success:
|
|
3291
|
+
# If the ssh key files are not found, we do not
|
|
3292
|
+
# update the record with credentials.
|
|
3293
|
+
logger.debug(
|
|
3294
|
+
f'SSH keys not found for cluster {record["name"]} '
|
|
3295
|
+
f'at key path {ssh_private_key_path}')
|
|
3296
|
+
continue
|
|
2601
3297
|
else:
|
|
2602
|
-
|
|
2603
|
-
|
|
2604
|
-
|
|
2605
|
-
|
|
2606
|
-
|
|
2607
|
-
|
|
2608
|
-
|
|
3298
|
+
private_key_path, _ = auth_utils.get_or_generate_keys()
|
|
3299
|
+
expanded_private_key_path = os.path.expanduser(private_key_path)
|
|
3300
|
+
if expanded_private_key_path in cached_private_keys:
|
|
3301
|
+
credential['ssh_private_key_content'] = cached_private_keys[
|
|
3302
|
+
expanded_private_key_path]
|
|
3303
|
+
else:
|
|
3304
|
+
with open(expanded_private_key_path, 'r',
|
|
3305
|
+
encoding='utf-8') as f:
|
|
3306
|
+
credential['ssh_private_key_content'] = f.read()
|
|
3307
|
+
cached_private_keys[expanded_private_key_path] = credential[
|
|
3308
|
+
'ssh_private_key_content']
|
|
3309
|
+
record['credentials'] = credential
|
|
3310
|
+
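
The rewritten credentials pass above batches key handling: records are filtered once for a handle, SSH credentials are fetched for all handles together, and each private key file is read at most once via a path-keyed cache. A minimal sketch of that caching pattern, assuming plain-dict records and credential dicts (the real code goes through `ssh_credentials_from_handles` and `auth_utils`):

```python
"""Minimal sketch of the key-content caching pattern used above.

Assumptions (not from the diff): records are plain dicts and each
credential dict may carry an 'ssh_private_key' path; the fallback key
path is invented for the example.
"""
import os
from typing import Dict, List, Optional


def attach_key_contents(records: List[dict],
                        credentials: List[Optional[dict]]) -> None:
    # Read each distinct private key file once, even if many clusters
    # share the same key path.
    cached: Dict[str, str] = {}
    for record, cred in zip(records, credentials):
        if not cred:
            continue
        key_path = os.path.expanduser(
            cred.get('ssh_private_key', '~/.ssh/sky-key'))
        if key_path not in cached:
            if not os.path.exists(key_path):
                continue  # key unavailable: leave this record untouched
            with open(key_path, 'r', encoding='utf-8') as f:
                cached[key_path] = f.read()
        cred['ssh_private_key_content'] = cached[key_path]
        record['credentials'] = cred
```
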
3311 +  def _update_records_with_resources(
3312 +  records: List[Optional[Dict[str, Any]]],) -> None:
2609/3313  """Add the resources to the record."""
2610 -
2611 -
2612 -
2613 -
2614 -
2615 -
2616 -
2617 -
2618 -
2619 -
2620 -
2621 -
2622 -
2623 -
2624 -
2625 -
2626 -
2627 -
2628 -
2629 -
2630 -
2631 -
2632 -
2633 -
3314 +  for record in _get_records_with_handle(records):
3315 +  handle = record['handle']
3316 +  record['nodes'] = handle.launched_nodes
3317 +  if handle.launched_resources is None:
3318 +  continue
3319 +  record['cloud'] = (f'{handle.launched_resources.cloud}'
3320 +  if handle.launched_resources.cloud else None)
3321 +  record['region'] = (f'{handle.launched_resources.region}'
3322 +  if handle.launched_resources.region else None)
3323 +  record['cpus'] = (f'{handle.launched_resources.cpus}'
3324 +  if handle.launched_resources.cpus else None)
3325 +  record['memory'] = (f'{handle.launched_resources.memory}'
3326 +  if handle.launched_resources.memory else None)
3327 +  record['accelerators'] = (
3328 +  f'{handle.launched_resources.accelerators}'
3329 +  if handle.launched_resources.accelerators else None)
3330 +  if not include_handle:
3331 +  record.pop('handle', None)
3332 +
3333 +  # Add handle info to the records
3334 +  _update_records_with_handle_info(records)
3335 +  if include_credentials:
3336 +  _update_records_with_credentials(records)
2634/3337  if refresh == common.StatusRefreshMode.NONE:
2635/3338  # Add resources to the records
2636 -
2637 -  _update_record_with_resources(record)
3339 +  _update_records_with_resources(records)
2638/3340  return records
2639/3341
2640/3342  plural = 's' if len(records) > 1 else ''
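
The old per-record `_update_record_with_resources(record)` becomes list-based helpers that enrich every record that still has a handle. A toy illustration of that flow; `FakeHandle` and its fields are invented for the example, while the real helpers operate on SkyPilot resource handles inside `get_clusters()`:

```python
"""Toy illustration of the list-based record enrichment flow shown above."""
from dataclasses import dataclass
from typing import Any, Dict, List, Optional


@dataclass
class FakeHandle:
    launched_nodes: int
    cloud: Optional[str] = None
    region: Optional[str] = None


def records_with_handle(
        records: List[Optional[Dict[str, Any]]]) -> List[Dict[str, Any]]:
    # Mirrors _get_records_with_handle: drop None records and records
    # without a handle, so later passes can assume a handle exists.
    return [r for r in records if r is not None and r.get('handle') is not None]


def update_with_resources(records: List[Optional[Dict[str, Any]]]) -> None:
    for record in records_with_handle(records):
        handle = record['handle']
        record['nodes'] = handle.launched_nodes
        record['cloud'] = handle.cloud
        record['region'] = handle.region


records = [
    {'name': 'dev', 'handle': FakeHandle(1, 'aws', 'us-east-1')},
    {'name': 'stale', 'handle': None},
    None,
]
update_with_resources(records)
print(records[0])  # enriched; the other two entries are left untouched
```
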
@@ -2650,47 +3352,76 @@ def get_clusters(
2650/3352  else:
2651/3353  force_refresh_statuses = None
2652/3354
2653 -  def
2654 -
2655 -
2656 -
2657 -
2658 -
2659 -
2660 -
2661 -
2662 -
2663 -
2664 -
2665 -
2666 -  record = {'status': 'UNKNOWN', 'error': e}
2667 -  progress.update(task, advance=1)
3355 +  def _refresh_cluster_record(cluster_name):
3356 +  record = _refresh_cluster(cluster_name,
3357 +  force_refresh_statuses=force_refresh_statuses,
3358 +  include_user_info=True,
3359 +  summary_response=summary_response)
3360 +  # record may be None if the cluster is deleted during refresh,
3361 +  # e.g. all the Pods of a cluster on Kubernetes have been
3362 +  # deleted before refresh.
3363 +  if record is not None and 'error' not in record:
3364 +  _update_records_with_handle_info([record])
3365 +  if include_credentials:
3366 +  _update_records_with_credentials([record])
3367 +  progress.update(task, advance=1)
2668/3368  return record
2669/3369
2670/3370  cluster_names = [record['name'] for record in records]
3371 +  # TODO(syang): we should try not to leak
3372 +  # request info in backend_utils.py.
3373 +  # Refactor this to use some other info to
3374 +  # determine if a launch is in progress.
3375 +  cluster_names_with_launch_request = {
3376 +  request.cluster_name for request in requests_lib.get_request_tasks(
3377 +  req_filter=requests_lib.RequestTaskFilter(
3378 +  status=[requests_lib.RequestStatus.RUNNING],
3379 +  include_request_names=['sky.launch'],
3380 +  cluster_names=cluster_names,
3381 +  fields=['cluster_name']))
3382 +  }
3383 +  # Preserve the index of the cluster name as it appears on "records"
3384 +  cluster_names_without_launch_request = [
3385 +  (i, cluster_name)
3386 +  for i, cluster_name in enumerate(cluster_names)
3387 +  if cluster_name not in cluster_names_with_launch_request
3388 +  ]
3389 +  # for clusters that have an active launch request, we do not refresh the status
2671/3390  updated_records = []
2672 -  if len(
3391 +  if len(cluster_names_without_launch_request) > 0:
2673/3392  with progress:
2674/3393  updated_records = subprocess_utils.run_in_parallel(
2675 -
2676 -
3394 +  _refresh_cluster_record, [
3395 +  cluster_name
3396 +  for _, cluster_name in cluster_names_without_launch_request
3397 +  ])
3398 +  # Preserve the index of the cluster name as it appears on "records"
3399 +  # before filtering for clusters being launched.
3400 +  updated_records_dict: Dict[int, Optional[Dict[str, Any]]] = {
3401 +  cluster_names_without_launch_request[i][0]: updated_records[i]
3402 +  for i in range(len(cluster_names_without_launch_request))
3403 +  }
2677/3404  # Show information for removed clusters.
2678/3405  kept_records = []
2679/3406  autodown_clusters, remaining_clusters, failed_clusters = [], [], []
2680/3407  for i, record in enumerate(records):
2681 -  if
3408 +  if i not in updated_records_dict:
3409 +  # record was not refreshed, keep the original record
3410 +  kept_records.append(record)
3411 +  continue
3412 +  updated_record = updated_records_dict[i]
3413 +  if updated_record is None:
2682/3414  if record['to_down']:
2683 -  autodown_clusters.append(
3415 +  autodown_clusters.append(record['name'])
2684/3416  else:
2685 -  remaining_clusters.append(
2686 -  elif
2687 -  failed_clusters.append(
2688 -  (cluster_names[i], updated_records[i]['error']))
3417 +  remaining_clusters.append(record['name'])
3418 +  elif updated_record['status'] == 'UNKNOWN':
3419 +  failed_clusters.append((record['name'], updated_record['error']))
2689/3420  # Keep the original record if the status is unknown,
2690/3421  # so that the user can still see the cluster.
2691/3422  kept_records.append(record)
2692/3423  else:
2693 -  kept_records.append(
3424 +  kept_records.append(updated_record)
2694/3425
2695/3426  if autodown_clusters:
2696/3427  plural = 's' if len(autodown_clusters) > 1 else ''
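
The refresh path above skips clusters that have a running `sky.launch` request and then merges refreshed records back into their original positions by index. A standalone, simplified sketch of that index-preserving merge (the skip set is hard-coded here; in the diff it comes from `requests_lib`, and the None/UNKNOWN handling is richer):

```python
"""Standalone sketch of the index-preserving refresh merge shown above.

refresh_one() is a stand-in for _refresh_cluster_record.
"""
from typing import Any, Callable, Dict, List, Optional, Set


def merge_refreshed(
        records: List[Dict[str, Any]], launching: Set[str],
        refresh_one: Callable[[str], Optional[Dict[str, Any]]]
) -> List[Dict[str, Any]]:
    names = [r['name'] for r in records]
    # Only refresh clusters with no active launch request, remembering
    # where each one sits in the original list.
    to_refresh = [(i, n) for i, n in enumerate(names) if n not in launching]
    refreshed = {i: refresh_one(n) for i, n in to_refresh}

    kept: List[Dict[str, Any]] = []
    for i, record in enumerate(records):
        if i not in refreshed:
            kept.append(record)        # being launched: keep the stale record
        elif refreshed[i] is None:
            continue                   # cluster disappeared during refresh
        else:
            kept.append(refreshed[i])  # use the refreshed record
    return kept


records = [{'name': 'a', 'status': 'INIT'}, {'name': 'b', 'status': 'INIT'}]
print(merge_refreshed(records, {'a'},
                      lambda name: {'name': name, 'status': 'UP'}))
```
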
@@ -2711,8 +3442,7 @@ def get_clusters(
2711/3442  logger.warning(f' {bright}{cluster_name}{reset}: {e}')
2712/3443
2713/3444  # Add resources to the records
2714 -
2715 -  _update_record_with_resources(record)
3445 +  _update_records_with_resources(kept_records)
2716/3446  return kept_records
2717/3447
2718/3448
@@ -2799,6 +3529,7 @@ def get_task_resources_str(task: 'task_lib.Task',
2799/3529  if is_managed_job:
2800/3530  if task.best_resources.use_spot:
2801/3531  spot_str = '[Spot]'
3532 +  assert task.best_resources.cpus is not None
2802/3533  task_cpu_demand = task.best_resources.cpus
2803/3534  if accelerator_dict is None:
2804/3535  resources_str = f'CPU:{task_cpu_demand}'
@@ -2943,7 +3674,8 @@ def get_endpoints(cluster: str,
2943/3674  with ux_utils.print_exception_no_traceback():
2944/3675  raise ValueError(f'Invalid endpoint {port!r}.') from None
2945/3676  cluster_records = get_clusters(refresh=common.StatusRefreshMode.NONE,
2946 -  cluster_names=[cluster]
3677 +  cluster_names=[cluster],
3678 +  _include_is_managed=True)
2947/3679  if not cluster_records:
2948/3680  with ux_utils.print_exception_no_traceback():
2949/3681  raise exceptions.ClusterNotUpError(
@@ -2965,7 +3697,7 @@ def get_endpoints(cluster: str,
2965/3697  f'for cluster {cluster!r} with backend '
2966/3698  f'{get_backend_from_handle(handle).NAME}.')
2967/3699
2968 -  launched_resources = handle.launched_resources
3700 +  launched_resources = handle.launched_resources.assert_launchable()
2969/3701  cloud = launched_resources.cloud
2970/3702  try:
2971/3703  cloud.check_features_are_supported(
@@ -2975,18 +3707,18 @@
2975/3707  raise ValueError('Querying endpoints is not supported '
2976/3708  f'for {cluster!r} on {cloud}.') from None
2977/3709
2978 -  config =
3710 +  config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
2979/3711  port_details = provision_lib.query_ports(repr(cloud),
2980/3712  handle.cluster_name_on_cloud,
2981/3713  handle.launched_resources.ports,
2982/3714  head_ip=handle.head_ip,
2983/3715  provider_config=config['provider'])
2984/3716
3717 +  launched_resources = handle.launched_resources.assert_launchable()
2985/3718  # Validation before returning the endpoints
2986/3719  if port is not None:
2987/3720  # If the requested endpoint was not to be exposed
2988 -  port_set = resources_utils.port_ranges_to_set(
2989 -  handle.launched_resources.ports)
3721 +  port_set = resources_utils.port_ranges_to_set(launched_resources.ports)
2990/3722  if port not in port_set:
2991/3723  logger.warning(f'Port {port} is not exposed on '
2992/3724  f'cluster {cluster!r}.')
@@ -2995,17 +3727,17 @@
2995/3727  if port not in port_details:
2996/3728  error_msg = (f'Port {port} not exposed yet. '
2997/3729  f'{_ENDPOINTS_RETRY_MESSAGE} ')
2998 -  if
2999 -  clouds.Kubernetes()):
3730 +  if launched_resources.cloud.is_same_cloud(clouds.Kubernetes()):
3000/3731  # Add Kubernetes specific debugging info
3001 -  error_msg +=
3732 +  error_msg += kubernetes_utils.get_endpoint_debug_message(
3733 +  launched_resources.region)
3002/3734  logger.warning(error_msg)
3003/3735  return {}
3004/3736  return {port: port_details[port][0].url()}
3005/3737  else:
3006/3738  if not port_details:
3007/3739  # If cluster had no ports to be exposed
3008 -  if
3740 +  if launched_resources.ports is None:
3009/3741  logger.warning(f'Cluster {cluster!r} does not have any '
3010/3742  'ports to be exposed.')
3011/3743  return {}
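
Before consulting `port_details`, the new code checks the requested port against the set derived from the cluster's launched port ranges. A rough sketch of such a membership check; the `'22,8080-8082'` range syntax is assumed for illustration, and the real conversion is done by `resources_utils.port_ranges_to_set()`:

```python
"""Rough sketch of the port membership check used above (assumed syntax)."""
from typing import List, Set


def port_ranges_to_set(ports: List[str]) -> Set[int]:
    # Expand specs like '22' or '8080-8082' into a set of integers.
    result: Set[int] = set()
    for spec in ports:
        for part in spec.split(','):
            if '-' in part:
                lo, hi = part.split('-')
                result.update(range(int(lo), int(hi) + 1))
            else:
                result.add(int(part))
    return result


port_set = port_ranges_to_set(['22,8080-8082'])
port = 9000
if port not in port_set:
    print(f'Port {port} is not exposed on this cluster.')
```
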
@@ -3014,13 +3746,200 @@
3014/3746  else:
3015/3747  error_msg = (f'No endpoints exposed yet. '
3016/3748  f'{_ENDPOINTS_RETRY_MESSAGE} ')
3017 -  if
3018 -  clouds.Kubernetes()):
3749 +  if launched_resources.cloud.is_same_cloud(clouds.Kubernetes()):
3019/3750  # Add Kubernetes specific debugging info
3020 -  error_msg +=
3021 -
3751 +  error_msg += kubernetes_utils.get_endpoint_debug_message(
3752 +  launched_resources.region)
3022/3753  logger.warning(error_msg)
3023/3754  return {}
3024/3755  return {
3025/3756  port_num: urls[0].url() for port_num, urls in port_details.items()
3026/3757  }
3758 +
3759 +
3760 +  def cluster_status_lock_id(cluster_name: str) -> str:
3761 +  """Get the lock ID for cluster status operations."""
3762 +  return f'{cluster_name}_status'
3763 +
3764 +
3765 +  def cluster_file_mounts_lock_id(cluster_name: str) -> str:
3766 +  """Get the lock ID for cluster file mounts operations."""
3767 +  return f'{cluster_name}_file_mounts'
3768 +
3769 +
3770 +  def workspace_lock_id(workspace_name: str) -> str:
3771 +  """Get the lock ID for workspace operations."""
3772 +  return f'{workspace_name}_workspace'
3773 +
3774 +
3775 +  def cluster_tunnel_lock_id(cluster_name: str) -> str:
3776 +  """Get the lock ID for cluster tunnel operations."""
3777 +  return f'{cluster_name}_ssh_tunnel'
3778 +
3779 +
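
The new `*_lock_id` helpers only derive stable lock names from a cluster or workspace name; how those IDs are consumed is not shown in this hunk. One way such an ID might be paired with a file lock, purely as an illustration (the `filelock` usage and lock directory are assumptions; only the `f'{cluster_name}_status'` naming comes from the diff):

```python
"""Illustrative pairing of a lock-id helper with a file lock."""
import os

import filelock


def cluster_status_lock_id(cluster_name: str) -> str:
    return f'{cluster_name}_status'


lock_dir = os.path.expanduser('~/.sky/locks')   # hypothetical location
os.makedirs(lock_dir, exist_ok=True)
lock_path = os.path.join(lock_dir,
                         cluster_status_lock_id('my-cluster') + '.lock')

with filelock.FileLock(lock_path, timeout=10):
    # A status refresh for 'my-cluster' would run here, serialized
    # against other processes using the same lock id.
    pass
```
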
3780 +  def open_ssh_tunnel(head_runner: Union[command_runner.SSHCommandRunner,
3781 +  command_runner.KubernetesCommandRunner],
3782 +  port_forward: Tuple[int, int]) -> subprocess.Popen:
3783 +  local_port, remote_port = port_forward
3784 +  if isinstance(head_runner, command_runner.SSHCommandRunner):
3785 +  # Disabling ControlMaster makes things easier to reason about
3786 +  # with respect to resource management/ownership,
3787 +  # as killing the process will close the tunnel too.
3788 +  head_runner.disable_control_master = True
3789 +  head_runner.port_forward_execute_remote_command = True
3790 +
3791 +  # The default connect_timeout of 1s is too short for
3792 +  # connecting to clusters using a jump server.
3793 +  # We use NON_INTERACTIVE mode to avoid allocating a pseudo-tty,
3794 +  # which is counted towards non-idleness.
3795 +  cmd: List[str] = head_runner.port_forward_command(
3796 +  [(local_port, remote_port)],
3797 +  connect_timeout=5,
3798 +  ssh_mode=command_runner.SshMode.NON_INTERACTIVE)
3799 +  if isinstance(head_runner, command_runner.SSHCommandRunner):
3800 +  # cat so the command doesn't exit until we kill it
3801 +  cmd += [f'"echo {_ACK_MESSAGE} && cat"']
3802 +  cmd_str = ' '.join(cmd)
3803 +  logger.debug(f'Running port forward command: {cmd_str}')
3804 +  ssh_tunnel_proc = subprocess.Popen(cmd_str,
3805 +  shell=True,
3806 +  stdin=subprocess.PIPE,
3807 +  stdout=subprocess.PIPE,
3808 +  stderr=subprocess.PIPE,
3809 +  start_new_session=True,
3810 +  text=True)
3811 +  # Wait until we receive an ack from the remote cluster or
3812 +  # the SSH connection times out.
3813 +  queue: queue_lib.Queue = queue_lib.Queue()
3814 +  stdout_thread = threading.Thread(
3815 +  target=lambda queue, stdout: queue.put(stdout.readline()),
3816 +  args=(queue, ssh_tunnel_proc.stdout),
3817 +  daemon=True)
3818 +  stdout_thread.start()
3819 +  while ssh_tunnel_proc.poll() is None:
3820 +  try:
3821 +  ack = queue.get_nowait()
3822 +  except queue_lib.Empty:
3823 +  ack = None
3824 +  time.sleep(0.1)
3825 +  continue
3826 +  assert ack is not None
3827 +  if isinstance(
3828 +  head_runner,
3829 +  command_runner.SSHCommandRunner) and ack == f'{_ACK_MESSAGE}\n':
3830 +  break
3831 +  elif isinstance(head_runner, command_runner.KubernetesCommandRunner
3832 +  ) and _FORWARDING_FROM_MESSAGE in ack:
3833 +  # On kind clusters, this error occurs if we make a request
3834 +  # immediately after the port-forward is established on a new pod:
3835 +  # "Unhandled Error" err="an error occurred forwarding ... -> 46590:
3836 +  # failed to execute portforward in network namespace
3837 +  # "/var/run/netns/cni-...": failed to connect to localhost:46590
3838 +  # inside namespace "...", IPv4: dial tcp4 127.0.0.1:46590:
3839 +  # connect: connection refused
3840 +  # So we need to poll the port on the pod to check if it is open.
3841 +  # We did not observe this with real Kubernetes clusters.
3842 +  timeout = 5
3843 +  port_check_cmd = (
3844 +  # We install netcat in our ray-node container,
3845 +  # so we can use it here.
3846 +  # (See kubernetes-ray.yml.j2)
3847 +  f'end=$((SECONDS+{timeout})); '
3848 +  f'while ! nc -z -w 1 localhost {remote_port}; do '
3849 +  'if (( SECONDS >= end )); then exit 1; fi; '
3850 +  'sleep 0.1; '
3851 +  'done')
3852 +  returncode, stdout, stderr = head_runner.run(port_check_cmd,
3853 +  require_outputs=True,
3854 +  stream_logs=False)
3855 +  if returncode != 0:
3856 +  try:
3857 +  ssh_tunnel_proc.terminate()
3858 +  ssh_tunnel_proc.wait(timeout=5)
3859 +  except subprocess.TimeoutExpired:
3860 +  ssh_tunnel_proc.kill()
3861 +  ssh_tunnel_proc.wait()
3862 +  finally:
3863 +  error_msg = (f'Failed to check remote port {remote_port}')
3864 +  if stdout:
3865 +  error_msg += f'\n-- stdout --\n{stdout}\n'
3866 +  raise exceptions.CommandError(returncode=returncode,
3867 +  command=cmd_str,
3868 +  error_msg=error_msg,
3869 +  detailed_reason=stderr)
3870 +  break
3871 +
3872 +  if ssh_tunnel_proc.poll() is not None:
3873 +  stdout, stderr = ssh_tunnel_proc.communicate()
3874 +  error_msg = 'Port forward failed'
3875 +  if stdout:
3876 +  error_msg += f'\n-- stdout --\n{stdout}\n'
3877 +  raise exceptions.CommandError(returncode=ssh_tunnel_proc.returncode,
3878 +  command=cmd_str,
3879 +  error_msg=error_msg,
3880 +  detailed_reason=stderr)
3881 +  return ssh_tunnel_proc
3882 +
3883 +
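
`open_ssh_tunnel` above waits for an ack line from the tunnel subprocess using a daemon reader thread and a queue, so it can poll for the ack while also watching for early process exit. A stripped-down, self-contained version of that handshake, with an `echo` command standing in for the real SSH port-forward command:

```python
"""Stripped-down version of the ack handshake used by open_ssh_tunnel."""
import queue
import subprocess
import threading
import time

ACK = 'SKYPILOT_ACK'   # stand-in for _ACK_MESSAGE
proc = subprocess.Popen(f'echo {ACK} && sleep 30',
                        shell=True,
                        stdout=subprocess.PIPE,
                        text=True,
                        start_new_session=True)

q: queue.Queue = queue.Queue()
threading.Thread(target=lambda: q.put(proc.stdout.readline()),
                 daemon=True).start()

# Poll until the child prints the ack or exits on its own.
while proc.poll() is None:
    try:
        line = q.get_nowait()
    except queue.Empty:
        time.sleep(0.1)
        continue
    if line.strip() == ACK:
        print('tunnel ready')
        break

proc.terminate()   # killing the process also tears the tunnel down
```
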
3884 +  T = TypeVar('T')
3885 +
3886 +
3887 +  def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
3888 +  """Generic helper for making Skylet gRPC requests.
3889 +
3890 +  This method handles the common pattern of:
3891 +  1. Try the gRPC request
3892 +  2. If SSH tunnel is closed, recreate it and retry
3893 +  """
3894 +  max_attempts = 5
3895 +  backoff = common_utils.Backoff(initial_backoff=0.5)
3896 +  last_exception: Optional[Exception] = None
3897 +
3898 +  for _ in range(max_attempts):
3899 +  try:
3900 +  return func()
3901 +  except grpc.RpcError as e:
3902 +  last_exception = e
3903 +  _handle_grpc_error(e, backoff.current_backoff())
3904 +
3905 +  raise RuntimeError(
3906 +  f'Failed to invoke Skylet after {max_attempts} attempts: {last_exception}'
3907 +  ) from last_exception
3908 +
3909 +
3910 +  def invoke_skylet_streaming_with_retries(
3911 +  stream_func: Callable[..., Iterator[T]]) -> Iterator[T]:
3912 +  """Generic helper for making Skylet streaming gRPC requests."""
3913 +  max_attempts = 3
3914 +  backoff = common_utils.Backoff(initial_backoff=0.5)
3915 +  last_exception: Optional[Exception] = None
3916 +
3917 +  for _ in range(max_attempts):
3918 +  try:
3919 +  for response in stream_func():
3920 +  yield response
3921 +  return
3922 +  except grpc.RpcError as e:
3923 +  last_exception = e
3924 +  _handle_grpc_error(e, backoff.current_backoff())
3925 +
3926 +  raise RuntimeError(
3927 +  f'Failed to stream Skylet response after {max_attempts} attempts'
3928 +  ) from last_exception
3929 +
3930 +
3931 +  def _handle_grpc_error(e: 'grpc.RpcError', current_backoff: float) -> None:
3932 +  if e.code() == grpc.StatusCode.INTERNAL:
3933 +  with ux_utils.print_exception_no_traceback():
3934 +  raise exceptions.SkyletInternalError(e.details())
3935 +  elif e.code() == grpc.StatusCode.UNAVAILABLE:
3936 +  time.sleep(current_backoff)
3937 +  elif e.code() == grpc.StatusCode.UNIMPLEMENTED or e.code(
3938 +  ) == grpc.StatusCode.UNKNOWN:
3939 +  # Handle backwards compatibility: old server doesn't implement this RPC.
3940 +  # Let the caller fall back to legacy execution.
3941 +  raise exceptions.SkyletMethodNotImplementedError(
3942 +  f'gRPC method not implemented on server, falling back to legacy execution: {e.details()}'
3943 +  )
3944 +  else:
3945 +  raise e
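
The retry helpers above wrap a zero-argument callable and delegate error classification to `_handle_grpc_error`: internal errors surface immediately, UNAVAILABLE is retried with backoff, and UNIMPLEMENTED/UNKNOWN signals a fallback to legacy execution. A self-contained sketch of the same retry-with-backoff pattern; `FakeRpcError` and `flaky()` are test doubles, and real callers would instead pass a lambda wrapping a Skylet gRPC stub call:

```python
"""Self-contained sketch of the retry-with-backoff pattern used above."""
import time
from typing import Callable, TypeVar

import grpc

T = TypeVar('T')


class FakeRpcError(grpc.RpcError):
    """Minimal stand-in that carries a status code like a real RpcError."""

    def __init__(self, code: grpc.StatusCode) -> None:
        super().__init__()
        self._code = code

    def code(self) -> grpc.StatusCode:
        return self._code


def invoke_with_retries(func: Callable[[], T], max_attempts: int = 5) -> T:
    backoff = 0.5
    for _ in range(max_attempts):
        try:
            return func()
        except grpc.RpcError as e:
            if e.code() != grpc.StatusCode.UNAVAILABLE:
                raise             # non-transient: surface immediately
            time.sleep(backoff)   # transient: back off and retry
            backoff *= 2
    raise RuntimeError(f'Failed after {max_attempts} attempts')


calls = {'n': 0}


def flaky() -> str:
    calls['n'] += 1
    if calls['n'] < 3:
        raise FakeRpcError(grpc.StatusCode.UNAVAILABLE)
    return 'ok'


print(invoke_with_retries(flaky))  # retries twice, then returns 'ok'
```
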