skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/backends/backend_utils.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""Util constants/functions for the backends."""
|
|
2
|
+
import asyncio
|
|
2
3
|
from datetime import datetime
|
|
3
4
|
import enum
|
|
4
5
|
import fnmatch
|
|
@@ -6,18 +7,23 @@ import hashlib
|
|
|
6
7
|
import os
|
|
7
8
|
import pathlib
|
|
8
9
|
import pprint
|
|
10
|
+
import queue as queue_lib
|
|
9
11
|
import re
|
|
10
12
|
import shlex
|
|
11
13
|
import subprocess
|
|
12
14
|
import sys
|
|
13
15
|
import tempfile
|
|
16
|
+
import threading
|
|
14
17
|
import time
|
|
15
18
|
import typing
|
|
16
|
-
from typing import Any, Dict, List, Optional, Sequence,
|
|
19
|
+
from typing import (Any, Callable, Dict, Iterator, List, Optional, Sequence,
|
|
20
|
+
Set, Tuple, TypeVar, Union)
|
|
17
21
|
import uuid
|
|
18
22
|
|
|
23
|
+
import aiohttp
|
|
24
|
+
from aiohttp import ClientTimeout
|
|
25
|
+
from aiohttp import TCPConnector
|
|
19
26
|
import colorama
|
|
20
|
-
import filelock
|
|
21
27
|
from packaging import version
|
|
22
28
|
from typing_extensions import Literal
|
|
23
29
|
|
|
@@ -28,30 +34,45 @@ from sky import check as sky_check
|
|
|
28
34
|
from sky import clouds
|
|
29
35
|
from sky import exceptions
|
|
30
36
|
from sky import global_user_state
|
|
37
|
+
from sky import logs
|
|
31
38
|
from sky import provision as provision_lib
|
|
32
39
|
from sky import sky_logging
|
|
33
40
|
from sky import skypilot_config
|
|
34
41
|
from sky.adaptors import common as adaptors_common
|
|
42
|
+
from sky.jobs import utils as managed_job_utils
|
|
43
|
+
from sky.provision import common as provision_common
|
|
35
44
|
from sky.provision import instance_setup
|
|
36
45
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
46
|
+
from sky.serve import serve_utils
|
|
47
|
+
from sky.server.requests import requests as requests_lib
|
|
48
|
+
from sky.skylet import autostop_lib
|
|
37
49
|
from sky.skylet import constants
|
|
38
50
|
from sky.usage import usage_lib
|
|
51
|
+
from sky.utils import auth_utils
|
|
39
52
|
from sky.utils import cluster_utils
|
|
40
53
|
from sky.utils import command_runner
|
|
41
54
|
from sky.utils import common
|
|
42
55
|
from sky.utils import common_utils
|
|
56
|
+
from sky.utils import context as context_lib
|
|
57
|
+
from sky.utils import context_utils
|
|
43
58
|
from sky.utils import controller_utils
|
|
44
59
|
from sky.utils import env_options
|
|
60
|
+
from sky.utils import locks
|
|
45
61
|
from sky.utils import registry
|
|
46
62
|
from sky.utils import resources_utils
|
|
47
63
|
from sky.utils import rich_utils
|
|
48
64
|
from sky.utils import schemas
|
|
49
65
|
from sky.utils import status_lib
|
|
50
66
|
from sky.utils import subprocess_utils
|
|
67
|
+
from sky.utils import tempstore
|
|
51
68
|
from sky.utils import timeline
|
|
52
69
|
from sky.utils import ux_utils
|
|
70
|
+
from sky.utils import volume as volume_utils
|
|
71
|
+
from sky.utils import yaml_utils
|
|
72
|
+
from sky.workspaces import core as workspaces_core
|
|
53
73
|
|
|
54
74
|
if typing.TYPE_CHECKING:
|
|
75
|
+
import grpc
|
|
55
76
|
import requests
|
|
56
77
|
from requests import adapters
|
|
57
78
|
from requests.packages.urllib3.util import retry as retry_lib
|
|
@@ -69,6 +90,8 @@ else:
|
|
|
69
90
|
adapters = adaptors_common.LazyImport('requests.adapters')
|
|
70
91
|
retry_lib = adaptors_common.LazyImport(
|
|
71
92
|
'requests.packages.urllib3.util.retry')
|
|
93
|
+
# To avoid requiring grpcio to be installed on the client side.
|
|
94
|
+
grpc = adaptors_common.LazyImport('grpc')
|
|
72
95
|
|
|
73
96
|
logger = sky_logging.init_logger(__name__)
|
|
74
97
|
|
|
@@ -91,6 +114,13 @@ _LAUNCHED_RESERVED_WORKER_PATTERN = re.compile(
|
|
|
91
114
|
# 10.133.0.5: ray.worker.default,
|
|
92
115
|
_LAUNCHING_IP_PATTERN = re.compile(
|
|
93
116
|
r'({}): ray[._]worker[._](?:default|reserved)'.format(IP_ADDR_REGEX))
|
|
117
|
+
SSH_CONNECTION_ERROR_PATTERN = re.compile(
|
|
118
|
+
r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
|
|
119
|
+
_SSH_CONNECTION_TIMED_OUT_PATTERN = re.compile(r'^ssh:.*timed out$',
|
|
120
|
+
re.IGNORECASE)
|
|
121
|
+
K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
|
|
122
|
+
re.IGNORECASE)
|
|
123
|
+
_RAY_CLUSTER_NOT_FOUND_MESSAGE = 'Ray cluster is not found'
|
|
94
124
|
WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
|
|
95
125
|
|
|
96
126
|
# We check network connection by going through _TEST_IP_LIST. We may need to
|
|
@@ -98,24 +128,21 @@ WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
|
|
|
98
128
|
# Fixed IP addresses are used to avoid DNS lookup blocking the check, for
|
|
99
129
|
# machine with no internet connection.
|
|
100
130
|
# Refer to: https://stackoverflow.com/questions/3764291/how-can-i-see-if-theres-an-available-and-active-network-connection-in-python # pylint: disable=line-too-long
|
|
101
|
-
_TEST_IP_LIST = ['https://
|
|
131
|
+
_TEST_IP_LIST = ['https://8.8.8.8', 'https://1.1.1.1']
|
|
102
132
|
|
|
103
133
|
# Allow each CPU thread take 2 tasks.
|
|
104
134
|
# Note: This value cannot be too small, otherwise OOM issue may occur.
|
|
105
135
|
DEFAULT_TASK_CPU_DEMAND = 0.5
|
|
106
136
|
|
|
107
|
-
# Filelocks for the cluster status change.
|
|
108
|
-
CLUSTER_STATUS_LOCK_PATH = os.path.expanduser('~/.sky/.{}.lock')
|
|
109
137
|
CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS = 20
|
|
110
138
|
|
|
111
139
|
# Time that must elapse since the last status check before we should re-check if
|
|
112
140
|
# the cluster has been terminated or autostopped.
|
|
113
141
|
_CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
|
|
114
142
|
|
|
115
|
-
# Filelocks for updating cluster's file_mounts.
|
|
116
|
-
CLUSTER_FILE_MOUNTS_LOCK_PATH = os.path.expanduser(
|
|
117
|
-
'~/.sky/.{}_file_mounts.lock')
|
|
118
143
|
CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
|
|
144
|
+
WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
|
|
145
|
+
CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10.0
|
|
119
146
|
|
|
120
147
|
# Remote dir that holds our runtime files.
|
|
121
148
|
_REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
|
|
@@ -124,7 +151,7 @@ _ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, '
|
|
|
124
151
|
'please retry after a while.')
|
|
125
152
|
|
|
126
153
|
# If a cluster is less than LAUNCH_DOUBLE_CHECK_WINDOW seconds old, and we don't
|
|
127
|
-
# see any instances in the cloud, the instances might be in the
|
|
154
|
+
# see any instances in the cloud, the instances might be in the process of
|
|
128
155
|
# being created. We will wait LAUNCH_DOUBLE_CHECK_DELAY seconds and then double
|
|
129
156
|
# check to make sure there are still no instances. LAUNCH_DOUBLE_CHECK_DELAY
|
|
130
157
|
# should be set longer than the delay between (sending the create instance
|
|
@@ -194,6 +221,9 @@ _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
|
|
|
194
221
|
('provider', 'availability_zone'),
|
|
195
222
|
]
|
|
196
223
|
|
|
224
|
+
_ACK_MESSAGE = 'ack'
|
|
225
|
+
_FORWARDING_FROM_MESSAGE = 'Forwarding from'
|
|
226
|
+
|
|
197
227
|
|
|
198
228
|
def is_ip(s: str) -> bool:
|
|
199
229
|
"""Returns whether this string matches IP_ADDR_REGEX."""
|
|
@@ -212,7 +242,7 @@ def _get_yaml_path_from_cluster_name(cluster_name: str,
|
|
|
212
242
|
# Add retry for the file mounts optimization, as the underlying cp command may
|
|
213
243
|
# experience transient errors, #4758.
|
|
214
244
|
@common_utils.retry
|
|
215
|
-
def _optimize_file_mounts(
|
|
245
|
+
def _optimize_file_mounts(tmp_yaml_path: str) -> None:
|
|
216
246
|
"""Optimize file mounts in the given ray yaml file.
|
|
217
247
|
|
|
218
248
|
Runtime files handling:
|
|
@@ -226,7 +256,7 @@ def _optimize_file_mounts(yaml_path: str) -> None:
|
|
|
226
256
|
subprocess.CalledProcessError: If the file mounts are failed to be
|
|
227
257
|
copied.
|
|
228
258
|
"""
|
|
229
|
-
yaml_config =
|
|
259
|
+
yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
|
|
230
260
|
|
|
231
261
|
file_mounts = yaml_config.get('file_mounts', {})
|
|
232
262
|
# Remove the file mounts added by the newline.
|
|
@@ -242,7 +272,7 @@ def _optimize_file_mounts(yaml_path: str) -> None:
|
|
|
242
272
|
# - use a remote command to move all runtime files to their right places.
|
|
243
273
|
|
|
244
274
|
# Local tmp dir holding runtime files.
|
|
245
|
-
local_runtime_files_dir =
|
|
275
|
+
local_runtime_files_dir = tempstore.mkdtemp()
|
|
246
276
|
new_file_mounts = {_REMOTE_RUNTIME_FILES_DIR: local_runtime_files_dir}
|
|
247
277
|
|
|
248
278
|
# Generate local_src -> unique_name.
|
|
@@ -310,7 +340,7 @@ def _optimize_file_mounts(yaml_path: str) -> None:
|
|
|
310
340
|
shell=True,
|
|
311
341
|
check=True)
|
|
312
342
|
|
|
313
|
-
|
|
343
|
+
yaml_utils.dump_yaml(tmp_yaml_path, yaml_config)
|
|
314
344
|
|
|
315
345
|
|
|
316
346
|
def path_size_megabytes(path: str) -> int:
|
|
@@ -339,7 +369,13 @@ def path_size_megabytes(path: str) -> int:
|
|
|
339
369
|
f'{git_exclude_filter} --dry-run {path!r}')
|
|
340
370
|
rsync_output = ''
|
|
341
371
|
try:
|
|
342
|
-
|
|
372
|
+
# rsync sometimes fails `--dry-run` for MacOS' rsync build, however this function is only used to display
|
|
373
|
+
# a warning message to the user if the size of a file/directory is too
|
|
374
|
+
# large, so we can safely ignore the error.
|
|
375
|
+
rsync_output = str(
|
|
376
|
+
subprocess.check_output(rsync_command,
|
|
377
|
+
shell=True,
|
|
378
|
+
stderr=subprocess.DEVNULL))
|
|
343
379
|
except subprocess.CalledProcessError:
|
|
344
380
|
logger.debug('Command failed, proceeding without estimating size: '
|
|
345
381
|
f'{rsync_command}')
|
|
@@ -464,8 +500,8 @@ def _replace_yaml_dicts(
|
|
|
464
500
|
if key in old_block:
|
|
465
501
|
_restore_block(value, old_block[key])
|
|
466
502
|
|
|
467
|
-
new_config =
|
|
468
|
-
old_config =
|
|
503
|
+
new_config = yaml_utils.safe_load(new_yaml)
|
|
504
|
+
old_config = yaml_utils.safe_load(old_yaml)
|
|
469
505
|
excluded_results = {}
|
|
470
506
|
# Find all key values excluded from restore
|
|
471
507
|
for exclude_restore_key_name_list in restore_key_names_exceptions:
|
|
@@ -489,7 +525,7 @@ def _replace_yaml_dicts(
|
|
|
489
525
|
for key in exclude_restore_key_name[:-1]:
|
|
490
526
|
curr = curr[key]
|
|
491
527
|
curr[exclude_restore_key_name[-1]] = value
|
|
492
|
-
return
|
|
528
|
+
return yaml_utils.dump_yaml_str(new_config)
|
|
493
529
|
|
|
494
530
|
|
|
495
531
|
def get_expirable_clouds(
|
|
@@ -509,11 +545,55 @@ def get_expirable_clouds(
|
|
|
509
545
|
expirable_clouds = []
|
|
510
546
|
local_credentials_value = schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value
|
|
511
547
|
for cloud in enabled_clouds:
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
548
|
+
# Kubernetes config might have context-specific properties
|
|
549
|
+
if isinstance(cloud, clouds.Kubernetes):
|
|
550
|
+
# get all custom contexts
|
|
551
|
+
contexts = kubernetes_utils.get_custom_config_k8s_contexts()
|
|
552
|
+
# add remote_identity of each context if it exists
|
|
553
|
+
remote_identities: Optional[Union[str, List[Dict[str, str]]]] = None
|
|
554
|
+
for context in contexts:
|
|
555
|
+
context_remote_identity = skypilot_config.get_effective_region_config(
|
|
556
|
+
cloud='kubernetes',
|
|
557
|
+
region=context,
|
|
558
|
+
keys=('remote_identity',),
|
|
559
|
+
default_value=None)
|
|
560
|
+
if context_remote_identity is not None:
|
|
561
|
+
if remote_identities is None:
|
|
562
|
+
remote_identities = []
|
|
563
|
+
if isinstance(context_remote_identity, str):
|
|
564
|
+
assert isinstance(remote_identities, list)
|
|
565
|
+
remote_identities.append(
|
|
566
|
+
{context: context_remote_identity})
|
|
567
|
+
elif isinstance(context_remote_identity, list):
|
|
568
|
+
assert isinstance(remote_identities, list)
|
|
569
|
+
remote_identities.extend(context_remote_identity)
|
|
570
|
+
# add global kubernetes remote identity if it exists, if not, add default
|
|
571
|
+
global_remote_identity = skypilot_config.get_effective_region_config(
|
|
572
|
+
cloud='kubernetes',
|
|
573
|
+
region=None,
|
|
574
|
+
keys=('remote_identity',),
|
|
575
|
+
default_value=None)
|
|
576
|
+
if global_remote_identity is not None:
|
|
577
|
+
if remote_identities is None:
|
|
578
|
+
remote_identities = []
|
|
579
|
+
if isinstance(global_remote_identity, str):
|
|
580
|
+
assert isinstance(remote_identities, list)
|
|
581
|
+
remote_identities.append({'*': global_remote_identity})
|
|
582
|
+
elif isinstance(global_remote_identity, list):
|
|
583
|
+
assert isinstance(remote_identities, list)
|
|
584
|
+
remote_identities.extend(global_remote_identity)
|
|
585
|
+
if remote_identities is None:
|
|
586
|
+
remote_identities = schemas.get_default_remote_identity(
|
|
587
|
+
str(cloud).lower())
|
|
588
|
+
else:
|
|
589
|
+
remote_identities = skypilot_config.get_effective_region_config(
|
|
590
|
+
cloud=str(cloud).lower(),
|
|
591
|
+
region=None,
|
|
592
|
+
keys=('remote_identity',),
|
|
593
|
+
default_value=None)
|
|
594
|
+
if remote_identities is None:
|
|
595
|
+
remote_identities = schemas.get_default_remote_identity(
|
|
596
|
+
str(cloud).lower())
|
|
517
597
|
|
|
518
598
|
local_credential_expiring = cloud.can_credential_expire()
|
|
519
599
|
if isinstance(remote_identities, str):
|
|
@@ -528,19 +608,26 @@ def get_expirable_clouds(
|
|
|
528
608
|
return expirable_clouds
|
|
529
609
|
|
|
530
610
|
|
|
611
|
+
def _get_volume_name(path: str, cluster_name_on_cloud: str) -> str:
|
|
612
|
+
path_hash = hashlib.md5(path.encode()).hexdigest()[:6]
|
|
613
|
+
return f'{cluster_name_on_cloud}-{path_hash}'
|
|
614
|
+
|
|
615
|
+
|
|
531
616
|
# TODO: too many things happening here - leaky abstraction. Refactor.
|
|
532
617
|
@timeline.event
|
|
533
618
|
def write_cluster_config(
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
619
|
+
to_provision: 'resources_lib.Resources',
|
|
620
|
+
num_nodes: int,
|
|
621
|
+
cluster_config_template: str,
|
|
622
|
+
cluster_name: str,
|
|
623
|
+
local_wheel_path: pathlib.Path,
|
|
624
|
+
wheel_hash: str,
|
|
625
|
+
region: clouds.Region,
|
|
626
|
+
zones: Optional[List[clouds.Zone]] = None,
|
|
627
|
+
dryrun: bool = False,
|
|
628
|
+
keep_launch_fields_in_existing_config: bool = True,
|
|
629
|
+
volume_mounts: Optional[List['volume_utils.VolumeMount']] = None,
|
|
630
|
+
) -> Dict[str, str]:
|
|
544
631
|
"""Fills in cluster configuration templates and writes them out.
|
|
545
632
|
|
|
546
633
|
Returns:
|
|
@@ -588,12 +675,15 @@ def write_cluster_config(
|
|
|
588
675
|
resources_utils.ClusterName(
|
|
589
676
|
cluster_name,
|
|
590
677
|
cluster_name_on_cloud,
|
|
591
|
-
), region, zones, num_nodes, dryrun)
|
|
678
|
+
), region, zones, num_nodes, dryrun, volume_mounts)
|
|
592
679
|
config_dict = {}
|
|
593
680
|
|
|
594
681
|
specific_reservations = set(
|
|
595
|
-
skypilot_config.
|
|
596
|
-
|
|
682
|
+
skypilot_config.get_effective_region_config(
|
|
683
|
+
cloud=str(to_provision.cloud).lower(),
|
|
684
|
+
region=to_provision.region,
|
|
685
|
+
keys=('specific_reservations',),
|
|
686
|
+
default_value=set()))
|
|
597
687
|
|
|
598
688
|
# Remote identity handling can have 4 cases:
|
|
599
689
|
# 1. LOCAL_CREDENTIALS (default for most clouds): Upload local credentials
|
|
@@ -605,9 +695,12 @@ def write_cluster_config(
|
|
|
605
695
|
# other cases, we exclude the cloud from credential file uploads after
|
|
606
696
|
# running required checks.
|
|
607
697
|
assert cluster_name is not None
|
|
608
|
-
excluded_clouds = set()
|
|
609
|
-
remote_identity_config = skypilot_config.
|
|
610
|
-
|
|
698
|
+
excluded_clouds: Set[clouds.Cloud] = set()
|
|
699
|
+
remote_identity_config = skypilot_config.get_effective_region_config(
|
|
700
|
+
cloud=str(cloud).lower(),
|
|
701
|
+
region=region.name,
|
|
702
|
+
keys=('remote_identity',),
|
|
703
|
+
default_value=None)
|
|
611
704
|
remote_identity = schemas.get_default_remote_identity(str(cloud).lower())
|
|
612
705
|
if isinstance(remote_identity_config, str):
|
|
613
706
|
remote_identity = remote_identity_config
|
|
@@ -636,15 +729,25 @@ def write_cluster_config(
|
|
|
636
729
|
'is not supported by this cloud. Remove the config or set: '
|
|
637
730
|
'`remote_identity: LOCAL_CREDENTIALS`.')
|
|
638
731
|
if isinstance(cloud, clouds.Kubernetes):
|
|
639
|
-
|
|
640
|
-
|
|
732
|
+
allowed_contexts = skypilot_config.get_workspace_cloud(
|
|
733
|
+
'kubernetes').get('allowed_contexts', None)
|
|
734
|
+
if allowed_contexts is None:
|
|
735
|
+
allowed_contexts = skypilot_config.get_effective_region_config(
|
|
736
|
+
cloud='kubernetes',
|
|
737
|
+
region=None,
|
|
738
|
+
keys=('allowed_contexts',),
|
|
739
|
+
default_value=None)
|
|
740
|
+
if allowed_contexts is None:
|
|
641
741
|
excluded_clouds.add(cloud)
|
|
642
742
|
else:
|
|
643
743
|
excluded_clouds.add(cloud)
|
|
644
744
|
|
|
645
745
|
for cloud_str, cloud_obj in registry.CLOUD_REGISTRY.items():
|
|
646
|
-
remote_identity_config = skypilot_config.
|
|
647
|
-
|
|
746
|
+
remote_identity_config = skypilot_config.get_effective_region_config(
|
|
747
|
+
cloud=cloud_str.lower(),
|
|
748
|
+
region=region.name,
|
|
749
|
+
keys=('remote_identity',),
|
|
750
|
+
default_value=None)
|
|
648
751
|
if remote_identity_config:
|
|
649
752
|
if (remote_identity_config ==
|
|
650
753
|
schemas.RemoteIdentityOptions.NO_UPLOAD.value):
|
|
@@ -652,15 +755,24 @@ def write_cluster_config(
|
|
|
652
755
|
|
|
653
756
|
credentials = sky_check.get_cloud_credential_file_mounts(excluded_clouds)
|
|
654
757
|
|
|
655
|
-
|
|
758
|
+
logging_agent = logs.get_logging_agent()
|
|
759
|
+
if logging_agent:
|
|
760
|
+
for k, v in logging_agent.get_credential_file_mounts().items():
|
|
761
|
+
assert k not in credentials, f'{k} already in credentials'
|
|
762
|
+
credentials[k] = v
|
|
763
|
+
|
|
764
|
+
private_key_path, _ = auth_utils.get_or_generate_keys()
|
|
656
765
|
auth_config = {'ssh_private_key': private_key_path}
|
|
657
766
|
region_name = resources_vars.get('region')
|
|
658
767
|
|
|
659
768
|
yaml_path = _get_yaml_path_from_cluster_name(cluster_name)
|
|
660
769
|
|
|
661
770
|
# Retrieve the ssh_proxy_command for the given cloud / region.
|
|
662
|
-
ssh_proxy_command_config = skypilot_config.
|
|
663
|
-
|
|
771
|
+
ssh_proxy_command_config = skypilot_config.get_effective_region_config(
|
|
772
|
+
cloud=str(cloud).lower(),
|
|
773
|
+
region=None,
|
|
774
|
+
keys=('ssh_proxy_command',),
|
|
775
|
+
default_value=None)
|
|
664
776
|
if (isinstance(ssh_proxy_command_config, str) or
|
|
665
777
|
ssh_proxy_command_config is None):
|
|
666
778
|
ssh_proxy_command = ssh_proxy_command_config
|
|
@@ -683,10 +795,63 @@ def write_cluster_config(
|
|
|
683
795
|
assert region_name in ssh_proxy_command_config, (
|
|
684
796
|
region_name, ssh_proxy_command_config)
|
|
685
797
|
ssh_proxy_command = ssh_proxy_command_config[region_name]
|
|
798
|
+
|
|
799
|
+
use_internal_ips = skypilot_config.get_effective_region_config(
|
|
800
|
+
cloud=str(cloud).lower(),
|
|
801
|
+
region=region.name,
|
|
802
|
+
keys=('use_internal_ips',),
|
|
803
|
+
default_value=False)
|
|
804
|
+
if isinstance(cloud, clouds.AWS):
|
|
805
|
+
# If the use_ssm flag is set to true, we use the ssm proxy command.
|
|
806
|
+
use_ssm = skypilot_config.get_effective_region_config(
|
|
807
|
+
cloud=str(cloud).lower(),
|
|
808
|
+
region=region.name,
|
|
809
|
+
keys=('use_ssm',),
|
|
810
|
+
default_value=None)
|
|
811
|
+
|
|
812
|
+
if use_ssm and ssh_proxy_command is not None:
|
|
813
|
+
raise exceptions.InvalidCloudConfigs(
|
|
814
|
+
'use_ssm is set to true, but ssh_proxy_command '
|
|
815
|
+
f'is already set to {ssh_proxy_command!r}. Please remove '
|
|
816
|
+
'ssh_proxy_command or set use_ssm to false.')
|
|
817
|
+
|
|
818
|
+
if use_internal_ips and ssh_proxy_command is None:
|
|
819
|
+
# Only if use_ssm is explicitly not set, we default to using SSM.
|
|
820
|
+
if use_ssm is None:
|
|
821
|
+
logger.warning(
|
|
822
|
+
f'{colorama.Fore.YELLOW}'
|
|
823
|
+
'use_internal_ips is set to true, '
|
|
824
|
+
'but ssh_proxy_command is not set. Defaulting to '
|
|
825
|
+
'using SSM. Specify ssh_proxy_command to use a different '
|
|
826
|
+
'https://docs.skypilot.co/en/latest/reference/config.html#'
|
|
827
|
+
f'aws.ssh_proxy_command.{colorama.Style.RESET_ALL}')
|
|
828
|
+
use_ssm = True
|
|
829
|
+
|
|
830
|
+
if use_ssm:
|
|
831
|
+
aws_profile = os.environ.get('AWS_PROFILE', None)
|
|
832
|
+
profile_str = f'--profile {aws_profile}' if aws_profile else ''
|
|
833
|
+
ip_address_filter = ('Name=private-ip-address,Values=%h'
|
|
834
|
+
if use_internal_ips else
|
|
835
|
+
'Name=ip-address,Values=%h')
|
|
836
|
+
get_instance_id_command = 'aws ec2 describe-instances ' + \
|
|
837
|
+
f'--region {region_name} --filters {ip_address_filter} ' + \
|
|
838
|
+
'--query \"Reservations[].Instances[].InstanceId\" ' + \
|
|
839
|
+
f'{profile_str} --output text'
|
|
840
|
+
ssm_proxy_command = 'aws ssm start-session --target ' + \
|
|
841
|
+
f'\"$({get_instance_id_command})\" ' + \
|
|
842
|
+
f'--region {region_name} {profile_str} ' + \
|
|
843
|
+
'--document-name AWS-StartSSHSession ' + \
|
|
844
|
+
'--parameters portNumber=%p'
|
|
845
|
+
ssh_proxy_command = ssm_proxy_command
|
|
846
|
+
region_name = 'ssm-session'
|
|
686
847
|
logger.debug(f'Using ssh_proxy_command: {ssh_proxy_command!r}')
|
|
687
848
|
|
|
688
849
|
# User-supplied global instance tags from ~/.sky/config.yaml.
|
|
689
|
-
labels = skypilot_config.
|
|
850
|
+
labels = skypilot_config.get_effective_region_config(
|
|
851
|
+
cloud=str(cloud).lower(),
|
|
852
|
+
region=region.name,
|
|
853
|
+
keys=('labels',),
|
|
854
|
+
default_value={})
|
|
690
855
|
# labels is a dict, which is guaranteed by the type check in
|
|
691
856
|
# schemas.py
|
|
692
857
|
assert isinstance(labels, dict), labels
|
|
@@ -695,12 +860,6 @@ def write_cluster_config(
|
|
|
695
860
|
if to_provision.labels:
|
|
696
861
|
labels.update(to_provision.labels)
|
|
697
862
|
|
|
698
|
-
# Dump the Ray ports to a file for Ray job submission
|
|
699
|
-
dump_port_command = (
|
|
700
|
-
f'{constants.SKY_PYTHON_CMD} -c \'import json, os; json.dump({constants.SKY_REMOTE_RAY_PORT_DICT_STR}, '
|
|
701
|
-
f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", encoding="utf-8"))\''
|
|
702
|
-
)
|
|
703
|
-
|
|
704
863
|
# We disable conda auto-activation if the user has specified a docker image
|
|
705
864
|
# to use, which is likely to already have a conda environment activated.
|
|
706
865
|
conda_auto_activate = ('true' if to_provision.extract_docker_image() is None
|
|
@@ -715,6 +874,32 @@ def write_cluster_config(
|
|
|
715
874
|
high_availability_specified = controller_utils.high_availability_specified(
|
|
716
875
|
cluster_name)
|
|
717
876
|
|
|
877
|
+
volume_mount_vars = []
|
|
878
|
+
ephemeral_volume_mount_vars = []
|
|
879
|
+
if volume_mounts is not None:
|
|
880
|
+
for vol in volume_mounts:
|
|
881
|
+
if vol.is_ephemeral:
|
|
882
|
+
volume_name = _get_volume_name(vol.path, cluster_name_on_cloud)
|
|
883
|
+
vol.volume_name = volume_name
|
|
884
|
+
vol.volume_config.cloud = repr(cloud)
|
|
885
|
+
vol.volume_config.region = region.name
|
|
886
|
+
vol.volume_config.name = volume_name
|
|
887
|
+
ephemeral_volume_mount_vars.append(vol.to_yaml_config())
|
|
888
|
+
else:
|
|
889
|
+
volume_info = volume_utils.VolumeInfo(
|
|
890
|
+
name=vol.volume_name,
|
|
891
|
+
path=vol.path,
|
|
892
|
+
volume_name_on_cloud=vol.volume_config.name_on_cloud,
|
|
893
|
+
volume_id_on_cloud=vol.volume_config.id_on_cloud,
|
|
894
|
+
)
|
|
895
|
+
volume_mount_vars.append(volume_info)
|
|
896
|
+
|
|
897
|
+
runcmd = skypilot_config.get_effective_region_config(
|
|
898
|
+
cloud=str(to_provision.cloud).lower(),
|
|
899
|
+
region=to_provision.region,
|
|
900
|
+
keys=('post_provision_runcmd',),
|
|
901
|
+
default_value=None)
|
|
902
|
+
|
|
718
903
|
# Use a tmp file path to avoid incomplete YAML file being re-used in the
|
|
719
904
|
# future.
|
|
720
905
|
tmp_yaml_path = yaml_path + '.tmp'
|
|
@@ -734,18 +919,23 @@ def write_cluster_config(
|
|
|
734
919
|
os.environ.get(constants.USER_ENV_VAR, '')),
|
|
735
920
|
|
|
736
921
|
# Networking configs
|
|
737
|
-
'use_internal_ips': skypilot_config.
|
|
738
|
-
|
|
922
|
+
'use_internal_ips': skypilot_config.get_effective_region_config(
|
|
923
|
+
cloud=str(cloud).lower(),
|
|
924
|
+
region=region.name,
|
|
925
|
+
keys=('use_internal_ips',),
|
|
926
|
+
default_value=False),
|
|
739
927
|
'ssh_proxy_command': ssh_proxy_command,
|
|
740
|
-
'vpc_name': skypilot_config.
|
|
741
|
-
|
|
742
|
-
|
|
928
|
+
'vpc_name': skypilot_config.get_effective_region_config(
|
|
929
|
+
cloud=str(cloud).lower(),
|
|
930
|
+
region=region.name,
|
|
931
|
+
keys=('vpc_name',),
|
|
932
|
+
default_value=None),
|
|
743
933
|
# User-supplied labels.
|
|
744
934
|
'labels': labels,
|
|
745
935
|
# User-supplied remote_identity
|
|
746
936
|
'remote_identity': remote_identity,
|
|
747
937
|
# The reservation pools that specified by the user. This is
|
|
748
|
-
# currently only used by GCP.
|
|
938
|
+
# currently only used by AWS and GCP.
|
|
749
939
|
'specific_reservations': specific_reservations,
|
|
750
940
|
|
|
751
941
|
# Conda setup
|
|
@@ -766,12 +956,14 @@ def write_cluster_config(
|
|
|
766
956
|
'{sky_wheel_hash}',
|
|
767
957
|
wheel_hash).replace('{cloud}',
|
|
768
958
|
str(cloud).lower()),
|
|
959
|
+
'copy_skypilot_templates_commands':
|
|
960
|
+
constants.COPY_SKYPILOT_TEMPLATES_COMMANDS,
|
|
769
961
|
# Port of Ray (GCS server).
|
|
770
962
|
# Ray's default port 6379 is conflicted with Redis.
|
|
771
963
|
'ray_port': constants.SKY_REMOTE_RAY_PORT,
|
|
772
964
|
'ray_dashboard_port': constants.SKY_REMOTE_RAY_DASHBOARD_PORT,
|
|
773
965
|
'ray_temp_dir': constants.SKY_REMOTE_RAY_TEMPDIR,
|
|
774
|
-
'dump_port_command':
|
|
966
|
+
'dump_port_command': instance_setup.DUMP_RAY_PORTS,
|
|
775
967
|
# Sky-internal constants.
|
|
776
968
|
'sky_ray_cmd': constants.SKY_RAY_CMD,
|
|
777
969
|
# pip install needs to have python env activated to make sure
|
|
@@ -805,6 +997,14 @@ def write_cluster_config(
|
|
|
805
997
|
|
|
806
998
|
# High availability
|
|
807
999
|
'high_availability': high_availability_specified,
|
|
1000
|
+
|
|
1001
|
+
# Volume mounts
|
|
1002
|
+
'volume_mounts': volume_mount_vars,
|
|
1003
|
+
'ephemeral_volume_mounts': ephemeral_volume_mount_vars,
|
|
1004
|
+
|
|
1005
|
+
# runcmd to run before any of the SkyPilot runtime setup commands.
|
|
1006
|
+
# This is currently only used by AWS and Kubernetes.
|
|
1007
|
+
'runcmd': runcmd,
|
|
808
1008
|
}),
|
|
809
1009
|
output_path=tmp_yaml_path)
|
|
810
1010
|
config_dict['cluster_name'] = cluster_name
|
|
@@ -812,14 +1012,20 @@ def write_cluster_config(
|
|
|
812
1012
|
|
|
813
1013
|
# Add kubernetes config fields from ~/.sky/config
|
|
814
1014
|
if isinstance(cloud, clouds.Kubernetes):
|
|
815
|
-
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
|
|
819
|
-
|
|
820
|
-
|
|
1015
|
+
cluster_config_overrides = to_provision.cluster_config_overrides
|
|
1016
|
+
with open(tmp_yaml_path, 'r', encoding='utf-8') as f:
|
|
1017
|
+
tmp_yaml_str = f.read()
|
|
1018
|
+
cluster_yaml_obj = yaml_utils.safe_load(tmp_yaml_str)
|
|
1019
|
+
combined_yaml_obj = kubernetes_utils.combine_pod_config_fields_and_metadata(
|
|
1020
|
+
cluster_yaml_obj,
|
|
1021
|
+
cluster_config_overrides=cluster_config_overrides,
|
|
1022
|
+
cloud=cloud,
|
|
1023
|
+
context=region.name)
|
|
1024
|
+
# Write the updated YAML back to the file
|
|
1025
|
+
yaml_utils.dump_yaml(tmp_yaml_path, combined_yaml_obj)
|
|
1026
|
+
|
|
1027
|
+
pod_config: Dict[str, Any] = combined_yaml_obj['available_node_types'][
|
|
821
1028
|
'ray_head_default']['node_config']
|
|
822
|
-
|
|
823
1029
|
# Check pod spec only. For high availability controllers, we deploy pvc & deployment for the controller. Read kubernetes-ray.yml.j2 for more details.
|
|
824
1030
|
pod_config.pop('deployment_spec', None)
|
|
825
1031
|
pod_config.pop('pvc_spec', None)
|
|
@@ -841,9 +1047,8 @@ def write_cluster_config(
|
|
|
841
1047
|
_add_auth_to_cluster_config(cloud, tmp_yaml_path)
|
|
842
1048
|
|
|
843
1049
|
# Restore the old yaml content for backward compatibility.
|
|
844
|
-
|
|
845
|
-
|
|
846
|
-
old_yaml_content = f.read()
|
|
1050
|
+
old_yaml_content = global_user_state.get_cluster_yaml_str(yaml_path)
|
|
1051
|
+
if old_yaml_content is not None and keep_launch_fields_in_existing_config:
|
|
847
1052
|
with open(tmp_yaml_path, 'r', encoding='utf-8') as f:
|
|
848
1053
|
new_yaml_content = f.read()
|
|
849
1054
|
restored_yaml_content = _replace_yaml_dicts(
|
|
@@ -853,11 +1058,7 @@ def write_cluster_config(
|
|
|
853
1058
|
with open(tmp_yaml_path, 'w', encoding='utf-8') as f:
|
|
854
1059
|
f.write(restored_yaml_content)
|
|
855
1060
|
|
|
856
|
-
|
|
857
|
-
# compatbility restortion above into account.
|
|
858
|
-
# TODO: remove this after 2 minor releases, 0.10.0.
|
|
859
|
-
yaml_config = common_utils.read_yaml(tmp_yaml_path)
|
|
860
|
-
config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
|
|
1061
|
+
config_dict['cluster_name_on_cloud'] = cluster_name_on_cloud
|
|
861
1062
|
|
|
862
1063
|
# Make sure to do this before we optimize file mounts. Optimization is
|
|
863
1064
|
# non-deterministic, but everything else before this point should be
|
|
@@ -880,18 +1081,29 @@ def write_cluster_config(
|
|
|
880
1081
|
# compatibility should go before this call.
|
|
881
1082
|
_optimize_file_mounts(tmp_yaml_path)
|
|
882
1083
|
|
|
883
|
-
#
|
|
884
|
-
|
|
885
|
-
|
|
1084
|
+
# commit the final yaml to the database
|
|
1085
|
+
global_user_state.set_cluster_yaml(
|
|
1086
|
+
cluster_name,
|
|
1087
|
+
open(tmp_yaml_path, 'r', encoding='utf-8').read())
|
|
1088
|
+
|
|
1089
|
+
usage_lib.messages.usage.update_ray_yaml(tmp_yaml_path)
|
|
1090
|
+
|
|
1091
|
+
# Remove the tmp file.
|
|
1092
|
+
if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
|
|
1093
|
+
debug_yaml_path = yaml_path + '.debug'
|
|
1094
|
+
os.rename(tmp_yaml_path, debug_yaml_path)
|
|
1095
|
+
else:
|
|
1096
|
+
os.remove(tmp_yaml_path)
|
|
1097
|
+
|
|
886
1098
|
return config_dict
|
|
887
1099
|
|
|
888
1100
|
|
|
889
|
-
def _add_auth_to_cluster_config(cloud: clouds.Cloud,
|
|
1101
|
+
def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
|
|
890
1102
|
"""Adds SSH key info to the cluster config.
|
|
891
1103
|
|
|
892
1104
|
This function's output removes comments included in the jinja2 template.
|
|
893
1105
|
"""
|
|
894
|
-
config =
|
|
1106
|
+
config = yaml_utils.read_yaml(tmp_yaml_path)
|
|
895
1107
|
# Check the availability of the cloud type.
|
|
896
1108
|
if isinstance(cloud, (
|
|
897
1109
|
clouds.AWS,
|
|
@@ -919,9 +1131,17 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, cluster_config_file: str):
|
|
|
919
1131
|
config = auth.setup_vast_authentication(config)
|
|
920
1132
|
elif isinstance(cloud, clouds.Fluidstack):
|
|
921
1133
|
config = auth.setup_fluidstack_authentication(config)
|
|
1134
|
+
elif isinstance(cloud, clouds.Hyperbolic):
|
|
1135
|
+
config = auth.setup_hyperbolic_authentication(config)
|
|
1136
|
+
elif isinstance(cloud, clouds.Shadeform):
|
|
1137
|
+
config = auth.setup_shadeform_authentication(config)
|
|
1138
|
+
elif isinstance(cloud, clouds.PrimeIntellect):
|
|
1139
|
+
config = auth.setup_primeintellect_authentication(config)
|
|
1140
|
+
elif isinstance(cloud, clouds.Seeweb):
|
|
1141
|
+
config = auth.setup_seeweb_authentication(config)
|
|
922
1142
|
else:
|
|
923
1143
|
assert False, cloud
|
|
924
|
-
|
|
1144
|
+
yaml_utils.dump_yaml(tmp_yaml_path, config)
|
|
925
1145
|
|
|
926
1146
|
|
|
927
1147
|
def get_timestamp_from_run_timestamp(run_timestamp: str) -> float:
|
|
@@ -979,7 +1199,7 @@ def _count_healthy_nodes_from_ray(output: str,
|
|
|
979
1199
|
|
|
980
1200
|
|
|
981
1201
|
@timeline.event
|
|
982
|
-
def _deterministic_cluster_yaml_hash(
|
|
1202
|
+
def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
|
|
983
1203
|
"""Hash the cluster yaml and contents of file mounts to a unique string.
|
|
984
1204
|
|
|
985
1205
|
Two invocations of this function should return the same string if and only
|
|
@@ -1021,9 +1241,8 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
|
|
|
1021
1241
|
Rather than constructing the whole byte sequence, which may be quite large,
|
|
1022
1242
|
we construct it incrementally by using hash.update() to add new bytes.
|
|
1023
1243
|
"""
|
|
1024
|
-
|
|
1025
1244
|
# Load the yaml contents so that we can directly remove keys.
|
|
1026
|
-
yaml_config =
|
|
1245
|
+
yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
|
|
1027
1246
|
for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
|
|
1028
1247
|
dict_to_remove_from = yaml_config
|
|
1029
1248
|
found_key = True
|
|
@@ -1042,7 +1261,7 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
|
|
|
1042
1261
|
config_hash = hashlib.sha256()
|
|
1043
1262
|
|
|
1044
1263
|
yaml_hash = hashlib.sha256(
|
|
1045
|
-
|
|
1264
|
+
yaml_utils.dump_yaml_str(yaml_config).encode('utf-8'))
|
|
1046
1265
|
config_hash.update(yaml_hash.digest())
|
|
1047
1266
|
|
|
1048
1267
|
file_mounts = yaml_config.get('file_mounts', {})
|
|
@@ -1052,7 +1271,7 @@ def _deterministic_cluster_yaml_hash(yaml_path: str) -> str:
|
|
|
1052
1271
|
file_mounts.pop('')
|
|
1053
1272
|
|
|
1054
1273
|
for dst, src in sorted(file_mounts.items()):
|
|
1055
|
-
if src ==
|
|
1274
|
+
if src == tmp_yaml_path:
|
|
1056
1275
|
# Skip the yaml file itself. We have already hashed a modified
|
|
1057
1276
|
# version of it. The file may include fields we don't want to hash.
|
|
1058
1277
|
continue
|
|
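
The hunks above keep `_deterministic_cluster_yaml_hash` hashing the pruned YAML first and then folding in file-mount contents with `hash.update()`. A minimal sketch of the same incremental-hashing idea, using only the standard library; the key pruning and file list here are illustrative, not SkyPilot's actual schema:

import hashlib
import json
from typing import Any, Dict, List

def incremental_config_hash(config: Dict[str, Any], mount_paths: List[str]) -> str:
    # Hash the (already pruned) config dict first, then fold in each mounted
    # file's bytes, so the digest changes when either the config or a mount does.
    digest = hashlib.sha256()
    digest.update(json.dumps(config, sort_keys=True).encode('utf-8'))
    for path in sorted(mount_paths):
        with open(path, 'rb') as f:
            for chunk in iter(lambda: f.read(8192), b''):
                digest.update(chunk)
    return digest.hexdigest()

Hashing the config separately from the mounts mirrors the diff's structure: the YAML itself is hashed once as a canonical string, and large files are streamed in chunks instead of being concatenated into one byte sequence.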
@@ -1147,7 +1366,7 @@ def wait_until_ray_cluster_ready(
             logger.error(common_utils.format_exception(e))
             return False, None  # failed
 
-    config =
+    config = global_user_state.get_cluster_yaml_dict(cluster_config_file)
 
     docker_user = None
     if 'docker' in config:
@@ -1247,11 +1466,11 @@ def ssh_credential_from_yaml(
     """
     if cluster_yaml is None:
         return dict()
-    config =
+    config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
     auth_section = config['auth']
     if ssh_user is None:
         ssh_user = auth_section['ssh_user'].strip()
-
+    ssh_private_key_path = auth_section.get('ssh_private_key')
     ssh_control_name = config.get('cluster_name', '__default__')
     ssh_proxy_command = auth_section.get('ssh_proxy_command')
 
@@ -1260,9 +1479,10 @@ def ssh_credential_from_yaml(
             constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
         ssh_proxy_command = ssh_proxy_command.replace(
             constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
+
     credentials = {
         'ssh_user': ssh_user,
-        'ssh_private_key':
+        'ssh_private_key': ssh_private_key_path,
         'ssh_control_name': ssh_control_name,
         'ssh_proxy_command': ssh_proxy_command,
     }
@@ -1275,6 +1495,62 @@ def ssh_credential_from_yaml(
     return credentials
 
 
+def ssh_credentials_from_handles(
+    handles: List['cloud_vm_ray_backend.CloudVmRayResourceHandle'],
+) -> List[Dict[str, Any]]:
+    """Returns ssh_user, ssh_private_key and ssh_control name.
+    """
+    non_empty_cluster_yaml_paths = [
+        handle.cluster_yaml
+        for handle in handles
+        if handle.cluster_yaml is not None
+    ]
+    cluster_yaml_dicts = global_user_state.get_cluster_yaml_dict_multiple(
+        non_empty_cluster_yaml_paths)
+    cluster_yaml_dicts_to_index = {
+        cluster_yaml_path: cluster_yaml_dict
+        for cluster_yaml_path, cluster_yaml_dict in zip(
+            non_empty_cluster_yaml_paths, cluster_yaml_dicts)
+    }
+
+    credentials_to_return: List[Dict[str, Any]] = []
+    for handle in handles:
+        if handle.cluster_yaml is None:
+            credentials_to_return.append(dict())
+            continue
+        ssh_user = handle.ssh_user
+        docker_user = handle.docker_user
+        config = cluster_yaml_dicts_to_index[handle.cluster_yaml]
+        auth_section = config['auth']
+        if ssh_user is None:
+            ssh_user = auth_section['ssh_user'].strip()
+        ssh_private_key_path = auth_section.get('ssh_private_key')
+        ssh_control_name = config.get('cluster_name', '__default__')
+        ssh_proxy_command = auth_section.get('ssh_proxy_command')
+
+        # Update the ssh_user placeholder in proxy command, if required
+        if (ssh_proxy_command is not None and
+                constants.SKY_SSH_USER_PLACEHOLDER in ssh_proxy_command):
+            ssh_proxy_command = ssh_proxy_command.replace(
+                constants.SKY_SSH_USER_PLACEHOLDER, ssh_user)
+
+        credentials = {
+            'ssh_user': ssh_user,
+            'ssh_private_key': ssh_private_key_path,
+            'ssh_control_name': ssh_control_name,
+            'ssh_proxy_command': ssh_proxy_command,
+        }
+        if docker_user is not None:
+            credentials['docker_user'] = docker_user
+        ssh_provider_module = config['provider']['module']
+        # If we are running ssh command on kubernetes node.
+        if 'kubernetes' in ssh_provider_module:
+            credentials['disable_control_master'] = True
+        credentials_to_return.append(credentials)
+
+    return credentials_to_return
+
+
 def parallel_data_transfer_to_nodes(
     runners: List[command_runner.CommandRunner],
     source: Optional[str],
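
The new `ssh_credentials_from_handles` above avoids one database round trip per handle by fetching all cluster YAMLs in a single `get_cluster_yaml_dict_multiple` call and indexing the results. A minimal, generic sketch of that batch-then-index shape (all names here are hypothetical, not SkyPilot APIs):

from typing import Callable, Dict, Hashable, List, Optional, Sequence, TypeVar

K = TypeVar('K', bound=Hashable)
V = TypeVar('V')

def batch_lookup(keys: Sequence[Optional[K]],
                 fetch_many: Callable[[List[K]], List[V]]) -> List[Optional[V]]:
    # Fetch all non-None keys in one call, then answer each position from the
    # in-memory index, preserving order and leaving None keys as None.
    wanted = [k for k in keys if k is not None]
    index: Dict[K, V] = dict(zip(wanted, fetch_many(wanted)))
    return [index.get(k) if k is not None else None for k in keys]

The per-item logic (building the credentials dict) then runs against the in-memory index, which is what keeps the loop in the diff cheap even for many handles.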
@@ -1435,7 +1711,7 @@ def get_node_ips(cluster_yaml: str,
         exceptions.FetchClusterInfoError: if we failed to get the IPs. e.reason is
             HEAD or WORKER.
     """
-    ray_config =
+    ray_config = global_user_state.get_cluster_yaml_dict(cluster_yaml)
     # Use the new provisioner for AWS.
     provider_name = cluster_utils.get_provider_name(ray_config)
     cloud = registry.CLOUD_REGISTRY.from_str(provider_name)
@@ -1523,18 +1799,54 @@ def get_node_ips(cluster_yaml: str,
 
 def check_network_connection():
     # Tolerate 3 retries as it is observed that connections can fail.
-    adapter = adapters.HTTPAdapter(max_retries=retry_lib.Retry(total=3))
     http = requests.Session()
-    http.mount('https://',
-    http.mount('http://',
-
-
-
-
-
-
-
-
+    http.mount('https://', adapters.HTTPAdapter())
+    http.mount('http://', adapters.HTTPAdapter())
+
+    # Alternate between IPs on each retry
+    max_retries = 3
+    timeout = 0.5
+
+    for _ in range(max_retries):
+        for ip in _TEST_IP_LIST:
+            try:
+                http.head(ip, timeout=timeout)
+                return
+            except (requests.Timeout, requests.exceptions.ConnectionError):
+                continue
+
+        timeout *= 2  # Double the timeout for next retry
+
+    # If we get here, all IPs failed
+    # Assume network connection is down
+    raise exceptions.NetworkError('Could not refresh the cluster. '
+                                  'Network seems down.')
+
+
+async def async_check_network_connection():
+    """Check if the network connection is available.
+
+    Tolerates 3 retries as it is observed that connections can fail.
+    Uses aiohttp for async HTTP requests.
+    """
+    # Create a session with retry logic
+    timeout = ClientTimeout(total=15)
+    connector = TCPConnector(limit=1)  # Limit to 1 connection at a time
+
+    async with aiohttp.ClientSession(timeout=timeout,
+                                     connector=connector) as session:
+        for i, ip in enumerate(_TEST_IP_LIST):
+            try:
+                async with session.head(ip) as response:
+                    if response.status < 400:  # Any 2xx or 3xx status is good
+                        return
+            except (aiohttp.ClientError, asyncio.TimeoutError) as e:
+                if i == len(_TEST_IP_LIST) - 1:
+                    raise exceptions.NetworkError(
+                        'Could not refresh the cluster. '
+                        'Network seems down.') from e
+                # If not the last IP, continue to try the next one
+                continue
 
 
 @timeline.event
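
The rewritten `check_network_connection` above sweeps a list of test endpoints with a short timeout and doubles the timeout between sweeps instead of relying on per-adapter retries. A standalone sketch of that pattern with `requests`; the probe URLs below are illustrative placeholders, not SkyPilot's actual `_TEST_IP_LIST`:

import requests

# Illustrative probe targets; the real endpoint list may differ.
_PROBE_URLS = ['https://1.1.1.1', 'https://8.8.8.8']

def has_network(max_retries: int = 3, base_timeout: float = 0.5) -> bool:
    session = requests.Session()
    timeout = base_timeout
    for _ in range(max_retries):
        for url in _PROBE_URLS:
            try:
                session.head(url, timeout=timeout)
                return True
            except (requests.Timeout, requests.exceptions.ConnectionError):
                continue
        timeout *= 2  # back off before the next sweep over the endpoints
    return False

Alternating endpoints inside each sweep keeps a single flaky host from masking an otherwise healthy connection, while the exponential timeout keeps the total wait bounded.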
@@ -1549,14 +1861,34 @@ def check_owner_identity(cluster_name: str) -> None:
     """
     if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
         return
-    record = global_user_state.get_cluster_from_name(cluster_name
+    record = global_user_state.get_cluster_from_name(cluster_name,
+                                                     include_user_info=False,
+                                                     summary_response=True)
     if record is None:
         return
+    _check_owner_identity_with_record(cluster_name, record)
+
+
+def _check_owner_identity_with_record(cluster_name: str,
+                                      record: Dict[str, Any]) -> None:
+    if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
+        return
     handle = record['handle']
     if not isinstance(handle, backends.CloudVmRayResourceHandle):
         return
+    active_workspace = skypilot_config.get_active_workspace()
+    cluster_workspace = record.get('workspace',
+                                   constants.SKYPILOT_DEFAULT_WORKSPACE)
+    if active_workspace != cluster_workspace:
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.ClusterOwnerIdentityMismatchError(
+                f'{colorama.Fore.YELLOW}'
+                f'The cluster {cluster_name!r} is in workspace '
+                f'{cluster_workspace!r}, but the active workspace is '
+                f'{active_workspace!r}.{colorama.Fore.RESET}')
 
-
+    launched_resources = handle.launched_resources.assert_launchable()
+    cloud = launched_resources.cloud
     user_identities = cloud.get_user_identities()
     owner_identity = record['owner']
     if user_identities is None:
@@ -1625,22 +1957,26 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:
     }
 
 
+@context_utils.cancellation_guard
 def _query_cluster_status_via_cloud_api(
-    handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
-
-
+    handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
+    retry_if_missing: bool,
+) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
+    """Returns the status of the cluster as a list of tuples corresponding
+    to the node status and an optional reason string for said status.
 
     Raises:
         exceptions.ClusterStatusFetchingError: the cluster status cannot be
             fetched from the cloud provider.
     """
+    cluster_name = handle.cluster_name
     cluster_name_on_cloud = handle.cluster_name_on_cloud
     cluster_name_in_hint = common_utils.cluster_name_in_hint(
         handle.cluster_name, cluster_name_on_cloud)
     # Use region and zone from the cluster config, instead of the
     # handle.launched_resources, because the latter may not be set
     # correctly yet.
-    ray_config =
+    ray_config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
     provider_config = ray_config['provider']
 
     # Query the cloud provider.
@@ -1651,7 +1987,11 @@ def _query_cluster_status_via_cloud_api(
         cloud_name = repr(handle.launched_resources.cloud)
         try:
             node_status_dict = provision_lib.query_instances(
-                cloud_name,
+                cloud_name,
+                cluster_name,
+                cluster_name_on_cloud,
+                provider_config,
+                retry_if_missing=retry_if_missing)
             logger.debug(f'Querying {cloud_name} cluster '
                          f'{cluster_name_in_hint} '
                          f'status:\n{pprint.pformat(node_status_dict)}')
@@ -1667,12 +2007,55 @@ def _query_cluster_status_via_cloud_api(
         region = provider_config.get('region') or provider_config.get(
             'location')
         zone = ray_config['provider'].get('availability_zone')
+        # TODO (kyuds): refactor cloud.query_status api to include reason.
+        # Currently not refactoring as this API is actually supposed to be
+        # deprecated soon.
         node_statuses = cloud.query_status(
             cluster_name_on_cloud,
             tag_filter_for_cluster(cluster_name_on_cloud), region, zone)
+        node_statuses = [(status, None) for status in node_statuses]
     return node_statuses
 
 
+def _query_cluster_info_via_cloud_api(
+    handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
+) -> provision_common.ClusterInfo:
+    """Returns the cluster info.
+
+    Raises:
+        exceptions.NotSupportedError: the cloud does not support the new provisioner.
+        exceptions.FetchClusterInfoError: the cluster info cannot be
+            fetched from the cloud provider.
+    """
+    cloud = handle.launched_resources.cloud
+    assert cloud is not None, handle
+    if cloud.STATUS_VERSION >= clouds.StatusVersion.SKYPILOT:
+        try:
+            cloud_name = repr(cloud)
+            ray_config = global_user_state.get_cluster_yaml_dict(
+                handle.cluster_yaml)
+            provider_config = ray_config['provider']
+            region = provider_config.get('region') or provider_config.get(
+                'location')
+            cluster_info = provision_lib.get_cluster_info(
+                cloud_name, region, handle.cluster_name_on_cloud,
+                provider_config)
+            logger.debug(
+                f'Querying {cloud_name} cluster '
+                f'{handle.cluster_name_on_cloud} '
+                f'head instance:\n{cluster_info.get_head_instance()}\n'
+                f'worker instances:\n{cluster_info.get_worker_instances()}')
+            return cluster_info
+        except Exception as e:  # pylint: disable=broad-except
+            with ux_utils.print_exception_no_traceback():
+                raise exceptions.FetchClusterInfoError(
+                    reason=exceptions.FetchClusterInfoError.Reason.UNKNOWN
+                ) from e
+    else:
+        raise exceptions.NotSupportedError(
+            f'The cloud {cloud} does not support the SkyPilot provisioner.')
+
+
 def check_can_clone_disk_and_override_task(
     cluster_name: str, target_cluster_name: Optional[str], task: 'task_lib.Task'
 ) -> Tuple['task_lib.Task', 'cloud_vm_ray_backend.CloudVmRayResourceHandle']:
@@ -1720,12 +2103,12 @@ def check_can_clone_disk_and_override_task(
             'a new target cluster name.')
 
     new_task_resources = []
-
+    launched_resources = handle.launched_resources.assert_launchable()
+    original_cloud = launched_resources.cloud
     original_cloud.check_features_are_supported(
-
+        launched_resources,
         {clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER})
 
-    assert original_cloud is not None, handle.launched_resources
     has_override = False
     has_disk_size_met = False
     has_cloud_met = False
@@ -1739,7 +2122,7 @@ def check_can_clone_disk_and_override_task(
             continue
         has_cloud_met = True
 
-        override_param = {}
+        override_param: Dict[str, Any] = {}
         if task_resources.cloud is None:
             override_param['cloud'] = original_cloud
         if task_resources.region is None:
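
The status query above gains a `retry_if_missing` flag, documented later in this diff as "whether to retry the call to the cloud api if the cluster is not found". The exact retry policy inside `provision_lib` is not shown here; the following is only a hedged sketch of the general shape such a knob implies, with every name hypothetical:

import time
from typing import Callable, List, TypeVar

T = TypeVar('T')

def query_with_retry_if_missing(query: Callable[[], List[T]],
                                retry_if_missing: bool,
                                attempts: int = 3,
                                delay: float = 2.0) -> List[T]:
    # An empty result may mean the cloud briefly reports no instances for the
    # cluster; when retry_if_missing is set, re-query a few times before
    # trusting the empty answer.
    result = query()
    if result or not retry_if_missing:
        return result
    for _ in range(attempts - 1):
        time.sleep(delay)
        result = query()
        if result:
            return result
    return result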
@@ -1786,7 +2169,12 @@ def check_can_clone_disk_and_override_task(
     return task, handle
 
 
-def _update_cluster_status(
+def _update_cluster_status(
+    cluster_name: str,
+    record: Dict[str, Any],
+    retry_if_missing: bool,
+    include_user_info: bool = True,
+    summary_response: bool = False) -> Optional[Dict[str, Any]]:
     """Update the cluster status.
 
     The cluster status is updated by checking ray cluster and real status from
@@ -1813,13 +2201,16 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         fetched from the cloud provider or there are leaked nodes causing
         the node number larger than expected.
     """
-    record = global_user_state.get_cluster_from_name(cluster_name)
-    if record is None:
-        return None
     handle = record['handle']
     if handle.cluster_yaml is None:
         # Remove cluster from db since this cluster does not have a config file
         # or any other ongoing requests
+        global_user_state.add_cluster_event(
+            cluster_name,
+            None,
+            'Cluster has no YAML file. Removing the cluster from cache.',
+            global_user_state.ClusterEventType.STATUS_CHANGE,
+            nop_if_duplicate=True)
         global_user_state.remove_cluster(cluster_name, terminate=True)
         logger.debug(f'Cluster {cluster_name!r} has no YAML file. '
                      'Removing the cluster from cache.')
@@ -1828,10 +2219,11 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         return record
     cluster_name = handle.cluster_name
 
-    node_statuses = _query_cluster_status_via_cloud_api(
+    node_statuses = _query_cluster_status_via_cloud_api(
+        handle, retry_if_missing=retry_if_missing)
 
-    all_nodes_up = (all(
-
+    all_nodes_up = (all(status[0] == status_lib.ClusterStatus.UP
+                        for status in node_statuses) and
                     len(node_statuses) == handle.launched_nodes)
 
     def get_node_counts_from_ray_status(
@@ -1842,14 +2234,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
             require_outputs=True,
             separate_stderr=True)
         if rc:
-            raise
-
-                f'ray cluster\'s healthiness
-
-                f'
+            raise exceptions.CommandError(
+                rc, instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND,
+                f'Failed to check ray cluster\'s healthiness.\n'
+                '-- stdout --\n'
+                f'{output}\n', stderr)
         return (*_count_healthy_nodes_from_ray(output), output, stderr)
 
+    ray_status_details: Optional[str] = None
+
     def run_ray_status_to_check_ray_cluster_healthy() -> bool:
+        nonlocal ray_status_details
         try:
             # NOTE: fetching the IPs is very slow as it calls into
             # `ray get head-ip/worker-ips`. Using cached IPs is safe because
@@ -1872,9 +2267,49 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
 
             total_nodes = handle.launched_nodes * handle.num_ips_per_node
 
+            cloud_name = repr(handle.launched_resources.cloud).lower()
+            # Initialize variables in case all retries fail
+            ready_head = 0
+            ready_workers = 0
+            output = ''
+            stderr = ''
             for i in range(5):
-
-
+                try:
+                    ready_head, ready_workers, output, stderr = (
+                        get_node_counts_from_ray_status(head_runner))
+                except exceptions.CommandError as e:
+                    logger.debug(f'Refreshing status ({cluster_name!r}) attempt'
+                                 f' {i}: {common_utils.format_exception(e)}')
+                    if cloud_name != 'kubernetes':
+                        # Non-k8s clusters can be manually restarted and:
+                        # 1. Get new IP addresses, or
+                        # 2. Not have the SkyPilot runtime setup
+                        #
+                        # So we should surface a message to the user to
+                        # help them recover from this inconsistent state.
+                        has_new_ip_addr = (
+                            e.detailed_reason is not None and
+                            _SSH_CONNECTION_TIMED_OUT_PATTERN.search(
+                                e.detailed_reason.strip()) is not None)
+                        runtime_not_setup = (_RAY_CLUSTER_NOT_FOUND_MESSAGE
+                                             in e.error_msg)
+                        if has_new_ip_addr or runtime_not_setup:
+                            yellow = colorama.Fore.YELLOW
+                            bright = colorama.Style.BRIGHT
+                            reset = colorama.Style.RESET_ALL
+                            ux_utils.console_newline()
+                            logger.warning(
+                                f'{yellow}Failed getting cluster status despite all nodes '
+                                f'being up ({cluster_name!r}). '
+                                f'If the cluster was restarted manually, try running: '
+                                f'{reset}{bright}sky start {cluster_name}{reset} '
+                                f'{yellow}to recover from INIT status.{reset}')
+                            return False
+                        raise e
+                    # We retry for kubernetes because coreweave can have a
+                    # transient network issue.
+                    time.sleep(1)
+                    continue
                 if ready_head + ready_workers == total_nodes:
                     return True
                 logger.debug(f'Refreshing status ({cluster_name!r}) attempt '
@@ -1892,19 +2327,25 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
                 # showing up
                 time.sleep(1)
 
+            ray_status_details = (
+                f'{ready_head + ready_workers}/{total_nodes} ready')
             raise RuntimeError(
                 f'Refreshing status ({cluster_name!r}): ray status not showing '
                 f'all nodes ({ready_head + ready_workers}/'
                 f'{total_nodes});\noutput:\n{output}\nstderr:\n{stderr}')
 
         except exceptions.FetchClusterInfoError:
+            ray_status_details = 'failed to get IPs'
             logger.debug(
                 f'Refreshing status ({cluster_name!r}) failed to get IPs.')
         except RuntimeError as e:
+            if ray_status_details is None:
+                ray_status_details = str(e)
             logger.debug(common_utils.format_exception(e))
         except Exception as e:  # pylint: disable=broad-except
             # This can be raised by `external_ssh_ports()`, due to the
             # underlying call to kubernetes API.
+            ray_status_details = str(e)
             logger.debug(f'Refreshing status ({cluster_name!r}) failed: ',
                          exc_info=e)
         return False
@@ -1925,16 +2366,28 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         # run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
         # head-ip/worker-ips`.
         record['status'] = status_lib.ClusterStatus.UP
-
-
-
-
-
-
+        # Add cluster event for instance status check.
+        global_user_state.add_cluster_event(
+            cluster_name,
+            status_lib.ClusterStatus.UP,
+            'All nodes up; SkyPilot runtime healthy.',
+            global_user_state.ClusterEventType.STATUS_CHANGE,
+            nop_if_duplicate=True)
+        global_user_state.add_or_update_cluster(
+            cluster_name,
+            handle,
+            requested_resources=None,
+            ready=True,
+            is_launch=False,
+            existing_cluster_hash=record['cluster_hash'])
+        return global_user_state.get_cluster_from_name(
+            cluster_name,
+            include_user_info=include_user_info,
+            summary_response=summary_response)
 
     # All cases below are transitioning the cluster to non-UP states.
-
-    if (not node_statuses and
+    launched_resources = handle.launched_resources.assert_launchable()
+    if (not node_statuses and launched_resources.cloud.STATUS_VERSION >=
             clouds.StatusVersion.SKYPILOT):
         # Note: launched_at is set during sky launch, even on an existing
         # cluster. This will catch the case where the cluster was terminated on
@@ -1947,7 +2400,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
         # and check again. This is a best-effort leak prevention check.
         # See https://github.com/skypilot-org/skypilot/issues/4431.
         time.sleep(_LAUNCH_DOUBLE_CHECK_DELAY)
-        node_statuses = _query_cluster_status_via_cloud_api(
+        node_statuses = _query_cluster_status_via_cloud_api(
+            handle, retry_if_missing=False)
         # Note: even if all the node_statuses are UP now, we will still
         # consider this cluster abnormal, and its status will be INIT.
 
@@ -2002,85 +2456,168 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
     # * The cluster is partially or completely in the INIT state, which means
     #   that provisioning was interrupted. This is considered abnormal.
     #
-    # An abnormal cluster will transition to INIT and
-    #
-
-
+    # An abnormal cluster will transition to INIT, and one of the following will happen:
+    # (1) If the SkyPilot provisioner is used AND the head node is alive, we
+    #     will not reset the autostop setting. Because autostop is handled by
+    #     the skylet through the cloud APIs, and will continue to function
+    #     regardless of the ray cluster's health.
+    # (2) Otherwise, we will reset the autostop setting, unless the cluster is
+    #     autostopping/autodowning.
+    some_nodes_terminated = 0 < len(node_statuses) < handle.launched_nodes
+    # If all nodes are up and ray cluster is health, we would have returned
+    # earlier. So if all_nodes_up is True and we are here, it means the ray
+    # cluster must have been unhealthy.
+    ray_cluster_unhealthy = all_nodes_up
+    some_nodes_not_stopped = any(status[0] != status_lib.ClusterStatus.STOPPED
+                                 for status in node_statuses)
+    is_abnormal = (some_nodes_terminated or some_nodes_not_stopped)
+
     if is_abnormal:
+        status_reason = ', '.join(
+            [status[1] for status in node_statuses if status[1] is not None])
+
+        if some_nodes_terminated:
+            init_reason = 'one or more nodes terminated'
+        elif ray_cluster_unhealthy:
+            init_reason = f'ray cluster is unhealthy ({ray_status_details})'
+        elif some_nodes_not_stopped:
+            init_reason = 'some but not all nodes are stopped'
         logger.debug('The cluster is abnormal. Setting to INIT status. '
                      f'node_statuses: {node_statuses}')
-
-
-
-
-                    stream_logs=False):
-                # Friendly hint.
-                autostop = record['autostop']
-                maybe_down_str = ' --down' if record['to_down'] else ''
-                noun = 'autodown' if record['to_down'] else 'autostop'
-
-                # Reset the autostopping as the cluster is abnormal, and may
-                # not correctly autostop. Resetting the autostop will let
-                # the user know that the autostop may not happen to avoid
-                # leakages from the assumption that the cluster will autostop.
-                success = True
-                reset_local_autostop = True
+        if record['autostop'] >= 0:
+            is_head_node_alive = False
+            if launched_resources.cloud.PROVISIONER_VERSION >= clouds.ProvisionerVersion.SKYPILOT:
+                # Check if the head node is alive
                 try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+                    cluster_info = _query_cluster_info_via_cloud_api(handle)
+                    is_head_node_alive = cluster_info.get_head_instance(
+                    ) is not None
+                except Exception as e:  # pylint: disable=broad-except
+                    logger.debug(
+                        f'Failed to get cluster info for {cluster_name!r}: '
+                        f'{common_utils.format_exception(e)}')
+
+            backend = get_backend_from_handle(handle)
+            if isinstance(backend, backends.CloudVmRayBackend):
+                if is_head_node_alive:
+                    logger.debug(
+                        f'Skipping autostop reset for cluster {cluster_name!r} '
+                        'because the head node is alive.')
+                elif not backend.is_definitely_autostopping(handle,
+                                                            stream_logs=False):
+                    # Friendly hint.
+                    autostop = record['autostop']
+                    maybe_down_str = ' --down' if record['to_down'] else ''
+                    noun = 'autodown' if record['to_down'] else 'autostop'
+
+                    # Reset the autostopping as the cluster is abnormal, and may
+                    # not correctly autostop. Resetting the autostop will let
+                    # the user know that the autostop may not happen to avoid
+                    # leakages from the assumption that the cluster will autostop.
+                    success = True
+                    reset_local_autostop = True
+                    try:
+                        backend.set_autostop(
+                            handle,
+                            -1,
+                            autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR,
+                            stream_logs=False)
+                    except (exceptions.CommandError,
+                            grpc.FutureTimeoutError) as e:
+                        success = False
+                        if isinstance(e, grpc.FutureTimeoutError) or (
+                                isinstance(e, exceptions.CommandError) and
+                                e.returncode == 255):
+                            word = 'autostopped' if noun == 'autostop' else 'autodowned'
+                            logger.debug(f'The cluster is likely {word}.')
+                            reset_local_autostop = False
+                    except (Exception, SystemExit) as e:  # pylint: disable=broad-except
+                        success = False
+                        logger.debug(f'Failed to reset autostop. Due to '
+                                     f'{common_utils.format_exception(e)}')
+                    if reset_local_autostop:
+                        global_user_state.set_cluster_autostop_value(
+                            handle.cluster_name, -1, to_down=False)
+
+                    if success:
+                        operation_str = (f'Canceled {noun} on the cluster '
+                                         f'{cluster_name!r}')
+                    else:
+                        operation_str = (
+                            f'Attempted to cancel {noun} on the '
+                            f'cluster {cluster_name!r} with best effort')
+                    yellow = colorama.Fore.YELLOW
+                    bright = colorama.Style.BRIGHT
+                    reset = colorama.Style.RESET_ALL
+                    ux_utils.console_newline()
+                    logger.warning(
+                        f'{yellow}{operation_str}, since it is found to be in an '
+                        f'abnormal state. To fix, try running: {reset}{bright}sky '
+                        f'start -f -i {autostop}{maybe_down_str} {cluster_name}'
+                        f'{reset}')
                 else:
-
-
-
-
-
-
-                    ux_utils.console_newline()
-                    logger.warning(
-                        f'{yellow}{operation_str}, since it is found to be in an '
-                        f'abnormal state. To fix, try running: {reset}{bright}sky '
-                        f'start -f -i {autostop}{maybe_down_str} {cluster_name}'
-                        f'{reset}')
-                else:
-                    ux_utils.console_newline()
-                    operation_str = 'autodowning' if record[
-                        'to_down'] else 'autostopping'
-                    logger.info(
-                        f'Cluster {cluster_name!r} is {operation_str}. Setting to '
-                        'INIT status; try refresh again in a while.')
+                    ux_utils.console_newline()
+                    operation_str = 'autodowning' if record[
+                        'to_down'] else 'autostopping'
+                    logger.info(
+                        f'Cluster {cluster_name!r} is {operation_str}. Setting to '
+                        'INIT status; try refresh again in a while.')
 
     # If the user starts part of a STOPPED cluster, we still need a status
     # to represent the abnormal status. For spot cluster, it can also
     # represent that the cluster is partially preempted.
     # TODO(zhwu): the definition of INIT should be audited/changed.
     # Adding a new status UNHEALTHY for abnormal status can be a choice.
-
-
-
-
-
-
+        init_reason_regex = None
+        if not status_reason:
+            # If there is not a status reason, don't re-add (and overwrite) the
+            # event if there is already an event with the same reason which may
+            # have a status reason.
+            # Some status reason clears after a certain time (e.g. k8s events
+            # are only stored for an hour by default), so it is possible that
+            # the previous event has a status reason, but now it does not.
+            init_reason_regex = (f'^Cluster is abnormal because '
+                                 f'{re.escape(init_reason)}.*')
+        log_message = f'Cluster is abnormal because {init_reason}'
+        if status_reason:
+            log_message += f' ({status_reason})'
+        log_message += '. Transitioned to INIT.'
+        global_user_state.add_cluster_event(
+            cluster_name,
+            status_lib.ClusterStatus.INIT,
+            log_message,
+            global_user_state.ClusterEventType.STATUS_CHANGE,
+            nop_if_duplicate=True,
+            duplicate_regex=init_reason_regex)
+        global_user_state.add_or_update_cluster(
+            cluster_name,
+            handle,
+            requested_resources=None,
+            ready=False,
+            is_launch=False,
+            existing_cluster_hash=record['cluster_hash'])
+        return global_user_state.get_cluster_from_name(
+            cluster_name,
+            include_user_info=include_user_info,
+            summary_response=summary_response)
     # Now is_abnormal is False: either node_statuses is empty or all nodes are
     # STOPPED.
+    verb = 'terminated' if to_terminate else 'stopped'
     backend = backends.CloudVmRayBackend()
+    global_user_state.add_cluster_event(
+        cluster_name,
+        None,
+        f'All nodes {verb}, cleaning up the cluster.',
+        global_user_state.ClusterEventType.STATUS_CHANGE,
+        # This won't do anything for a terminated cluster, but it's needed for a
+        # stopped cluster.
+        nop_if_duplicate=True,
+    )
     backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
-    return global_user_state.get_cluster_from_name(
+    return global_user_state.get_cluster_from_name(
+        cluster_name,
+        include_user_info=include_user_info,
+        summary_response=summary_response)
 
 
 def _must_refresh_cluster_status(
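
The reworked `_update_cluster_status` above derives an INIT reason from three booleans: some nodes terminated, the ray cluster unhealthy despite all nodes being up, or some nodes not stopped. A simplified, self-contained sketch of that decision order (the status enum is illustrative and the ray-health probe, events, and autostop handling are omitted):

from enum import Enum
from typing import List, Optional, Tuple

class NodeStatus(Enum):
    UP = 'UP'
    STOPPED = 'STOPPED'
    INIT = 'INIT'

def classify_cluster(statuses: List[Tuple[NodeStatus, Optional[str]]],
                     expected_nodes: int) -> str:
    # All expected nodes reported UP -> the cluster is UP (the real code also
    # requires the ray runtime to be healthy before returning UP).
    all_up = (len(statuses) == expected_nodes and
              all(s is NodeStatus.UP for s, _ in statuses))
    if all_up:
        return 'UP'
    # Partially missing nodes, or any node that is not cleanly STOPPED,
    # is treated as abnormal and mapped to INIT.
    some_terminated = 0 < len(statuses) < expected_nodes
    some_not_stopped = any(s is not NodeStatus.STOPPED for s, _ in statuses)
    if some_terminated or some_not_stopped:
        return 'INIT'
    # Otherwise every node is STOPPED (or the cluster is gone).
    return 'STOPPED'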
@@ -2102,12 +2639,14 @@ def _must_refresh_cluster_status(
 
 
 def refresh_cluster_record(
-
-
-
-
-
-
+    cluster_name: str,
+    *,
+    force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
+    cluster_lock_already_held: bool = False,
+    cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
+    include_user_info: bool = True,
+    summary_response: bool = False,
+    retry_if_missing: bool = True) -> Optional[Dict[str, Any]]:
     """Refresh the cluster, and return the possibly updated record.
 
     The function will update the cached cluster status in the global state. For
@@ -2124,14 +2663,20 @@ def refresh_cluster_record(
             _CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
             1. the cluster is a spot cluster, or
             2. cluster autostop is set and the cluster is not STOPPED.
-
-
-
+        cluster_lock_already_held: Whether the caller is already holding the
+            per-cluster lock. You MUST NOT set this to True if the caller does not
+            already hold the lock. If True, we will not acquire the lock before
+            updating the status. Failing to hold the lock while updating the
+            status can lead to correctness issues - e.g. an launch in-progress may
+            appear to be DOWN incorrectly. Even if this is set to False, the lock
+            may not be acquired if the status does not need to be refreshed.
         cluster_status_lock_timeout: The timeout to acquire the per-cluster
             lock. If timeout, the function will use the cached status. If the
             value is <0, do not timeout (wait for the lock indefinitely). By
            default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS. Warning:
            if correctness is required, you must set this to -1.
+        retry_if_missing: Whether to retry the call to the cloud api if the
+            cluster is not found when querying the live status on the cloud.
 
     Returns:
         If the cluster is terminated or does not exist, return None.
@@ -2147,69 +2692,95 @@ def refresh_cluster_record(
         the node number larger than expected.
     """
 
-
+    ctx = context_lib.get()
+    record = global_user_state.get_cluster_from_name(
+        cluster_name,
+        include_user_info=include_user_info,
+        summary_response=summary_response)
     if record is None:
         return None
-
-
-
-
-
-
-
-
-
-
-
-
-
-    #
-
-
-
-
-        return
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # TODO(zhwu, 05/20): switch to the specific workspace to make sure we are
+    # using the correct cloud credentials.
+    workspace = record.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
+    with skypilot_config.local_active_workspace_ctx(workspace):
+        # check_owner_identity returns if the record handle is
+        # not a CloudVmRayResourceHandle
+        _check_owner_identity_with_record(cluster_name, record)
+
+        # The loop logic allows us to notice if the status was updated in the
+        # global_user_state by another process and stop trying to get the lock.
+        lock = locks.get_lock(cluster_status_lock_id(cluster_name))
+        start_time = time.perf_counter()
+
+        # Loop until we have an up-to-date status or until we acquire the lock.
+        while True:
+            # Check if the context is canceled.
+            if ctx is not None and ctx.is_canceled():
+                raise asyncio.CancelledError()
+            # Check to see if we can return the cached status.
+            if not _must_refresh_cluster_status(record, force_refresh_statuses):
+                return record
+
+            if cluster_lock_already_held:
+                return _update_cluster_status(cluster_name, record,
+                                              retry_if_missing,
+                                              include_user_info,
+                                              summary_response)
+
+            # Try to acquire the lock so we can fetch the status.
+            try:
+                with lock.acquire(blocking=False):
+                    # Check the cluster status again, since it could have been
+                    # updated between our last check and acquiring the lock.
+                    record = global_user_state.get_cluster_from_name(
+                        cluster_name,
+                        include_user_info=include_user_info,
+                        summary_response=summary_response)
+                    if record is None or not _must_refresh_cluster_status(
+                            record, force_refresh_statuses):
+                        return record
+                    # Update and return the cluster status.
+                    return _update_cluster_status(cluster_name, record,
+                                                  retry_if_missing,
+                                                  include_user_info,
+                                                  summary_response)
+
+            except locks.LockTimeout:
+                # lock.acquire() will throw a Timeout exception if the lock is not
+                # available and we have blocking=False.
+                pass
+
+            # Logic adapted from FileLock.acquire().
+            # If cluster_status_lock_time is <0, we will never hit this. No timeout.
+            # Otherwise, if we have timed out, return the cached status. This has
+            # the potential to cause correctness issues, but if so it is the
+            # caller's responsibility to set the timeout to -1.
+            if 0 <= cluster_status_lock_timeout < time.perf_counter(
+            ) - start_time:
+                logger.debug(
+                    'Refreshing status: Failed get the lock for cluster '
+                    f'{cluster_name!r}. Using the cached status.')
+                return record
+            time.sleep(lock.poll_interval)
+
+            # Refresh for next loop iteration.
+            record = global_user_state.get_cluster_from_name(
+                cluster_name,
+                include_user_info=include_user_info,
+                summary_response=summary_response)
+            if record is None:
+                return None
 
 
 @timeline.event
+@context_utils.cancellation_guard
 def refresh_cluster_status_handle(
     cluster_name: str,
     *,
     force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
-
-    cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
+    cluster_lock_already_held: bool = False,
+    cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
+    retry_if_missing: bool = True,
 ) -> Tuple[Optional[status_lib.ClusterStatus],
            Optional[backends.ResourceHandle]]:
     """Refresh the cluster, and return the possibly updated status and handle.
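
The lock-and-cache loop added to `refresh_cluster_record` above serves the cached record when it is fresh, otherwise takes the per-cluster lock without blocking, re-checks, and falls back to the cached record on timeout. A minimal sketch of that pattern, assuming the third-party `filelock` package as a stand-in for SkyPilot's internal `locks` module; the callback names are hypothetical:

import time
import filelock  # stand-in for SkyPilot's internal locks module

def refresh_with_lock(name: str,
                      read_cached,      # () -> record or None
                      must_refresh,     # (record) -> bool
                      do_refresh,       # (record) -> record
                      timeout: float = 20.0):
    lock = filelock.FileLock(f'/tmp/{name}.lock')
    start = time.perf_counter()
    while True:
        record = read_cached()
        if record is None or not must_refresh(record):
            return record  # cached status is good enough
        try:
            with lock.acquire(blocking=False):
                # Re-check after taking the lock: another process may have
                # refreshed the record while we were waiting.
                record = read_cached()
                if record is None or not must_refresh(record):
                    return record
                return do_refresh(record)
        except filelock.Timeout:
            pass  # lock busy; poll again below
        if 0 <= timeout < time.perf_counter() - start:
            return record  # give up on the lock; serve the cached status
        time.sleep(0.5)

Serving a possibly stale record on lock timeout trades strict correctness for responsiveness, which is why the docstring in the diff tells callers that need correctness to disable the timeout.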
@@ -2221,8 +2792,11 @@ def refresh_cluster_status_handle(
|
|
|
2221
2792
|
record = refresh_cluster_record(
|
|
2222
2793
|
cluster_name,
|
|
2223
2794
|
force_refresh_statuses=force_refresh_statuses,
|
|
2224
|
-
|
|
2225
|
-
cluster_status_lock_timeout=cluster_status_lock_timeout
|
|
2795
|
+
cluster_lock_already_held=cluster_lock_already_held,
|
|
2796
|
+
cluster_status_lock_timeout=cluster_status_lock_timeout,
|
|
2797
|
+
include_user_info=False,
|
|
2798
|
+
summary_response=True,
|
|
2799
|
+
retry_if_missing=retry_if_missing)
|
|
2226
2800
|
if record is None:
|
|
2227
2801
|
return None, None
|
|
2228
2802
|
return record['status'], record['handle']
|
|
@@ -2253,6 +2827,7 @@ def check_cluster_available(
|
|
|
2253
2827
|
...
|
|
2254
2828
|
|
|
2255
2829
|
|
|
2830
|
+
@context_utils.cancellation_guard
|
|
2256
2831
|
def check_cluster_available(
|
|
2257
2832
|
cluster_name: str,
|
|
2258
2833
|
*,
|
|
@@ -2272,7 +2847,9 @@ def check_cluster_available(
|
|
|
2272
2847
|
exceptions.CloudUserIdentityError: if we fail to get the current user
|
|
2273
2848
|
identity.
|
|
2274
2849
|
"""
|
|
2275
|
-
record = global_user_state.get_cluster_from_name(cluster_name
|
|
2850
|
+
record = global_user_state.get_cluster_from_name(cluster_name,
|
|
2851
|
+
include_user_info=False,
|
|
2852
|
+
summary_response=True)
|
|
2276
2853
|
if dryrun:
|
|
2277
2854
|
assert record is not None, cluster_name
|
|
2278
2855
|
return record['handle']
|
|
@@ -2404,6 +2981,19 @@ def is_controller_accessible(
|
|
|
2404
2981
|
exceptions.ClusterNotUpError: if the controller is not accessible, or
|
|
2405
2982
|
failed to be connected.
|
|
2406
2983
|
"""
|
|
2984
|
+
if (managed_job_utils.is_consolidation_mode() and
|
|
2985
|
+
controller == controller_utils.Controllers.JOBS_CONTROLLER
|
|
2986
|
+
) or (serve_utils.is_consolidation_mode() and
|
|
2987
|
+
controller == controller_utils.Controllers.SKY_SERVE_CONTROLLER):
|
|
2988
|
+
cn = 'local-controller-consolidation'
|
|
2989
|
+
return backends.LocalResourcesHandle(
|
|
2990
|
+
cluster_name=cn,
|
|
2991
|
+
cluster_name_on_cloud=cn,
|
|
2992
|
+
cluster_yaml=None,
|
|
2993
|
+
launched_nodes=1,
|
|
2994
|
+
launched_resources=sky.Resources(cloud=clouds.Cloud(),
|
|
2995
|
+
instance_type=cn),
|
|
2996
|
+
)
|
|
2407
2997
|
if non_existent_message is None:
|
|
2408
2998
|
non_existent_message = controller.value.default_hint_if_non_existent
|
|
2409
2999
|
cluster_name = controller.value.cluster_name
|
|
@@ -2446,7 +3036,8 @@ def is_controller_accessible(
|
|
|
2446
3036
|
f'fatal, but {controller_name} commands/calls may hang or return '
|
|
2447
3037
|
'stale information, when the controller is not up.\n'
|
|
2448
3038
|
f' Details: {common_utils.format_exception(e, use_bracket=True)}')
|
|
2449
|
-
record = global_user_state.get_cluster_from_name(
|
|
3039
|
+
record = global_user_state.get_cluster_from_name(
|
|
3040
|
+
cluster_name, include_user_info=False, summary_response=True)
|
|
2450
3041
|
if record is not None:
|
|
2451
3042
|
controller_status, handle = record['status'], record['handle']
|
|
2452
3043
|
# We check the connection even if the cluster has a cached status UP
|
|
@@ -2467,7 +3058,7 @@ def is_controller_accessible(
|
|
|
2467
3058
|
need_connection_check):
|
|
2468
3059
|
# Check ssh connection if (1) controller is in INIT state, or (2) we failed to fetch the
|
|
2469
3060
|
# status, both of which can happen when controller's status lock is held by another `sky jobs launch` or
|
|
2470
|
-
# `sky serve up`. If we have
|
|
3061
|
+
# `sky serve up`. If we have controller's head_ip available and it is ssh-reachable,
|
|
2471
3062
|
# we can allow access to the controller.
|
|
2472
3063
|
ssh_credentials = ssh_credential_from_yaml(handle.cluster_yaml,
|
|
2473
3064
|
handle.docker_user,
|
|
@@ -2503,21 +3094,99 @@ class CloudFilter(enum.Enum):
|
|
|
2503
3094
|
LOCAL = 'local'
|
|
2504
3095
|
|
|
2505
3096
|
|
|
2506
|
-
def _get_glob_clusters(
|
|
3097
|
+
def _get_glob_clusters(
|
|
3098
|
+
clusters: List[str],
|
|
3099
|
+
silent: bool = False,
|
|
3100
|
+
workspaces_filter: Optional[Dict[str, Any]] = None) -> List[str]:
|
|
2507
3101
|
"""Returns a list of clusters that match the glob pattern."""
|
|
2508
3102
|
glob_clusters = []
|
|
2509
3103
|
for cluster in clusters:
|
|
2510
|
-
glob_cluster = global_user_state.get_glob_cluster_names(
|
|
3104
|
+
glob_cluster = global_user_state.get_glob_cluster_names(
|
|
3105
|
+
cluster, workspaces_filter=workspaces_filter)
|
|
2511
3106
|
if len(glob_cluster) == 0 and not silent:
|
|
2512
3107
|
logger.info(f'Cluster {cluster} not found.')
|
|
2513
3108
|
glob_clusters.extend(glob_cluster)
|
|
2514
3109
|
return list(set(glob_clusters))
|
|
2515
3110
|
|
|
2516
3111
|
|
|
3112
|
+
def _refresh_cluster(
|
|
3113
|
+
cluster_name: str,
|
|
3114
|
+
force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]],
|
|
3115
|
+
include_user_info: bool = True,
|
|
3116
|
+
summary_response: bool = False) -> Optional[Dict[str, Any]]:
|
|
3117
|
+
try:
|
|
3118
|
+
record = refresh_cluster_record(
|
|
3119
|
+
cluster_name,
|
|
3120
|
+
force_refresh_statuses=force_refresh_statuses,
|
|
3121
|
+
cluster_lock_already_held=False,
|
|
3122
|
+
include_user_info=include_user_info,
|
|
3123
|
+
summary_response=summary_response)
|
|
3124
|
+
except (exceptions.ClusterStatusFetchingError,
|
|
3125
|
+
exceptions.CloudUserIdentityError,
|
|
3126
|
+
exceptions.ClusterOwnerIdentityMismatchError) as e:
|
|
3127
|
+
# Do not fail the entire refresh process. The caller will
|
|
3128
|
+
# handle the 'UNKNOWN' status, and collect the errors into
|
|
3129
|
+
# a table.
|
|
3130
|
+
record = {'status': 'UNKNOWN', 'error': e}
|
|
3131
|
+
return record
|
|
3132
|
+
|
|
3133
|
+
|
|
3134
|
+
def refresh_cluster_records() -> None:
|
|
3135
|
+
"""Refreshes the status of all clusters, except managed clusters.
|
|
3136
|
+
|
|
3137
|
+
Used by the background status refresh daemon.
|
|
3138
|
+
This function is a stripped-down version of get_clusters, with only the
|
|
3139
|
+
bare bones refresh logic.
|
|
3140
|
+
|
|
3141
|
+
Returns:
|
|
3142
|
+
None
|
|
3143
|
+
|
|
3144
|
+
Raises:
|
|
3145
|
+
None
|
|
3146
|
+
"""
|
|
3147
|
+
# We force to exclude managed clusters to avoid multiple sources
|
|
3148
|
+
# manipulating them. For example, SkyServe assumes the replica manager
|
|
3149
|
+
# is the only source of truth for the cluster status.
|
|
3150
|
+
cluster_names = set(
|
|
3151
|
+
global_user_state.get_cluster_names(exclude_managed_clusters=True))
|
|
3152
|
+
|
|
3153
|
+
# TODO(syang): we should try not to leak
|
|
3154
|
+
# request info in backend_utils.py.
|
|
3155
|
+
# Refactor this to use some other info to
|
|
3156
|
+
# determine if a launch is in progress.
|
|
3157
|
+
cluster_names_with_launch_request = {
|
|
3158
|
+
request.cluster_name for request in requests_lib.get_request_tasks(
|
|
3159
|
+
req_filter=requests_lib.RequestTaskFilter(
|
|
3160
|
+
status=[requests_lib.RequestStatus.RUNNING],
|
|
3161
|
+
include_request_names=['sky.launch'],
|
|
3162
|
+
fields=['cluster_name']))
|
|
3163
|
+
}
|
|
3164
|
+
cluster_names_without_launch_request = (cluster_names -
|
|
3165
|
+
cluster_names_with_launch_request)
|
|
3166
|
+
|
|
3167
|
+
def _refresh_cluster_record(cluster_name):
|
|
3168
|
+
return _refresh_cluster(cluster_name,
|
|
3169
|
+
force_refresh_statuses=set(
|
|
3170
|
+
status_lib.ClusterStatus),
|
|
3171
|
+
include_user_info=False,
|
|
3172
|
+
summary_response=True)
|
|
3173
|
+
|
|
3174
|
+
if len(cluster_names_without_launch_request) > 0:
|
|
3175
|
+
# Do not refresh the clusters that have an active launch request.
|
|
3176
|
+
subprocess_utils.run_in_parallel(_refresh_cluster_record,
|
|
3177
|
+
cluster_names_without_launch_request)
|
|
3178
|
+
|
|
3179
|
+
|
|
2517
3180
|
def get_clusters(
|
|
2518
3181
|
refresh: common.StatusRefreshMode,
|
|
2519
3182
|
cluster_names: Optional[Union[str, List[str]]] = None,
|
|
2520
3183
|
all_users: bool = True,
|
|
3184
|
+
include_credentials: bool = False,
|
|
3185
|
+
summary_response: bool = False,
|
|
3186
|
+
include_handle: bool = True,
|
|
3187
|
+
# Internal only:
|
|
3188
|
+
# pylint: disable=invalid-name
|
|
3189
|
+
_include_is_managed: bool = False,
|
|
2521
3190
|
) -> List[Dict[str, Any]]:
|
|
2522
3191
|
"""Returns a list of cached or optionally refreshed cluster records.
|
|
2523
3192
|
|
|
@@ -2527,114 +3196,159 @@ def get_clusters(
         of the clusters.
 
     Args:
-        include_controller: Whether to include controllers, e.g. jobs controller
-            or sky serve controller.
         refresh: Whether to refresh the status of the clusters. (Refreshing will
             set the status to STOPPED if the cluster cannot be pinged.)
-        cloud_filter: Sets which clouds to filer through from the global user
-            state. Supports three values, 'all' for all clouds, 'public' for
-            public clouds only, and 'local' for only local clouds.
         cluster_names: If provided, only return records for the given cluster
             names.
+        all_users: If True, return clusters from all users. If False, only
+            return clusters from the current user.
+        include_credentials: If True, include cluster ssh credentials in the
+            return value.
+        _include_is_managed: Whether to force include clusters created by the
+            controller.
 
     Returns:
         A list of cluster records. If the cluster does not exist or has been
         terminated, the record will be omitted from the returned list.
     """
-
+    accessible_workspaces = workspaces_core.get_workspaces()
+    if cluster_names is not None:
+        if isinstance(cluster_names, str):
+            cluster_names = [cluster_names]
+        non_glob_cluster_names = []
+        glob_cluster_names = []
+        for cluster_name in cluster_names:
+            if ux_utils.is_glob_pattern(cluster_name):
+                glob_cluster_names.append(cluster_name)
+            else:
+                non_glob_cluster_names.append(cluster_name)
+        cluster_names = non_glob_cluster_names
+        if glob_cluster_names:
+            cluster_names += _get_glob_clusters(
+                glob_cluster_names,
+                silent=True,
+                workspaces_filter=accessible_workspaces)
+
+    exclude_managed_clusters = False
+    if not (_include_is_managed or env_options.Options.SHOW_DEBUG_INFO.get()):
+        exclude_managed_clusters = True
+    user_hashes_filter = None
     if not all_users:
-
-
-
-
-
+        user_hashes_filter = {common_utils.get_current_user().id}
+    records = global_user_state.get_clusters(
+        exclude_managed_clusters=exclude_managed_clusters,
+        user_hashes_filter=user_hashes_filter,
+        workspaces_filter=accessible_workspaces,
+        cluster_names=cluster_names,
+        summary_response=summary_response)
 
     yellow = colorama.Fore.YELLOW
     bright = colorama.Style.BRIGHT
     reset = colorama.Style.RESET_ALL
 
-
-
+    if cluster_names is not None:
+        record_names = {record['name'] for record in records}
+        not_found_clusters = ux_utils.get_non_matched_query(
+            cluster_names, record_names)
+        if not_found_clusters:
+            clusters_str = ', '.join(not_found_clusters)
+            logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
+
+    def _get_records_with_handle(
+            records: List[Optional[Dict[str, Any]]]) -> List[Dict[str, Any]]:
+        """Filter for records that have a handle"""
+        return [
+            record for record in records
+            if record is not None and record['handle'] is not None
+        ]
+
+    def _update_records_with_handle_info(
+            records: List[Optional[Dict[str, Any]]]) -> None:
+        """Add resource str to record"""
+        for record in _get_records_with_handle(records):
+            handle = record['handle']
+            resource_str_simple, resource_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    handle, simplified_only=False))
+            record['resources_str'] = resource_str_simple
+            record['resources_str_full'] = resource_str_full
+            if not summary_response:
+                record['cluster_name_on_cloud'] = handle.cluster_name_on_cloud
+
+    def _update_records_with_credentials(
+            records: List[Optional[Dict[str, Any]]]) -> None:
         """Add the credentials to the record.
 
         This is useful for the client side to setup the ssh config of the
         cluster.
         """
-
-
-        handle = record['handle']
-        if handle is None:
+        records_with_handle = _get_records_with_handle(records)
+        if len(records_with_handle) == 0:
             return
-        record['resources_str'] = resources_utils.get_readable_resources_repr(
-            handle)
-        credentials = ssh_credential_from_yaml(handle.cluster_yaml,
-                                               handle.docker_user,
-                                               handle.ssh_user)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        new_records = []
-        not_exist_cluster_names = []
-        for cluster_name in cluster_names:
-            for record in records:
-                if record['name'] == cluster_name:
-                    new_records.append(record)
-                    break
+        handles = [record['handle'] for record in records_with_handle]
+        credentials = ssh_credentials_from_handles(handles)
+        cached_private_keys: Dict[str, str] = {}
+        for record, credential in zip(records_with_handle, credentials):
+            if not credential:
+                continue
+            ssh_private_key_path = credential.get('ssh_private_key', None)
+            if ssh_private_key_path is not None:
+                expanded_private_key_path = os.path.expanduser(
+                    ssh_private_key_path)
+                if not os.path.exists(expanded_private_key_path):
+                    success = auth_utils.create_ssh_key_files_from_db(
+                        ssh_private_key_path)
+                    if not success:
+                        # If the ssh key files are not found, we do not
+                        # update the record with credentials.
+                        logger.debug(
+                            f'SSH keys not found for cluster {record["name"]} '
+                            f'at key path {ssh_private_key_path}')
+                        continue
             else:
-
-
-
-
-
-
-
+                private_key_path, _ = auth_utils.get_or_generate_keys()
+                expanded_private_key_path = os.path.expanduser(private_key_path)
+                if expanded_private_key_path in cached_private_keys:
+                    credential['ssh_private_key_content'] = cached_private_keys[
+                        expanded_private_key_path]
+                else:
+                    with open(expanded_private_key_path, 'r',
+                              encoding='utf-8') as f:
+                        credential['ssh_private_key_content'] = f.read()
+                    cached_private_keys[expanded_private_key_path] = credential[
+                        'ssh_private_key_content']
+            record['credentials'] = credential
+
+    def _update_records_with_resources(
+            records: List[Optional[Dict[str, Any]]],) -> None:
         """Add the resources to the record."""
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        for record in _get_records_with_handle(records):
+            handle = record['handle']
+            record['nodes'] = handle.launched_nodes
+            if handle.launched_resources is None:
+                continue
+            record['cloud'] = (f'{handle.launched_resources.cloud}'
+                               if handle.launched_resources.cloud else None)
+            record['region'] = (f'{handle.launched_resources.region}'
+                                if handle.launched_resources.region else None)
+            record['cpus'] = (f'{handle.launched_resources.cpus}'
+                              if handle.launched_resources.cpus else None)
+            record['memory'] = (f'{handle.launched_resources.memory}'
+                                if handle.launched_resources.memory else None)
+            record['accelerators'] = (
+                f'{handle.launched_resources.accelerators}'
+                if handle.launched_resources.accelerators else None)
+            if not include_handle:
+                record.pop('handle', None)
+
+    # Add handle info to the records
+    _update_records_with_handle_info(records)
+    if include_credentials:
+        _update_records_with_credentials(records)
     if refresh == common.StatusRefreshMode.NONE:
         # Add resources to the records
-
-        _update_record_with_resources(record)
+        _update_records_with_resources(records)
         return records
 
     plural = 's' if len(records) > 1 else ''
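The hunk above replaces per-record enrichment with batch helpers; in particular, `_update_records_with_credentials` reads each private key file at most once by caching its contents keyed on the expanded path. A standalone illustrative sketch of that caching idea follows (not SkyPilot's helper; it assumes the key files referenced by `ssh_private_key` actually exist on disk).

# Illustrative sketch: attach key contents to many credential dicts while
# reading each private key file only once. Standalone example, not SkyPilot code.
import os
from typing import Dict, List


def attach_key_contents(credentials: List[Dict[str, str]]) -> None:
    cached: Dict[str, str] = {}
    for credential in credentials:
        key_path = credential.get('ssh_private_key')
        if key_path is None:
            continue
        expanded = os.path.expanduser(key_path)
        if expanded not in cached:
            # Assumes the key file exists; real code falls back to
            # regenerating or skipping the record when it does not.
            with open(expanded, 'r', encoding='utf-8') as f:
                cached[expanded] = f.read()
        credential['ssh_private_key_content'] = cached[expanded]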
@@ -2650,47 +3364,76 @@ def get_clusters(
     else:
         force_refresh_statuses = None
 
-    def
-
-
-
-
-
-
-
-
-
-
-
-
-            record = {'status': 'UNKNOWN', 'error': e}
-        progress.update(task, advance=1)
+    def _refresh_cluster_record(cluster_name):
+        record = _refresh_cluster(cluster_name,
+                                  force_refresh_statuses=force_refresh_statuses,
+                                  include_user_info=True,
+                                  summary_response=summary_response)
+        # record may be None if the cluster is deleted during refresh,
+        # e.g. all the Pods of a cluster on Kubernetes have been
+        # deleted before refresh.
+        if record is not None and 'error' not in record:
+            _update_records_with_handle_info([record])
+            if include_credentials:
+                _update_records_with_credentials([record])
+        progress.update(task, advance=1)
         return record
 
     cluster_names = [record['name'] for record in records]
+    # TODO(syang): we should try not to leak
+    # request info in backend_utils.py.
+    # Refactor this to use some other info to
+    # determine if a launch is in progress.
+    cluster_names_with_launch_request = {
+        request.cluster_name for request in requests_lib.get_request_tasks(
+            req_filter=requests_lib.RequestTaskFilter(
+                status=[requests_lib.RequestStatus.RUNNING],
+                include_request_names=['sky.launch'],
+                cluster_names=cluster_names,
+                fields=['cluster_name']))
+    }
+    # Preserve the index of the cluster name as it appears on "records"
+    cluster_names_without_launch_request = [
+        (i, cluster_name)
+        for i, cluster_name in enumerate(cluster_names)
+        if cluster_name not in cluster_names_with_launch_request
+    ]
+    # for clusters that have an active launch request, we do not refresh the status
     updated_records = []
-    if len(
+    if len(cluster_names_without_launch_request) > 0:
         with progress:
             updated_records = subprocess_utils.run_in_parallel(
-
-
+                _refresh_cluster_record, [
+                    cluster_name
+                    for _, cluster_name in cluster_names_without_launch_request
+                ])
+    # Preserve the index of the cluster name as it appears on "records"
+    # before filtering for clusters being launched.
+    updated_records_dict: Dict[int, Optional[Dict[str, Any]]] = {
+        cluster_names_without_launch_request[i][0]: updated_records[i]
+        for i in range(len(cluster_names_without_launch_request))
+    }
     # Show information for removed clusters.
     kept_records = []
     autodown_clusters, remaining_clusters, failed_clusters = [], [], []
     for i, record in enumerate(records):
-        if
+        if i not in updated_records_dict:
+            # record was not refreshed, keep the original record
+            kept_records.append(record)
+            continue
+        updated_record = updated_records_dict[i]
+        if updated_record is None:
             if record['to_down']:
-                autodown_clusters.append(
+                autodown_clusters.append(record['name'])
             else:
-                remaining_clusters.append(
-        elif
-            failed_clusters.append(
-                (cluster_names[i], updated_records[i]['error']))
+                remaining_clusters.append(record['name'])
+        elif updated_record['status'] == 'UNKNOWN':
+            failed_clusters.append((record['name'], updated_record['error']))
             # Keep the original record if the status is unknown,
             # so that the user can still see the cluster.
             kept_records.append(record)
         else:
-            kept_records.append(
+            kept_records.append(updated_record)
 
     if autodown_clusters:
         plural = 's' if len(autodown_clusters) > 1 else ''
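The refresh loop above only refreshes records whose clusters have no active launch, then merges the results back by the original record index so untouched records keep their cached values. A standalone sketch of that index-preserving merge:

# Standalone example of the merge: refreshed results are keyed by the original
# index; anything not refreshed (or whose refresh failed) keeps the cached record.
from typing import Dict, List, Optional

Record = Dict[str, str]


def merge_refreshed(
        records: List[Record],
        refreshed_by_index: Dict[int, Optional[Record]]) -> List[Record]:
    kept: List[Record] = []
    for i, record in enumerate(records):
        if i not in refreshed_by_index:
            kept.append(record)  # not refreshed: keep cached record
            continue
        updated = refreshed_by_index[i]
        if updated is None:  # cluster disappeared during refresh
            continue
        if updated.get('status') == 'UNKNOWN':
            kept.append(record)  # keep cached record when refresh failed
        else:
            kept.append(updated)
    return kept


if __name__ == '__main__':
    cached = [{'name': 'a', 'status': 'UP'}, {'name': 'b', 'status': 'UP'}]
    print(merge_refreshed(cached, {1: {'name': 'b', 'status': 'STOPPED'}}))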
@@ -2711,8 +3454,7 @@ def get_clusters(
         logger.warning(f' {bright}{cluster_name}{reset}: {e}')
 
     # Add resources to the records
-
-    _update_record_with_resources(record)
+    _update_records_with_resources(kept_records)
     return kept_records
 
 
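For reference, a hedged usage sketch of the extended `get_clusters()` signature introduced in this diff. `StatusRefreshMode` is reached through `backend_utils`' own module-level `common` import to avoid guessing its module path, and the call assumes a working SkyPilot state database (this is a server-side function, not a documented client API).

# Hedged usage sketch for the extended get_clusters() signature shown above.
from sky.backends import backend_utils

records = backend_utils.get_clusters(
    refresh=backend_utils.common.StatusRefreshMode.NONE,
    cluster_names=['my-cluster*'],  # glob patterns are split out and expanded
    all_users=False,                # only the current user's clusters
    include_credentials=True,       # attach SSH credentials to each record
    summary_response=False,
    include_handle=False)           # drop the heavyweight handle object
for record in records:
    print(record['name'], record.get('resources_str'))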
@@ -2799,6 +3541,7 @@ def get_task_resources_str(task: 'task_lib.Task',
     if is_managed_job:
         if task.best_resources.use_spot:
             spot_str = '[Spot]'
+        assert task.best_resources.cpus is not None
         task_cpu_demand = task.best_resources.cpus
     if accelerator_dict is None:
         resources_str = f'CPU:{task_cpu_demand}'
@@ -2891,13 +3634,8 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str,
     `stderr`. Typically due to the local client version just got updated, and
     the remote runtime is an older version.
     """
-    pattern = re.compile(r'AttributeError: module \'sky\.(.*)\' has no '
-                         r'attribute \'(.*)\'')
     if returncode != 0:
-
-        # the remote cluster. Remove this after 0.10.0 is released.
-        attribute_error = re.findall(pattern, stderr)
-        if attribute_error or 'SkyPilot runtime is too old' in stderr:
+        if 'SkyPilot runtime is too old' in stderr:
             with ux_utils.print_exception_no_traceback():
                 raise RuntimeError(
                     f'{colorama.Fore.RED}SkyPilot runtime needs to be updated '
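The stale-runtime check above now keys off a single marker string in the remote stderr instead of regex-matching `AttributeError` tracebacks. A tiny standalone sketch of the new check:

# Standalone sketch of the simplified detection: non-zero exit plus a marker
# string in stderr. The marker value is taken from the added line above.
_STALE_MARKER = 'SkyPilot runtime is too old'


def is_stale_runtime(returncode: int, stderr: str) -> bool:
    return returncode != 0 and _STALE_MARKER in stderr


assert is_stale_runtime(1, f'... {_STALE_MARKER} ...')
assert not is_stale_runtime(0, _STALE_MARKER)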
@@ -2943,7 +3681,8 @@ def get_endpoints(cluster: str,
         with ux_utils.print_exception_no_traceback():
             raise ValueError(f'Invalid endpoint {port!r}.') from None
     cluster_records = get_clusters(refresh=common.StatusRefreshMode.NONE,
-                                   cluster_names=[cluster]
+                                   cluster_names=[cluster],
+                                   _include_is_managed=True)
     if not cluster_records:
         with ux_utils.print_exception_no_traceback():
             raise exceptions.ClusterNotUpError(
@@ -2965,7 +3704,7 @@ def get_endpoints(cluster: str,
                 f'for cluster {cluster!r} with backend '
                 f'{get_backend_from_handle(handle).NAME}.')
 
-    launched_resources = handle.launched_resources
+    launched_resources = handle.launched_resources.assert_launchable()
     cloud = launched_resources.cloud
     try:
         cloud.check_features_are_supported(
@@ -2975,18 +3714,18 @@ def get_endpoints(cluster: str,
             raise ValueError('Querying endpoints is not supported '
                              f'for {cluster!r} on {cloud}.') from None
 
-    config =
+    config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
     port_details = provision_lib.query_ports(repr(cloud),
                                              handle.cluster_name_on_cloud,
                                              handle.launched_resources.ports,
                                              head_ip=handle.head_ip,
                                              provider_config=config['provider'])
 
+    launched_resources = handle.launched_resources.assert_launchable()
     # Validation before returning the endpoints
     if port is not None:
         # If the requested endpoint was not to be exposed
-        port_set = resources_utils.port_ranges_to_set(
-            handle.launched_resources.ports)
+        port_set = resources_utils.port_ranges_to_set(launched_resources.ports)
         if port not in port_set:
             logger.warning(f'Port {port} is not exposed on '
                            f'cluster {cluster!r}.')
@@ -2995,17 +3734,17 @@ def get_endpoints(cluster: str,
         if port not in port_details:
             error_msg = (f'Port {port} not exposed yet. '
                          f'{_ENDPOINTS_RETRY_MESSAGE} ')
-            if
-                    clouds.Kubernetes()):
+            if launched_resources.cloud.is_same_cloud(clouds.Kubernetes()):
                 # Add Kubernetes specific debugging info
-                error_msg +=
+                error_msg += kubernetes_utils.get_endpoint_debug_message(
+                    launched_resources.region)
             logger.warning(error_msg)
             return {}
         return {port: port_details[port][0].url()}
     else:
         if not port_details:
             # If cluster had no ports to be exposed
-            if
+            if launched_resources.ports is None:
                 logger.warning(f'Cluster {cluster!r} does not have any '
                                'ports to be exposed.')
                 return {}
@@ -3014,13 +3753,200 @@ def get_endpoints(cluster: str,
             else:
                 error_msg = (f'No endpoints exposed yet. '
                              f'{_ENDPOINTS_RETRY_MESSAGE} ')
-                if
-                        clouds.Kubernetes()):
+                if launched_resources.cloud.is_same_cloud(clouds.Kubernetes()):
                     # Add Kubernetes specific debugging info
-                    error_msg +=
-
+                    error_msg += kubernetes_utils.get_endpoint_debug_message(
+                        launched_resources.region)
                 logger.warning(error_msg)
                 return {}
         return {
             port_num: urls[0].url() for port_num, urls in port_details.items()
         }
+
+
+def cluster_status_lock_id(cluster_name: str) -> str:
+    """Get the lock ID for cluster status operations."""
+    return f'{cluster_name}_status'
+
+
+def cluster_file_mounts_lock_id(cluster_name: str) -> str:
+    """Get the lock ID for cluster file mounts operations."""
+    return f'{cluster_name}_file_mounts'
+
+
+def workspace_lock_id(workspace_name: str) -> str:
+    """Get the lock ID for workspace operations."""
+    return f'{workspace_name}_workspace'
+
+
+def cluster_tunnel_lock_id(cluster_name: str) -> str:
+    """Get the lock ID for cluster tunnel operations."""
+    return f'{cluster_name}_ssh_tunnel'
+
+
+def open_ssh_tunnel(head_runner: Union[command_runner.SSHCommandRunner,
+                                       command_runner.KubernetesCommandRunner],
+                    port_forward: Tuple[int, int]) -> subprocess.Popen:
+    local_port, remote_port = port_forward
+    if isinstance(head_runner, command_runner.SSHCommandRunner):
+        # Disabling ControlMaster makes things easier to reason about
+        # with respect to resource management/ownership,
+        # as killing the process will close the tunnel too.
+        head_runner.disable_control_master = True
+        head_runner.port_forward_execute_remote_command = True
+
+    # The default connect_timeout of 1s is too short for
+    # connecting to clusters using a jump server.
+    # We use NON_INTERACTIVE mode to avoid allocating a pseudo-tty,
+    # which is counted towards non-idleness.
+    cmd: List[str] = head_runner.port_forward_command(
+        [(local_port, remote_port)],
+        connect_timeout=5,
+        ssh_mode=command_runner.SshMode.NON_INTERACTIVE)
+    if isinstance(head_runner, command_runner.SSHCommandRunner):
+        # cat so the command doesn't exit until we kill it
+        cmd += [f'"echo {_ACK_MESSAGE} && cat"']
+    cmd_str = ' '.join(cmd)
+    logger.debug(f'Running port forward command: {cmd_str}')
+    ssh_tunnel_proc = subprocess.Popen(cmd_str,
+                                       shell=True,
+                                       stdin=subprocess.PIPE,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.PIPE,
+                                       start_new_session=True,
+                                       text=True)
+    # Wait until we receive an ack from the remote cluster or
+    # the SSH connection times out.
+    queue: queue_lib.Queue = queue_lib.Queue()
+    stdout_thread = threading.Thread(
+        target=lambda queue, stdout: queue.put(stdout.readline()),
+        args=(queue, ssh_tunnel_proc.stdout),
+        daemon=True)
+    stdout_thread.start()
+    while ssh_tunnel_proc.poll() is None:
+        try:
+            ack = queue.get_nowait()
+        except queue_lib.Empty:
+            ack = None
+            time.sleep(0.1)
+            continue
+        assert ack is not None
+        if isinstance(
+                head_runner,
+                command_runner.SSHCommandRunner) and ack == f'{_ACK_MESSAGE}\n':
+            break
+        elif isinstance(head_runner, command_runner.KubernetesCommandRunner
+                       ) and _FORWARDING_FROM_MESSAGE in ack:
+            # On kind clusters, this error occurs if we make a request
+            # immediately after the port-forward is established on a new pod:
+            # "Unhandled Error" err="an error occurred forwarding ... -> 46590:
+            # failed to execute portforward in network namespace
+            # "/var/run/netns/cni-...": failed to connect to localhost:46590
+            # inside namespace "...", IPv4: dial tcp4 127.0.0.1:46590:
+            # connect: connection refused
+            # So we need to poll the port on the pod to check if it is open.
+            # We did not observe this with real Kubernetes clusters.
+            timeout = 5
+            port_check_cmd = (
+                # We install netcat in our ray-node container,
+                # so we can use it here.
+                # (See kubernetes-ray.yml.j2)
+                f'end=$((SECONDS+{timeout})); '
+                f'while ! nc -z -w 1 localhost {remote_port}; do '
+                'if (( SECONDS >= end )); then exit 1; fi; '
+                'sleep 0.1; '
+                'done')
+            returncode, stdout, stderr = head_runner.run(port_check_cmd,
+                                                         require_outputs=True,
+                                                         stream_logs=False)
+            if returncode != 0:
+                try:
+                    ssh_tunnel_proc.terminate()
+                    ssh_tunnel_proc.wait(timeout=5)
+                except subprocess.TimeoutExpired:
+                    ssh_tunnel_proc.kill()
+                    ssh_tunnel_proc.wait()
+                finally:
+                    error_msg = (f'Failed to check remote port {remote_port}')
+                    if stdout:
+                        error_msg += f'\n-- stdout --\n{stdout}\n'
+                    raise exceptions.CommandError(returncode=returncode,
+                                                  command=cmd_str,
+                                                  error_msg=error_msg,
+                                                  detailed_reason=stderr)
+            break
+
+    if ssh_tunnel_proc.poll() is not None:
+        stdout, stderr = ssh_tunnel_proc.communicate()
+        error_msg = 'Port forward failed'
+        if stdout:
+            error_msg += f'\n-- stdout --\n{stdout}\n'
+        raise exceptions.CommandError(returncode=ssh_tunnel_proc.returncode,
+                                      command=cmd_str,
+                                      error_msg=error_msg,
+                                      detailed_reason=stderr)
+    return ssh_tunnel_proc
+
+
+T = TypeVar('T')
+
+
+def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
+    """Generic helper for making Skylet gRPC requests.
+
+    This method handles the common pattern of:
+    1. Try the gRPC request
+    2. If SSH tunnel is closed, recreate it and retry
+    """
+    max_attempts = 5
+    backoff = common_utils.Backoff(initial_backoff=0.5)
+    last_exception: Optional[Exception] = None
+
+    for _ in range(max_attempts):
+        try:
+            return func()
+        except grpc.RpcError as e:
+            last_exception = e
+            _handle_grpc_error(e, backoff.current_backoff())
+
+    raise RuntimeError(
+        f'Failed to invoke Skylet after {max_attempts} attempts: {last_exception}'
+    ) from last_exception
+
+
+def invoke_skylet_streaming_with_retries(
+        stream_func: Callable[..., Iterator[T]]) -> Iterator[T]:
+    """Generic helper for making Skylet streaming gRPC requests."""
+    max_attempts = 3
+    backoff = common_utils.Backoff(initial_backoff=0.5)
+    last_exception: Optional[Exception] = None
+
+    for _ in range(max_attempts):
+        try:
+            for response in stream_func():
+                yield response
+            return
+        except grpc.RpcError as e:
+            last_exception = e
+            _handle_grpc_error(e, backoff.current_backoff())
+
+    raise RuntimeError(
+        f'Failed to stream Skylet response after {max_attempts} attempts'
+    ) from last_exception
+
+
+def _handle_grpc_error(e: 'grpc.RpcError', current_backoff: float) -> None:
+    if e.code() == grpc.StatusCode.INTERNAL:
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.SkyletInternalError(e.details())
+    elif e.code() == grpc.StatusCode.UNAVAILABLE:
+        time.sleep(current_backoff)
+    elif e.code() == grpc.StatusCode.UNIMPLEMENTED or e.code(
+    ) == grpc.StatusCode.UNKNOWN:
+        # Handle backwards compatibility: old server doesn't implement this RPC.
+        # Let the caller fall back to legacy execution.
+        raise exceptions.SkyletMethodNotImplementedError(
+            f'gRPC method not implemented on server, falling back to legacy execution: {e.details()}'
+        )
+    else:
+        raise e