skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff shows the changes between two publicly released versions of this package, as they appear in their public registry. It is provided for informational purposes only.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/backends/backend_utils.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""Util constants/functions for the backends."""
|
|
2
|
+
import asyncio
|
|
2
3
|
from datetime import datetime
|
|
3
4
|
import enum
|
|
4
5
|
import fnmatch
|
|
@@ -6,20 +7,24 @@ import hashlib
|
|
|
6
7
|
import os
|
|
7
8
|
import pathlib
|
|
8
9
|
import pprint
|
|
10
|
+
import queue as queue_lib
|
|
9
11
|
import re
|
|
10
12
|
import shlex
|
|
11
13
|
import subprocess
|
|
12
14
|
import sys
|
|
13
15
|
import tempfile
|
|
16
|
+
import threading
|
|
14
17
|
import time
|
|
15
18
|
import typing
|
|
16
|
-
from typing import (Any, Callable, Dict, List, Optional, Sequence,
|
|
17
|
-
TypeVar, Union)
|
|
19
|
+
from typing import (Any, Callable, Dict, Iterator, List, Optional, Sequence,
|
|
20
|
+
Set, Tuple, TypeVar, Union)
|
|
18
21
|
import uuid
|
|
19
22
|
|
|
23
|
+
import aiohttp
|
|
24
|
+
from aiohttp import ClientTimeout
|
|
25
|
+
from aiohttp import TCPConnector
|
|
20
26
|
import colorama
|
|
21
27
|
from packaging import version
|
|
22
|
-
import psutil
|
|
23
28
|
from typing_extensions import Literal
|
|
24
29
|
|
|
25
30
|
import sky
|
|
@@ -43,10 +48,12 @@ from sky.server.requests import requests as requests_lib
|
|
|
43
48
|
from sky.skylet import autostop_lib
|
|
44
49
|
from sky.skylet import constants
|
|
45
50
|
from sky.usage import usage_lib
|
|
51
|
+
from sky.utils import auth_utils
|
|
46
52
|
from sky.utils import cluster_utils
|
|
47
53
|
from sky.utils import command_runner
|
|
48
54
|
from sky.utils import common
|
|
49
55
|
from sky.utils import common_utils
|
|
56
|
+
from sky.utils import context as context_lib
|
|
50
57
|
from sky.utils import context_utils
|
|
51
58
|
from sky.utils import controller_utils
|
|
52
59
|
from sky.utils import env_options
|
|
@@ -60,6 +67,7 @@ from sky.utils import subprocess_utils
|
|
|
60
67
|
from sky.utils import tempstore
|
|
61
68
|
from sky.utils import timeline
|
|
62
69
|
from sky.utils import ux_utils
|
|
70
|
+
from sky.utils import volume as volume_utils
|
|
63
71
|
from sky.utils import yaml_utils
|
|
64
72
|
from sky.workspaces import core as workspaces_core
|
|
65
73
|
|
|
@@ -75,7 +83,6 @@ if typing.TYPE_CHECKING:
|
|
|
75
83
|
from sky import task as task_lib
|
|
76
84
|
from sky.backends import cloud_vm_ray_backend
|
|
77
85
|
from sky.backends import local_docker_backend
|
|
78
|
-
from sky.utils import volume as volume_lib
|
|
79
86
|
else:
|
|
80
87
|
yaml = adaptors_common.LazyImport('yaml')
|
|
81
88
|
requests = adaptors_common.LazyImport('requests')
|
|
@@ -107,8 +114,12 @@ _LAUNCHED_RESERVED_WORKER_PATTERN = re.compile(
|
|
|
107
114
|
# 10.133.0.5: ray.worker.default,
|
|
108
115
|
_LAUNCHING_IP_PATTERN = re.compile(
|
|
109
116
|
r'({}): ray[._]worker[._](?:default|reserved)'.format(IP_ADDR_REGEX))
|
|
117
|
+
SSH_CONNECTION_ERROR_PATTERN = re.compile(
|
|
118
|
+
r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
|
|
110
119
|
_SSH_CONNECTION_TIMED_OUT_PATTERN = re.compile(r'^ssh:.*timed out$',
|
|
111
120
|
re.IGNORECASE)
|
|
121
|
+
K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
|
|
122
|
+
re.IGNORECASE)
|
|
112
123
|
_RAY_CLUSTER_NOT_FOUND_MESSAGE = 'Ray cluster is not found'
|
|
113
124
|
WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
|
|
114
125
|
|
|
@@ -131,6 +142,7 @@ _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
|
|
|
131
142
|
|
|
132
143
|
CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
|
|
133
144
|
WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
|
|
145
|
+
CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10.0
|
|
134
146
|
|
|
135
147
|
# Remote dir that holds our runtime files.
|
|
136
148
|
_REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
|
|
@@ -209,6 +221,9 @@ _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
|
|
|
209
221
|
('provider', 'availability_zone'),
|
|
210
222
|
]
|
|
211
223
|
|
|
224
|
+
_ACK_MESSAGE = 'ack'
|
|
225
|
+
_FORWARDING_FROM_MESSAGE = 'Forwarding from'
|
|
226
|
+
|
|
212
227
|
|
|
213
228
|
def is_ip(s: str) -> bool:
|
|
214
229
|
"""Returns whether this string matches IP_ADDR_REGEX."""
|
|
@@ -535,7 +550,7 @@ def get_expirable_clouds(
|
|
|
535
550
|
# get all custom contexts
|
|
536
551
|
contexts = kubernetes_utils.get_custom_config_k8s_contexts()
|
|
537
552
|
# add remote_identity of each context if it exists
|
|
538
|
-
remote_identities = None
|
|
553
|
+
remote_identities: Optional[Union[str, List[Dict[str, str]]]] = None
|
|
539
554
|
for context in contexts:
|
|
540
555
|
context_remote_identity = skypilot_config.get_effective_region_config(
|
|
541
556
|
cloud='kubernetes',
|
|
@@ -546,9 +561,11 @@ def get_expirable_clouds(
|
|
|
546
561
|
if remote_identities is None:
|
|
547
562
|
remote_identities = []
|
|
548
563
|
if isinstance(context_remote_identity, str):
|
|
564
|
+
assert isinstance(remote_identities, list)
|
|
549
565
|
remote_identities.append(
|
|
550
566
|
{context: context_remote_identity})
|
|
551
567
|
elif isinstance(context_remote_identity, list):
|
|
568
|
+
assert isinstance(remote_identities, list)
|
|
552
569
|
remote_identities.extend(context_remote_identity)
|
|
553
570
|
# add global kubernetes remote identity if it exists, if not, add default
|
|
554
571
|
global_remote_identity = skypilot_config.get_effective_region_config(
|
|
@@ -560,8 +577,10 @@ def get_expirable_clouds(
|
|
|
560
577
|
if remote_identities is None:
|
|
561
578
|
remote_identities = []
|
|
562
579
|
if isinstance(global_remote_identity, str):
|
|
580
|
+
assert isinstance(remote_identities, list)
|
|
563
581
|
remote_identities.append({'*': global_remote_identity})
|
|
564
582
|
elif isinstance(global_remote_identity, list):
|
|
583
|
+
assert isinstance(remote_identities, list)
|
|
565
584
|
remote_identities.extend(global_remote_identity)
|
|
566
585
|
if remote_identities is None:
|
|
567
586
|
remote_identities = schemas.get_default_remote_identity(
|
|
@@ -589,6 +608,11 @@ def get_expirable_clouds(
|
|
|
589
608
|
return expirable_clouds
|
|
590
609
|
|
|
591
610
|
|
|
611
|
+
def _get_volume_name(path: str, cluster_name_on_cloud: str) -> str:
|
|
612
|
+
path_hash = hashlib.md5(path.encode()).hexdigest()[:6]
|
|
613
|
+
return f'{cluster_name_on_cloud}-{path_hash}'
|
|
614
|
+
|
|
615
|
+
|
|
592
616
|
# TODO: too many things happening here - leaky abstraction. Refactor.
|
|
593
617
|
@timeline.event
|
|
594
618
|
def write_cluster_config(
|
|
@@ -602,7 +626,7 @@ def write_cluster_config(
|
|
|
602
626
|
zones: Optional[List[clouds.Zone]] = None,
|
|
603
627
|
dryrun: bool = False,
|
|
604
628
|
keep_launch_fields_in_existing_config: bool = True,
|
|
605
|
-
volume_mounts: Optional[List['
|
|
629
|
+
volume_mounts: Optional[List['volume_utils.VolumeMount']] = None,
|
|
606
630
|
) -> Dict[str, str]:
|
|
607
631
|
"""Fills in cluster configuration templates and writes them out.
|
|
608
632
|
|
|
@@ -705,11 +729,15 @@ def write_cluster_config(
|
|
|
705
729
|
'is not supported by this cloud. Remove the config or set: '
|
|
706
730
|
'`remote_identity: LOCAL_CREDENTIALS`.')
|
|
707
731
|
if isinstance(cloud, clouds.Kubernetes):
|
|
708
|
-
|
|
732
|
+
allowed_contexts = skypilot_config.get_workspace_cloud(
|
|
733
|
+
'kubernetes').get('allowed_contexts', None)
|
|
734
|
+
if allowed_contexts is None:
|
|
735
|
+
allowed_contexts = skypilot_config.get_effective_region_config(
|
|
709
736
|
cloud='kubernetes',
|
|
710
737
|
region=None,
|
|
711
738
|
keys=('allowed_contexts',),
|
|
712
|
-
default_value=None)
|
|
739
|
+
default_value=None)
|
|
740
|
+
if allowed_contexts is None:
|
|
713
741
|
excluded_clouds.add(cloud)
|
|
714
742
|
else:
|
|
715
743
|
excluded_clouds.add(cloud)
|
|
@@ -733,7 +761,7 @@ def write_cluster_config(
|
|
|
733
761
|
assert k not in credentials, f'{k} already in credentials'
|
|
734
762
|
credentials[k] = v
|
|
735
763
|
|
|
736
|
-
private_key_path, _ =
|
|
764
|
+
private_key_path, _ = auth_utils.get_or_generate_keys()
|
|
737
765
|
auth_config = {'ssh_private_key': private_key_path}
|
|
738
766
|
region_name = resources_vars.get('region')
|
|
739
767
|
|
|
@@ -767,6 +795,55 @@ def write_cluster_config(
|
|
|
767
795
|
assert region_name in ssh_proxy_command_config, (
|
|
768
796
|
region_name, ssh_proxy_command_config)
|
|
769
797
|
ssh_proxy_command = ssh_proxy_command_config[region_name]
|
|
798
|
+
|
|
799
|
+
use_internal_ips = skypilot_config.get_effective_region_config(
|
|
800
|
+
cloud=str(cloud).lower(),
|
|
801
|
+
region=region.name,
|
|
802
|
+
keys=('use_internal_ips',),
|
|
803
|
+
default_value=False)
|
|
804
|
+
if isinstance(cloud, clouds.AWS):
|
|
805
|
+
# If the use_ssm flag is set to true, we use the ssm proxy command.
|
|
806
|
+
use_ssm = skypilot_config.get_effective_region_config(
|
|
807
|
+
cloud=str(cloud).lower(),
|
|
808
|
+
region=region.name,
|
|
809
|
+
keys=('use_ssm',),
|
|
810
|
+
default_value=None)
|
|
811
|
+
|
|
812
|
+
if use_ssm and ssh_proxy_command is not None:
|
|
813
|
+
raise exceptions.InvalidCloudConfigs(
|
|
814
|
+
'use_ssm is set to true, but ssh_proxy_command '
|
|
815
|
+
f'is already set to {ssh_proxy_command!r}. Please remove '
|
|
816
|
+
'ssh_proxy_command or set use_ssm to false.')
|
|
817
|
+
|
|
818
|
+
if use_internal_ips and ssh_proxy_command is None:
|
|
819
|
+
# Only if use_ssm is explicitly not set, we default to using SSM.
|
|
820
|
+
if use_ssm is None:
|
|
821
|
+
logger.warning(
|
|
822
|
+
f'{colorama.Fore.YELLOW}'
|
|
823
|
+
'use_internal_ips is set to true, '
|
|
824
|
+
'but ssh_proxy_command is not set. Defaulting to '
|
|
825
|
+
'using SSM. Specify ssh_proxy_command to use a different '
|
|
826
|
+
'https://docs.skypilot.co/en/latest/reference/config.html#'
|
|
827
|
+
f'aws.ssh_proxy_command.{colorama.Style.RESET_ALL}')
|
|
828
|
+
use_ssm = True
|
|
829
|
+
|
|
830
|
+
if use_ssm:
|
|
831
|
+
aws_profile = os.environ.get('AWS_PROFILE', None)
|
|
832
|
+
profile_str = f'--profile {aws_profile}' if aws_profile else ''
|
|
833
|
+
ip_address_filter = ('Name=private-ip-address,Values=%h'
|
|
834
|
+
if use_internal_ips else
|
|
835
|
+
'Name=ip-address,Values=%h')
|
|
836
|
+
get_instance_id_command = 'aws ec2 describe-instances ' + \
|
|
837
|
+
f'--region {region_name} --filters {ip_address_filter} ' + \
|
|
838
|
+
'--query \"Reservations[].Instances[].InstanceId\" ' + \
|
|
839
|
+
f'{profile_str} --output text'
|
|
840
|
+
ssm_proxy_command = 'aws ssm start-session --target ' + \
|
|
841
|
+
f'\"$({get_instance_id_command})\" ' + \
|
|
842
|
+
f'--region {region_name} {profile_str} ' + \
|
|
843
|
+
'--document-name AWS-StartSSHSession ' + \
|
|
844
|
+
'--parameters portNumber=%p'
|
|
845
|
+
ssh_proxy_command = ssm_proxy_command
|
|
846
|
+
region_name = 'ssm-session'
|
|
770
847
|
logger.debug(f'Using ssh_proxy_command: {ssh_proxy_command!r}')
|
|
771
848
|
|
|
772
849
|
# User-supplied global instance tags from ~/.sky/config.yaml.
|
|
@@ -783,12 +860,6 @@ def write_cluster_config(
|
|
|
783
860
|
if to_provision.labels:
|
|
784
861
|
labels.update(to_provision.labels)
|
|
785
862
|
|
|
786
|
-
# Dump the Ray ports to a file for Ray job submission
|
|
787
|
-
dump_port_command = (
|
|
788
|
-
f'{constants.SKY_PYTHON_CMD} -c \'import json, os; json.dump({constants.SKY_REMOTE_RAY_PORT_DICT_STR}, '
|
|
789
|
-
f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", encoding="utf-8"))\''
|
|
790
|
-
)
|
|
791
|
-
|
|
792
863
|
# We disable conda auto-activation if the user has specified a docker image
|
|
793
864
|
# to use, which is likely to already have a conda environment activated.
|
|
794
865
|
conda_auto_activate = ('true' if to_provision.extract_docker_image() is None
|
|
@@ -804,14 +875,24 @@ def write_cluster_config(
|
|
|
804
875
|
cluster_name)
|
|
805
876
|
|
|
806
877
|
volume_mount_vars = []
|
|
878
|
+
ephemeral_volume_mount_vars = []
|
|
807
879
|
if volume_mounts is not None:
|
|
808
880
|
for vol in volume_mounts:
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
881
|
+
if vol.is_ephemeral:
|
|
882
|
+
volume_name = _get_volume_name(vol.path, cluster_name_on_cloud)
|
|
883
|
+
vol.volume_name = volume_name
|
|
884
|
+
vol.volume_config.cloud = repr(cloud)
|
|
885
|
+
vol.volume_config.region = region.name
|
|
886
|
+
vol.volume_config.name = volume_name
|
|
887
|
+
ephemeral_volume_mount_vars.append(vol.to_yaml_config())
|
|
888
|
+
else:
|
|
889
|
+
volume_info = volume_utils.VolumeInfo(
|
|
890
|
+
name=vol.volume_name,
|
|
891
|
+
path=vol.path,
|
|
892
|
+
volume_name_on_cloud=vol.volume_config.name_on_cloud,
|
|
893
|
+
volume_id_on_cloud=vol.volume_config.id_on_cloud,
|
|
894
|
+
)
|
|
895
|
+
volume_mount_vars.append(volume_info)
|
|
815
896
|
|
|
816
897
|
runcmd = skypilot_config.get_effective_region_config(
|
|
817
898
|
cloud=str(to_provision.cloud).lower(),
|
|
@@ -875,12 +956,14 @@ def write_cluster_config(
|
|
|
875
956
|
'{sky_wheel_hash}',
|
|
876
957
|
wheel_hash).replace('{cloud}',
|
|
877
958
|
str(cloud).lower()),
|
|
959
|
+
'copy_skypilot_templates_commands':
|
|
960
|
+
constants.COPY_SKYPILOT_TEMPLATES_COMMANDS,
|
|
878
961
|
# Port of Ray (GCS server).
|
|
879
962
|
# Ray's default port 6379 is conflicted with Redis.
|
|
880
963
|
'ray_port': constants.SKY_REMOTE_RAY_PORT,
|
|
881
964
|
'ray_dashboard_port': constants.SKY_REMOTE_RAY_DASHBOARD_PORT,
|
|
882
965
|
'ray_temp_dir': constants.SKY_REMOTE_RAY_TEMPDIR,
|
|
883
|
-
'dump_port_command':
|
|
966
|
+
'dump_port_command': instance_setup.DUMP_RAY_PORTS,
|
|
884
967
|
# Sky-internal constants.
|
|
885
968
|
'sky_ray_cmd': constants.SKY_RAY_CMD,
|
|
886
969
|
# pip install needs to have python env activated to make sure
|
|
@@ -917,9 +1000,10 @@ def write_cluster_config(
|
|
|
917
1000
|
|
|
918
1001
|
# Volume mounts
|
|
919
1002
|
'volume_mounts': volume_mount_vars,
|
|
1003
|
+
'ephemeral_volume_mounts': ephemeral_volume_mount_vars,
|
|
920
1004
|
|
|
921
|
-
# runcmd to
|
|
922
|
-
#
|
|
1005
|
+
# runcmd to run before any of the SkyPilot runtime setup commands.
|
|
1006
|
+
# This is currently only used by AWS and Kubernetes.
|
|
923
1007
|
'runcmd': runcmd,
|
|
924
1008
|
}),
|
|
925
1009
|
output_path=tmp_yaml_path)
|
|
@@ -974,11 +1058,7 @@ def write_cluster_config(
|
|
|
974
1058
|
with open(tmp_yaml_path, 'w', encoding='utf-8') as f:
|
|
975
1059
|
f.write(restored_yaml_content)
|
|
976
1060
|
|
|
977
|
-
|
|
978
|
-
# compatbility restortion above into account.
|
|
979
|
-
# TODO: remove this after 2 minor releases, 0.10.0.
|
|
980
|
-
yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
|
|
981
|
-
config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
|
|
1061
|
+
config_dict['cluster_name_on_cloud'] = cluster_name_on_cloud
|
|
982
1062
|
|
|
983
1063
|
# Make sure to do this before we optimize file mounts. Optimization is
|
|
984
1064
|
# non-deterministic, but everything else before this point should be
|
|
@@ -1053,6 +1133,12 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
|
|
|
1053
1133
|
config = auth.setup_fluidstack_authentication(config)
|
|
1054
1134
|
elif isinstance(cloud, clouds.Hyperbolic):
|
|
1055
1135
|
config = auth.setup_hyperbolic_authentication(config)
|
|
1136
|
+
elif isinstance(cloud, clouds.Shadeform):
|
|
1137
|
+
config = auth.setup_shadeform_authentication(config)
|
|
1138
|
+
elif isinstance(cloud, clouds.PrimeIntellect):
|
|
1139
|
+
config = auth.setup_primeintellect_authentication(config)
|
|
1140
|
+
elif isinstance(cloud, clouds.Seeweb):
|
|
1141
|
+
config = auth.setup_seeweb_authentication(config)
|
|
1056
1142
|
else:
|
|
1057
1143
|
assert False, cloud
|
|
1058
1144
|
yaml_utils.dump_yaml(tmp_yaml_path, config)
|
|
@@ -1155,7 +1241,6 @@ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
|
|
|
1155
1241
|
Rather than constructing the whole byte sequence, which may be quite large,
|
|
1156
1242
|
we construct it incrementally by using hash.update() to add new bytes.
|
|
1157
1243
|
"""
|
|
1158
|
-
|
|
1159
1244
|
# Load the yaml contents so that we can directly remove keys.
|
|
1160
1245
|
yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
|
|
1161
1246
|
for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
|
|
@@ -1738,6 +1823,32 @@ def check_network_connection():
|
|
|
1738
1823
|
'Network seems down.')
|
|
1739
1824
|
|
|
1740
1825
|
|
|
1826
|
+
async def async_check_network_connection():
|
|
1827
|
+
"""Check if the network connection is available.
|
|
1828
|
+
|
|
1829
|
+
Tolerates 3 retries as it is observed that connections can fail.
|
|
1830
|
+
Uses aiohttp for async HTTP requests.
|
|
1831
|
+
"""
|
|
1832
|
+
# Create a session with retry logic
|
|
1833
|
+
timeout = ClientTimeout(total=15)
|
|
1834
|
+
connector = TCPConnector(limit=1) # Limit to 1 connection at a time
|
|
1835
|
+
|
|
1836
|
+
async with aiohttp.ClientSession(timeout=timeout,
|
|
1837
|
+
connector=connector) as session:
|
|
1838
|
+
for i, ip in enumerate(_TEST_IP_LIST):
|
|
1839
|
+
try:
|
|
1840
|
+
async with session.head(ip) as response:
|
|
1841
|
+
if response.status < 400: # Any 2xx or 3xx status is good
|
|
1842
|
+
return
|
|
1843
|
+
except (aiohttp.ClientError, asyncio.TimeoutError) as e:
|
|
1844
|
+
if i == len(_TEST_IP_LIST) - 1:
|
|
1845
|
+
raise exceptions.NetworkError(
|
|
1846
|
+
'Could not refresh the cluster. '
|
|
1847
|
+
'Network seems down.') from e
|
|
1848
|
+
# If not the last IP, continue to try the next one
|
|
1849
|
+
continue
|
|
1850
|
+
|
|
1851
|
+
|
|
1741
1852
|
@timeline.event
|
|
1742
1853
|
def check_owner_identity(cluster_name: str) -> None:
|
|
1743
1854
|
"""Check if current user is the same as the user who created the cluster.
|
|
@@ -1750,9 +1861,18 @@ def check_owner_identity(cluster_name: str) -> None:
|
|
|
1750
1861
|
"""
|
|
1751
1862
|
if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
|
|
1752
1863
|
return
|
|
1753
|
-
record = global_user_state.get_cluster_from_name(cluster_name
|
|
1864
|
+
record = global_user_state.get_cluster_from_name(cluster_name,
|
|
1865
|
+
include_user_info=False,
|
|
1866
|
+
summary_response=True)
|
|
1754
1867
|
if record is None:
|
|
1755
1868
|
return
|
|
1869
|
+
_check_owner_identity_with_record(cluster_name, record)
|
|
1870
|
+
|
|
1871
|
+
|
|
1872
|
+
def _check_owner_identity_with_record(cluster_name: str,
|
|
1873
|
+
record: Dict[str, Any]) -> None:
|
|
1874
|
+
if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
|
|
1875
|
+
return
|
|
1756
1876
|
handle = record['handle']
|
|
1757
1877
|
if not isinstance(handle, backends.CloudVmRayResourceHandle):
|
|
1758
1878
|
return
|
|
@@ -1837,8 +1957,10 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:
|
|
|
1837
1957
|
}
|
|
1838
1958
|
|
|
1839
1959
|
|
|
1960
|
+
@context_utils.cancellation_guard
|
|
1840
1961
|
def _query_cluster_status_via_cloud_api(
|
|
1841
|
-
handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
|
|
1962
|
+
handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
|
|
1963
|
+
retry_if_missing: bool,
|
|
1842
1964
|
) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
|
|
1843
1965
|
"""Returns the status of the cluster as a list of tuples corresponding
|
|
1844
1966
|
to the node status and an optional reason string for said status.
|
|
@@ -1865,8 +1987,11 @@ def _query_cluster_status_via_cloud_api(
|
|
|
1865
1987
|
cloud_name = repr(handle.launched_resources.cloud)
|
|
1866
1988
|
try:
|
|
1867
1989
|
node_status_dict = provision_lib.query_instances(
|
|
1868
|
-
cloud_name,
|
|
1869
|
-
|
|
1990
|
+
cloud_name,
|
|
1991
|
+
cluster_name,
|
|
1992
|
+
cluster_name_on_cloud,
|
|
1993
|
+
provider_config,
|
|
1994
|
+
retry_if_missing=retry_if_missing)
|
|
1870
1995
|
logger.debug(f'Querying {cloud_name} cluster '
|
|
1871
1996
|
f'{cluster_name_in_hint} '
|
|
1872
1997
|
f'status:\n{pprint.pformat(node_status_dict)}')
|
|
@@ -2044,7 +2169,12 @@ def check_can_clone_disk_and_override_task(
|
|
|
2044
2169
|
return task, handle
|
|
2045
2170
|
|
|
2046
2171
|
|
|
2047
|
-
def _update_cluster_status(
|
|
2172
|
+
def _update_cluster_status(
|
|
2173
|
+
cluster_name: str,
|
|
2174
|
+
record: Dict[str, Any],
|
|
2175
|
+
retry_if_missing: bool,
|
|
2176
|
+
include_user_info: bool = True,
|
|
2177
|
+
summary_response: bool = False) -> Optional[Dict[str, Any]]:
|
|
2048
2178
|
"""Update the cluster status.
|
|
2049
2179
|
|
|
2050
2180
|
The cluster status is updated by checking ray cluster and real status from
|
|
@@ -2071,9 +2201,6 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2071
2201
|
fetched from the cloud provider or there are leaked nodes causing
|
|
2072
2202
|
the node number larger than expected.
|
|
2073
2203
|
"""
|
|
2074
|
-
record = global_user_state.get_cluster_from_name(cluster_name)
|
|
2075
|
-
if record is None:
|
|
2076
|
-
return None
|
|
2077
2204
|
handle = record['handle']
|
|
2078
2205
|
if handle.cluster_yaml is None:
|
|
2079
2206
|
# Remove cluster from db since this cluster does not have a config file
|
|
@@ -2092,7 +2219,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2092
2219
|
return record
|
|
2093
2220
|
cluster_name = handle.cluster_name
|
|
2094
2221
|
|
|
2095
|
-
node_statuses = _query_cluster_status_via_cloud_api(
|
|
2222
|
+
node_statuses = _query_cluster_status_via_cloud_api(
|
|
2223
|
+
handle, retry_if_missing=retry_if_missing)
|
|
2096
2224
|
|
|
2097
2225
|
all_nodes_up = (all(status[0] == status_lib.ClusterStatus.UP
|
|
2098
2226
|
for status in node_statuses) and
|
|
@@ -2140,6 +2268,11 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2140
2268
|
total_nodes = handle.launched_nodes * handle.num_ips_per_node
|
|
2141
2269
|
|
|
2142
2270
|
cloud_name = repr(handle.launched_resources.cloud).lower()
|
|
2271
|
+
# Initialize variables in case all retries fail
|
|
2272
|
+
ready_head = 0
|
|
2273
|
+
ready_workers = 0
|
|
2274
|
+
output = ''
|
|
2275
|
+
stderr = ''
|
|
2143
2276
|
for i in range(5):
|
|
2144
2277
|
try:
|
|
2145
2278
|
ready_head, ready_workers, output, stderr = (
|
|
@@ -2240,12 +2373,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2240
2373
|
'All nodes up; SkyPilot runtime healthy.',
|
|
2241
2374
|
global_user_state.ClusterEventType.STATUS_CHANGE,
|
|
2242
2375
|
nop_if_duplicate=True)
|
|
2243
|
-
global_user_state.add_or_update_cluster(
|
|
2244
|
-
|
|
2245
|
-
|
|
2246
|
-
|
|
2247
|
-
|
|
2248
|
-
|
|
2376
|
+
global_user_state.add_or_update_cluster(
|
|
2377
|
+
cluster_name,
|
|
2378
|
+
handle,
|
|
2379
|
+
requested_resources=None,
|
|
2380
|
+
ready=True,
|
|
2381
|
+
is_launch=False,
|
|
2382
|
+
existing_cluster_hash=record['cluster_hash'])
|
|
2383
|
+
return global_user_state.get_cluster_from_name(
|
|
2384
|
+
cluster_name,
|
|
2385
|
+
include_user_info=include_user_info,
|
|
2386
|
+
summary_response=summary_response)
|
|
2249
2387
|
|
|
2250
2388
|
# All cases below are transitioning the cluster to non-UP states.
|
|
2251
2389
|
launched_resources = handle.launched_resources.assert_launchable()
|
|
@@ -2262,7 +2400,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2262
2400
|
# and check again. This is a best-effort leak prevention check.
|
|
2263
2401
|
# See https://github.com/skypilot-org/skypilot/issues/4431.
|
|
2264
2402
|
time.sleep(_LAUNCH_DOUBLE_CHECK_DELAY)
|
|
2265
|
-
node_statuses = _query_cluster_status_via_cloud_api(
|
|
2403
|
+
node_statuses = _query_cluster_status_via_cloud_api(
|
|
2404
|
+
handle, retry_if_missing=False)
|
|
2266
2405
|
# Note: even if all the node_statuses are UP now, we will still
|
|
2267
2406
|
# consider this cluster abnormal, and its status will be INIT.
|
|
2268
2407
|
|
|
@@ -2450,12 +2589,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2450
2589
|
global_user_state.ClusterEventType.STATUS_CHANGE,
|
|
2451
2590
|
nop_if_duplicate=True,
|
|
2452
2591
|
duplicate_regex=init_reason_regex)
|
|
2453
|
-
global_user_state.add_or_update_cluster(
|
|
2454
|
-
|
|
2455
|
-
|
|
2456
|
-
|
|
2457
|
-
|
|
2458
|
-
|
|
2592
|
+
global_user_state.add_or_update_cluster(
|
|
2593
|
+
cluster_name,
|
|
2594
|
+
handle,
|
|
2595
|
+
requested_resources=None,
|
|
2596
|
+
ready=False,
|
|
2597
|
+
is_launch=False,
|
|
2598
|
+
existing_cluster_hash=record['cluster_hash'])
|
|
2599
|
+
return global_user_state.get_cluster_from_name(
|
|
2600
|
+
cluster_name,
|
|
2601
|
+
include_user_info=include_user_info,
|
|
2602
|
+
summary_response=summary_response)
|
|
2459
2603
|
# Now is_abnormal is False: either node_statuses is empty or all nodes are
|
|
2460
2604
|
# STOPPED.
|
|
2461
2605
|
verb = 'terminated' if to_terminate else 'stopped'
|
|
@@ -2470,7 +2614,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2470
2614
|
nop_if_duplicate=True,
|
|
2471
2615
|
)
|
|
2472
2616
|
backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
|
|
2473
|
-
return global_user_state.get_cluster_from_name(
|
|
2617
|
+
return global_user_state.get_cluster_from_name(
|
|
2618
|
+
cluster_name,
|
|
2619
|
+
include_user_info=include_user_info,
|
|
2620
|
+
summary_response=summary_response)
|
|
2474
2621
|
|
|
2475
2622
|
|
|
2476
2623
|
def _must_refresh_cluster_status(
|
|
@@ -2492,12 +2639,14 @@ def _must_refresh_cluster_status(
|
|
|
2492
2639
|
|
|
2493
2640
|
|
|
2494
2641
|
def refresh_cluster_record(
|
|
2495
|
-
|
|
2496
|
-
|
|
2497
|
-
|
|
2498
|
-
|
|
2499
|
-
|
|
2500
|
-
|
|
2642
|
+
cluster_name: str,
|
|
2643
|
+
*,
|
|
2644
|
+
force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
|
|
2645
|
+
cluster_lock_already_held: bool = False,
|
|
2646
|
+
cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
|
|
2647
|
+
include_user_info: bool = True,
|
|
2648
|
+
summary_response: bool = False,
|
|
2649
|
+
retry_if_missing: bool = True) -> Optional[Dict[str, Any]]:
|
|
2501
2650
|
"""Refresh the cluster, and return the possibly updated record.
|
|
2502
2651
|
|
|
2503
2652
|
The function will update the cached cluster status in the global state. For
|
|
@@ -2514,14 +2663,20 @@ def refresh_cluster_record(
|
|
|
2514
2663
|
_CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
|
|
2515
2664
|
1. the cluster is a spot cluster, or
|
|
2516
2665
|
2. cluster autostop is set and the cluster is not STOPPED.
|
|
2517
|
-
|
|
2518
|
-
|
|
2519
|
-
|
|
2666
|
+
cluster_lock_already_held: Whether the caller is already holding the
|
|
2667
|
+
per-cluster lock. You MUST NOT set this to True if the caller does not
|
|
2668
|
+
already hold the lock. If True, we will not acquire the lock before
|
|
2669
|
+
updating the status. Failing to hold the lock while updating the
|
|
2670
|
+
status can lead to correctness issues - e.g. an launch in-progress may
|
|
2671
|
+
appear to be DOWN incorrectly. Even if this is set to False, the lock
|
|
2672
|
+
may not be acquired if the status does not need to be refreshed.
|
|
2520
2673
|
cluster_status_lock_timeout: The timeout to acquire the per-cluster
|
|
2521
2674
|
lock. If timeout, the function will use the cached status. If the
|
|
2522
2675
|
value is <0, do not timeout (wait for the lock indefinitely). By
|
|
2523
2676
|
default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS. Warning:
|
|
2524
2677
|
if correctness is required, you must set this to -1.
|
|
2678
|
+
retry_if_missing: Whether to retry the call to the cloud api if the
|
|
2679
|
+
cluster is not found when querying the live status on the cloud.
|
|
2525
2680
|
|
|
2526
2681
|
Returns:
|
|
2527
2682
|
If the cluster is terminated or does not exist, return None.
|
|
@@ -2537,17 +2692,20 @@ def refresh_cluster_record(
|
|
|
2537
2692
|
the node number larger than expected.
|
|
2538
2693
|
"""
|
|
2539
2694
|
|
|
2540
|
-
|
|
2695
|
+
ctx = context_lib.get()
|
|
2696
|
+
record = global_user_state.get_cluster_from_name(
|
|
2697
|
+
cluster_name,
|
|
2698
|
+
include_user_info=include_user_info,
|
|
2699
|
+
summary_response=summary_response)
|
|
2541
2700
|
if record is None:
|
|
2542
2701
|
return None
|
|
2543
2702
|
# TODO(zhwu, 05/20): switch to the specific workspace to make sure we are
|
|
2544
2703
|
# using the correct cloud credentials.
|
|
2545
2704
|
workspace = record.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
|
|
2546
2705
|
with skypilot_config.local_active_workspace_ctx(workspace):
|
|
2547
|
-
check_owner_identity
|
|
2548
|
-
|
|
2549
|
-
|
|
2550
|
-
return record
|
|
2706
|
+
# check_owner_identity returns if the record handle is
|
|
2707
|
+
# not a CloudVmRayResourceHandle
|
|
2708
|
+
_check_owner_identity_with_record(cluster_name, record)
|
|
2551
2709
|
|
|
2552
2710
|
# The loop logic allows us to notice if the status was updated in the
|
|
2553
2711
|
# global_user_state by another process and stop trying to get the lock.
|
|
@@ -2556,12 +2714,18 @@ def refresh_cluster_record(
|
|
|
2556
2714
|
|
|
2557
2715
|
# Loop until we have an up-to-date status or until we acquire the lock.
|
|
2558
2716
|
while True:
|
|
2717
|
+
# Check if the context is canceled.
|
|
2718
|
+
if ctx is not None and ctx.is_canceled():
|
|
2719
|
+
raise asyncio.CancelledError()
|
|
2559
2720
|
# Check to see if we can return the cached status.
|
|
2560
2721
|
if not _must_refresh_cluster_status(record, force_refresh_statuses):
|
|
2561
2722
|
return record
|
|
2562
2723
|
|
|
2563
|
-
if
|
|
2564
|
-
return _update_cluster_status(cluster_name
|
|
2724
|
+
if cluster_lock_already_held:
|
|
2725
|
+
return _update_cluster_status(cluster_name, record,
|
|
2726
|
+
retry_if_missing,
|
|
2727
|
+
include_user_info,
|
|
2728
|
+
summary_response)
|
|
2565
2729
|
|
|
2566
2730
|
# Try to acquire the lock so we can fetch the status.
|
|
2567
2731
|
try:
|
|
@@ -2569,12 +2733,17 @@ def refresh_cluster_record(
|
|
|
2569
2733
|
# Check the cluster status again, since it could have been
|
|
2570
2734
|
# updated between our last check and acquiring the lock.
|
|
2571
2735
|
record = global_user_state.get_cluster_from_name(
|
|
2572
|
-
cluster_name
|
|
2736
|
+
cluster_name,
|
|
2737
|
+
include_user_info=include_user_info,
|
|
2738
|
+
summary_response=summary_response)
|
|
2573
2739
|
if record is None or not _must_refresh_cluster_status(
|
|
2574
2740
|
record, force_refresh_statuses):
|
|
2575
2741
|
return record
|
|
2576
2742
|
# Update and return the cluster status.
|
|
2577
|
-
return _update_cluster_status(cluster_name
|
|
2743
|
+
return _update_cluster_status(cluster_name, record,
|
|
2744
|
+
retry_if_missing,
|
|
2745
|
+
include_user_info,
|
|
2746
|
+
summary_response)
|
|
2578
2747
|
|
|
2579
2748
|
except locks.LockTimeout:
|
|
2580
2749
|
# lock.acquire() will throw a Timeout exception if the lock is not
|
|
@@ -2592,10 +2761,13 @@ def refresh_cluster_record(
|
|
|
2592
2761
|
'Refreshing status: Failed get the lock for cluster '
|
|
2593
2762
|
f'{cluster_name!r}. Using the cached status.')
|
|
2594
2763
|
return record
|
|
2595
|
-
time.sleep(
|
|
2764
|
+
time.sleep(lock.poll_interval)
|
|
2596
2765
|
|
|
2597
2766
|
# Refresh for next loop iteration.
|
|
2598
|
-
record = global_user_state.get_cluster_from_name(
|
|
2767
|
+
record = global_user_state.get_cluster_from_name(
|
|
2768
|
+
cluster_name,
|
|
2769
|
+
include_user_info=include_user_info,
|
|
2770
|
+
summary_response=summary_response)
|
|
2599
2771
|
if record is None:
|
|
2600
2772
|
return None
|
|
2601
2773
|
|
|
@@ -2606,8 +2778,9 @@ def refresh_cluster_status_handle(
|
|
|
2606
2778
|
cluster_name: str,
|
|
2607
2779
|
*,
|
|
2608
2780
|
force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
|
|
2609
|
-
|
|
2610
|
-
cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
|
|
2781
|
+
cluster_lock_already_held: bool = False,
|
|
2782
|
+
cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
|
|
2783
|
+
retry_if_missing: bool = True,
|
|
2611
2784
|
) -> Tuple[Optional[status_lib.ClusterStatus],
|
|
2612
2785
|
Optional[backends.ResourceHandle]]:
|
|
2613
2786
|
"""Refresh the cluster, and return the possibly updated status and handle.
|
|
@@ -2619,8 +2792,11 @@ def refresh_cluster_status_handle(
|
|
|
2619
2792
|
record = refresh_cluster_record(
|
|
2620
2793
|
cluster_name,
|
|
2621
2794
|
force_refresh_statuses=force_refresh_statuses,
|
|
2622
|
-
|
|
2623
|
-
cluster_status_lock_timeout=cluster_status_lock_timeout
|
|
2795
|
+
cluster_lock_already_held=cluster_lock_already_held,
|
|
2796
|
+
cluster_status_lock_timeout=cluster_status_lock_timeout,
|
|
2797
|
+
include_user_info=False,
|
|
2798
|
+
summary_response=True,
|
|
2799
|
+
retry_if_missing=retry_if_missing)
|
|
2624
2800
|
if record is None:
|
|
2625
2801
|
return None, None
|
|
2626
2802
|
return record['status'], record['handle']
|
|
@@ -2671,7 +2847,9 @@ def check_cluster_available(
|
|
|
2671
2847
|
exceptions.CloudUserIdentityError: if we fail to get the current user
|
|
2672
2848
|
identity.
|
|
2673
2849
|
"""
|
|
2674
|
-
record = global_user_state.get_cluster_from_name(cluster_name
|
|
2850
|
+
record = global_user_state.get_cluster_from_name(cluster_name,
|
|
2851
|
+
include_user_info=False,
|
|
2852
|
+
summary_response=True)
|
|
2675
2853
|
if dryrun:
|
|
2676
2854
|
assert record is not None, cluster_name
|
|
2677
2855
|
return record['handle']
|
|
@@ -2858,7 +3036,8 @@ def is_controller_accessible(
|
|
|
2858
3036
|
f'fatal, but {controller_name} commands/calls may hang or return '
|
|
2859
3037
|
'stale information, when the controller is not up.\n'
|
|
2860
3038
|
f' Details: {common_utils.format_exception(e, use_bracket=True)}')
|
|
2861
|
-
record = global_user_state.get_cluster_from_name(
|
|
3039
|
+
record = global_user_state.get_cluster_from_name(
|
|
3040
|
+
cluster_name, include_user_info=False, summary_response=True)
|
|
2862
3041
|
if record is not None:
|
|
2863
3042
|
controller_status, handle = record['status'], record['handle']
|
|
2864
3043
|
# We check the connection even if the cluster has a cached status UP
|
|
@@ -2915,22 +3094,96 @@ class CloudFilter(enum.Enum):
     LOCAL = 'local'


-def _get_glob_clusters(
+def _get_glob_clusters(
+        clusters: List[str],
+        silent: bool = False,
+        workspaces_filter: Optional[Dict[str, Any]] = None) -> List[str]:
     """Returns a list of clusters that match the glob pattern."""
     glob_clusters = []
     for cluster in clusters:
-        glob_cluster = global_user_state.get_glob_cluster_names(
+        glob_cluster = global_user_state.get_glob_cluster_names(
+            cluster, workspaces_filter=workspaces_filter)
         if len(glob_cluster) == 0 and not silent:
             logger.info(f'Cluster {cluster} not found.')
         glob_clusters.extend(glob_cluster)
     return list(set(glob_clusters))


+def _refresh_cluster(
+        cluster_name: str,
+        force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]],
+        include_user_info: bool = True,
+        summary_response: bool = False) -> Optional[Dict[str, Any]]:
+    try:
+        record = refresh_cluster_record(
+            cluster_name,
+            force_refresh_statuses=force_refresh_statuses,
+            cluster_lock_already_held=False,
+            include_user_info=include_user_info,
+            summary_response=summary_response)
+    except (exceptions.ClusterStatusFetchingError,
+            exceptions.CloudUserIdentityError,
+            exceptions.ClusterOwnerIdentityMismatchError) as e:
+        # Do not fail the entire refresh process. The caller will
+        # handle the 'UNKNOWN' status, and collect the errors into
+        # a table.
+        record = {'status': 'UNKNOWN', 'error': e}
+    return record
+
+
+def refresh_cluster_records() -> None:
+    """Refreshes the status of all clusters, except managed clusters.
+
+    Used by the background status refresh daemon.
+    This function is a stripped-down version of get_clusters, with only the
+    bare bones refresh logic.
+
+    Returns:
+        None
+
+    Raises:
+        None
+    """
+    # We force to exclude managed clusters to avoid multiple sources
+    # manipulating them. For example, SkyServe assumes the replica manager
+    # is the only source of truth for the cluster status.
+    cluster_names = set(
+        global_user_state.get_cluster_names(exclude_managed_clusters=True))
+
+    # TODO(syang): we should try not to leak
+    # request info in backend_utils.py.
+    # Refactor this to use some other info to
+    # determine if a launch is in progress.
+    cluster_names_with_launch_request = {
+        request.cluster_name for request in requests_lib.get_request_tasks(
+            req_filter=requests_lib.RequestTaskFilter(
+                status=[requests_lib.RequestStatus.RUNNING],
+                include_request_names=['sky.launch'],
+                fields=['cluster_name']))
+    }
+    cluster_names_without_launch_request = (cluster_names -
+                                            cluster_names_with_launch_request)
+
+    def _refresh_cluster_record(cluster_name):
+        return _refresh_cluster(cluster_name,
+                                force_refresh_statuses=set(
+                                    status_lib.ClusterStatus),
+                                include_user_info=False,
+                                summary_response=True)
+
+    if len(cluster_names_without_launch_request) > 0:
+        # Do not refresh the clusters that have an active launch request.
+        subprocess_utils.run_in_parallel(_refresh_cluster_record,
+                                         cluster_names_without_launch_request)
+
+
 def get_clusters(
     refresh: common.StatusRefreshMode,
     cluster_names: Optional[Union[str, List[str]]] = None,
     all_users: bool = True,
     include_credentials: bool = False,
+    summary_response: bool = False,
+    include_handle: bool = True,
     # Internal only:
     # pylint: disable=invalid-name
     _include_is_managed: bool = False,
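The new `refresh_cluster_records` path above skips any cluster that currently has an active `sky.launch` request and refreshes the remaining clusters in parallel. Below is a minimal, dependency-free sketch of that pattern; `get_active_launch_clusters` and `refresh_one` are hypothetical stand-ins for `requests_lib.get_request_tasks` and `refresh_cluster_record`, and a thread pool plays the role of `subprocess_utils.run_in_parallel`:

```python
from concurrent.futures import ThreadPoolExecutor
from typing import Iterable, List, Set


def get_active_launch_clusters() -> Set[str]:
    # Placeholder: in the diff this comes from requests_lib.get_request_tasks
    # filtered to RUNNING 'sky.launch' requests.
    return {'busy-cluster'}


def refresh_one(name: str) -> dict:
    # Placeholder for refresh_cluster_record(name, ...).
    return {'name': name, 'status': 'UP'}


def refresh_all(cluster_names: Iterable[str]) -> List[dict]:
    busy = get_active_launch_clusters()
    # Set difference mirrors cluster_names - cluster_names_with_launch_request.
    to_refresh = sorted(set(cluster_names) - busy)
    if not to_refresh:
        return []
    with ThreadPoolExecutor() as pool:
        return list(pool.map(refresh_one, to_refresh))


print(refresh_all(['busy-cluster', 'idle-cluster']))
```

Keeping the busy names in a set makes the exclusion a cheap membership test per cluster, which matters when the daemon wakes up with many clusters in the database.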
@@ -2958,6 +3211,23 @@ def get_clusters(
         A list of cluster records. If the cluster does not exist or has been
         terminated, the record will be omitted from the returned list.
     """
+    accessible_workspaces = workspaces_core.get_workspaces()
+    if cluster_names is not None:
+        if isinstance(cluster_names, str):
+            cluster_names = [cluster_names]
+        non_glob_cluster_names = []
+        glob_cluster_names = []
+        for cluster_name in cluster_names:
+            if ux_utils.is_glob_pattern(cluster_name):
+                glob_cluster_names.append(cluster_name)
+            else:
+                non_glob_cluster_names.append(cluster_name)
+        cluster_names = non_glob_cluster_names
+        if glob_cluster_names:
+            cluster_names += _get_glob_clusters(
+                glob_cluster_names,
+                silent=True,
+                workspaces_filter=accessible_workspaces)

     exclude_managed_clusters = False
     if not (_include_is_managed or env_options.Options.SHOW_DEBUG_INFO.get()):
@@ -2965,34 +3235,24 @@ def get_clusters(
     user_hashes_filter = None
     if not all_users:
         user_hashes_filter = {common_utils.get_current_user().id}
-    accessible_workspaces = workspaces_core.get_workspaces()
-
     records = global_user_state.get_clusters(
         exclude_managed_clusters=exclude_managed_clusters,
         user_hashes_filter=user_hashes_filter,
-        workspaces_filter=accessible_workspaces
+        workspaces_filter=accessible_workspaces,
+        cluster_names=cluster_names,
+        summary_response=summary_response)

     yellow = colorama.Fore.YELLOW
     bright = colorama.Style.BRIGHT
     reset = colorama.Style.RESET_ALL

     if cluster_names is not None:
-
-
-
-
-
-        for cluster_name in cluster_names:
-            for record in records:
-                if record['name'] == cluster_name:
-                    new_records.append(record)
-                    break
-            else:
-                not_exist_cluster_names.append(cluster_name)
-        if not_exist_cluster_names:
-            clusters_str = ', '.join(not_exist_cluster_names)
+        record_names = {record['name'] for record in records}
+        not_found_clusters = ux_utils.get_non_matched_query(
+            cluster_names, record_names)
+        if not_found_clusters:
+            clusters_str = ', '.join(not_found_clusters)
             logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
-        records = new_records

     def _get_records_with_handle(
             records: List[Optional[Dict[str, Any]]]) -> List[Dict[str, Any]]:
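The hunk above replaces the old nested-loop scan for missing clusters with a set-based lookup over the query results. A tiny sketch of the idea, assuming `ux_utils.get_non_matched_query` behaves roughly like a set difference once glob patterns have already been expanded:

```python
requested = ['dev', 'train-1', 'gone']          # names after glob expansion
record_names = {'dev', 'train-1', 'train-2'}    # names returned by the query

not_found = [name for name in requested if name not in record_names]
if not_found:
    print(f"Cluster(s) not found: {', '.join(not_found)}")
```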
@@ -3002,17 +3262,18 @@ def get_clusters(
             if record is not None and record['handle'] is not None
         ]

-    def
+    def _update_records_with_handle_info(
             records: List[Optional[Dict[str, Any]]]) -> None:
         """Add resource str to record"""
         for record in _get_records_with_handle(records):
             handle = record['handle']
-
-
-                handle,
-            record[
-
-
+            resource_str_simple, resource_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    handle, simplified_only=False))
+            record['resources_str'] = resource_str_simple
+            record['resources_str_full'] = resource_str_full
+            if not summary_response:
+                record['cluster_name_on_cloud'] = handle.cluster_name_on_cloud

     def _update_records_with_credentials(
             records: List[Optional[Dict[str, Any]]]) -> None:
@@ -3036,9 +3297,17 @@ def get_clusters(
                 expanded_private_key_path = os.path.expanduser(
                     ssh_private_key_path)
                 if not os.path.exists(expanded_private_key_path):
-
+                    success = auth_utils.create_ssh_key_files_from_db(
+                        ssh_private_key_path)
+                    if not success:
+                        # If the ssh key files are not found, we do not
+                        # update the record with credentials.
+                        logger.debug(
+                            f'SSH keys not found for cluster {record["name"]} '
+                            f'at key path {ssh_private_key_path}')
+                        continue
             else:
-                private_key_path, _ =
+                private_key_path, _ = auth_utils.get_or_generate_keys()
                 expanded_private_key_path = os.path.expanduser(private_key_path)
             if expanded_private_key_path in cached_private_keys:
                 credential['ssh_private_key_content'] = cached_private_keys[
@@ -3052,7 +3321,7 @@ def get_clusters(
             record['credentials'] = credential

     def _update_records_with_resources(
-
+            records: List[Optional[Dict[str, Any]]],) -> None:
         """Add the resources to the record."""
         for record in _get_records_with_handle(records):
             handle = record['handle']
@@ -3070,9 +3339,11 @@ def get_clusters(
             record['accelerators'] = (
                 f'{handle.launched_resources.accelerators}'
                 if handle.launched_resources.accelerators else None)
+            if not include_handle:
+                record.pop('handle', None)

-    # Add
-
+    # Add handle info to the records
+    _update_records_with_handle_info(records)
     if include_credentials:
         _update_records_with_credentials(records)
     if refresh == common.StatusRefreshMode.NONE:
@@ -3093,65 +3364,76 @@ def get_clusters(
     else:
         force_refresh_statuses = None

-    def
-
-
-
-
-
-
-
-
-
-        if len(request) > 0:
-            # There is an active launch request on the cluster,
-            # so we don't want to update the cluster status until
-            # the request is completed.
-            logger.debug(f'skipping refresh for cluster {cluster_name} '
-                         'as there is an active launch request')
-            return global_user_state.get_cluster_from_name(cluster_name)
-        try:
-            record = refresh_cluster_record(
-                cluster_name,
-                force_refresh_statuses=force_refresh_statuses,
-                acquire_per_cluster_status_lock=True)
-            _update_records_with_resources_str([record])
+    def _refresh_cluster_record(cluster_name):
+        record = _refresh_cluster(cluster_name,
+                                  force_refresh_statuses=force_refresh_statuses,
+                                  include_user_info=True,
+                                  summary_response=summary_response)
+        # record may be None if the cluster is deleted during refresh,
+        # e.g. all the Pods of a cluster on Kubernetes have been
+        # deleted before refresh.
+        if record is not None and 'error' not in record:
+            _update_records_with_handle_info([record])
             if include_credentials:
                 _update_records_with_credentials([record])
-
-                exceptions.CloudUserIdentityError,
-                exceptions.ClusterOwnerIdentityMismatchError) as e:
-            # Do not fail the entire refresh process. The caller will
-            # handle the 'UNKNOWN' status, and collect the errors into
-            # a table.
-            record = {'status': 'UNKNOWN', 'error': e}
-            progress.update(task, advance=1)
+        progress.update(task, advance=1)
         return record

     cluster_names = [record['name'] for record in records]
+    # TODO(syang): we should try not to leak
+    # request info in backend_utils.py.
+    # Refactor this to use some other info to
+    # determine if a launch is in progress.
+    cluster_names_with_launch_request = {
+        request.cluster_name for request in requests_lib.get_request_tasks(
+            req_filter=requests_lib.RequestTaskFilter(
+                status=[requests_lib.RequestStatus.RUNNING],
+                include_request_names=['sky.launch'],
+                cluster_names=cluster_names,
+                fields=['cluster_name']))
+    }
+    # Preserve the index of the cluster name as it appears on "records"
+    cluster_names_without_launch_request = [
+        (i, cluster_name)
+        for i, cluster_name in enumerate(cluster_names)
+        if cluster_name not in cluster_names_with_launch_request
+    ]
+    # for clusters that have an active launch request, we do not refresh the status
     updated_records = []
-    if len(
+    if len(cluster_names_without_launch_request) > 0:
         with progress:
             updated_records = subprocess_utils.run_in_parallel(
-
-
+                _refresh_cluster_record, [
+                    cluster_name
+                    for _, cluster_name in cluster_names_without_launch_request
+                ])
+    # Preserve the index of the cluster name as it appears on "records"
+    # before filtering for clusters being launched.
+    updated_records_dict: Dict[int, Optional[Dict[str, Any]]] = {
+        cluster_names_without_launch_request[i][0]: updated_records[i]
+        for i in range(len(cluster_names_without_launch_request))
+    }
     # Show information for removed clusters.
     kept_records = []
     autodown_clusters, remaining_clusters, failed_clusters = [], [], []
     for i, record in enumerate(records):
-        if
+        if i not in updated_records_dict:
+            # record was not refreshed, keep the original record
+            kept_records.append(record)
+            continue
+        updated_record = updated_records_dict[i]
+        if updated_record is None:
             if record['to_down']:
-                autodown_clusters.append(
+                autodown_clusters.append(record['name'])
             else:
-                remaining_clusters.append(
-        elif
-            failed_clusters.append(
-                (cluster_names[i], updated_records[i]['error']))
+                remaining_clusters.append(record['name'])
+        elif updated_record['status'] == 'UNKNOWN':
+            failed_clusters.append((record['name'], updated_record['error']))
             # Keep the original record if the status is unknown,
             # so that the user can still see the cluster.
             kept_records.append(record)
         else:
-            kept_records.append(
+            kept_records.append(updated_record)

     if autodown_clusters:
         plural = 's' if len(autodown_clusters) > 1 else ''
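In the `get_clusters` refresh loop above, only clusters without an active launch request are refreshed, and the results are keyed by their original index so skipped records keep their cached entries. A minimal sketch of that bookkeeping, with a hypothetical `refresh` standing in for `_refresh_cluster_record` and a thread pool in place of `subprocess_utils.run_in_parallel`:

```python
from concurrent.futures import ThreadPoolExecutor
from typing import Dict, List, Optional


def refresh(name: str) -> Optional[dict]:
    # Placeholder for _refresh_cluster_record(name); None would mean the
    # cluster disappeared during refresh.
    return {'name': name, 'status': 'UP'}


records: List[dict] = [{'name': 'a'}, {'name': 'b'}, {'name': 'c'}]
being_launched = {'b'}  # clusters with an active sky.launch request

# Keep (original_index, name) pairs so results can be mapped back.
to_refresh = [(i, r['name'])
              for i, r in enumerate(records)
              if r['name'] not in being_launched]
with ThreadPoolExecutor() as pool:
    refreshed = list(pool.map(refresh, [name for _, name in to_refresh]))
refreshed_by_index: Dict[int, Optional[dict]] = {
    to_refresh[i][0]: refreshed[i] for i in range(len(to_refresh))
}

# Indices missing from the dict were skipped and keep their cached record.
kept = [refreshed_by_index.get(i, record) for i, record in enumerate(records)]
print(kept)  # 'a' and 'c' refreshed, 'b' kept as-is
```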
@@ -3352,13 +3634,8 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str,
     `stderr`. Typically due to the local client version just got updated, and
     the remote runtime is an older version.
     """
-    pattern = re.compile(r'AttributeError: module \'sky\.(.*)\' has no '
-                         r'attribute \'(.*)\'')
     if returncode != 0:
-
-        # the remote cluster. Remove this after 0.10.0 is released.
-        attribute_error = re.findall(pattern, stderr)
-        if attribute_error or 'SkyPilot runtime is too old' in stderr:
+        if 'SkyPilot runtime is too old' in stderr:
             with ux_utils.print_exception_no_traceback():
                 raise RuntimeError(
                     f'{colorama.Fore.RED}SkyPilot runtime needs to be updated '
@@ -3502,19 +3779,126 @@ def workspace_lock_id(workspace_name: str) -> str:
     return f'{workspace_name}_workspace'


+def cluster_tunnel_lock_id(cluster_name: str) -> str:
+    """Get the lock ID for cluster tunnel operations."""
+    return f'{cluster_name}_ssh_tunnel'
+
+
+def open_ssh_tunnel(head_runner: Union[command_runner.SSHCommandRunner,
+                                       command_runner.KubernetesCommandRunner],
+                    port_forward: Tuple[int, int]) -> subprocess.Popen:
+    local_port, remote_port = port_forward
+    if isinstance(head_runner, command_runner.SSHCommandRunner):
+        # Disabling ControlMaster makes things easier to reason about
+        # with respect to resource management/ownership,
+        # as killing the process will close the tunnel too.
+        head_runner.disable_control_master = True
+        head_runner.port_forward_execute_remote_command = True
+
+    # The default connect_timeout of 1s is too short for
+    # connecting to clusters using a jump server.
+    # We use NON_INTERACTIVE mode to avoid allocating a pseudo-tty,
+    # which is counted towards non-idleness.
+    cmd: List[str] = head_runner.port_forward_command(
+        [(local_port, remote_port)],
+        connect_timeout=5,
+        ssh_mode=command_runner.SshMode.NON_INTERACTIVE)
+    if isinstance(head_runner, command_runner.SSHCommandRunner):
+        # cat so the command doesn't exit until we kill it
+        cmd += [f'"echo {_ACK_MESSAGE} && cat"']
+    cmd_str = ' '.join(cmd)
+    logger.debug(f'Running port forward command: {cmd_str}')
+    ssh_tunnel_proc = subprocess.Popen(cmd_str,
+                                       shell=True,
+                                       stdin=subprocess.PIPE,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.PIPE,
+                                       start_new_session=True,
+                                       text=True)
+    # Wait until we receive an ack from the remote cluster or
+    # the SSH connection times out.
+    queue: queue_lib.Queue = queue_lib.Queue()
+    stdout_thread = threading.Thread(
+        target=lambda queue, stdout: queue.put(stdout.readline()),
+        args=(queue, ssh_tunnel_proc.stdout),
+        daemon=True)
+    stdout_thread.start()
+    while ssh_tunnel_proc.poll() is None:
+        try:
+            ack = queue.get_nowait()
+        except queue_lib.Empty:
+            ack = None
+            time.sleep(0.1)
+            continue
+        assert ack is not None
+        if isinstance(
+                head_runner,
+                command_runner.SSHCommandRunner) and ack == f'{_ACK_MESSAGE}\n':
+            break
+        elif isinstance(head_runner, command_runner.KubernetesCommandRunner
+                       ) and _FORWARDING_FROM_MESSAGE in ack:
+            # On kind clusters, this error occurs if we make a request
+            # immediately after the port-forward is established on a new pod:
+            # "Unhandled Error" err="an error occurred forwarding ... -> 46590:
+            # failed to execute portforward in network namespace
+            # "/var/run/netns/cni-...": failed to connect to localhost:46590
+            # inside namespace "...", IPv4: dial tcp4 127.0.0.1:46590:
+            # connect: connection refused
+            # So we need to poll the port on the pod to check if it is open.
+            # We did not observe this with real Kubernetes clusters.
+            timeout = 5
+            port_check_cmd = (
+                # We install netcat in our ray-node container,
+                # so we can use it here.
+                # (See kubernetes-ray.yml.j2)
+                f'end=$((SECONDS+{timeout})); '
+                f'while ! nc -z -w 1 localhost {remote_port}; do '
+                'if (( SECONDS >= end )); then exit 1; fi; '
+                'sleep 0.1; '
+                'done')
+            returncode, stdout, stderr = head_runner.run(port_check_cmd,
+                                                         require_outputs=True,
+                                                         stream_logs=False)
+            if returncode != 0:
+                try:
+                    ssh_tunnel_proc.terminate()
+                    ssh_tunnel_proc.wait(timeout=5)
+                except subprocess.TimeoutExpired:
+                    ssh_tunnel_proc.kill()
+                    ssh_tunnel_proc.wait()
+                finally:
+                    error_msg = (f'Failed to check remote port {remote_port}')
+                    if stdout:
+                        error_msg += f'\n-- stdout --\n{stdout}\n'
+                    raise exceptions.CommandError(returncode=returncode,
+                                                  command=cmd_str,
+                                                  error_msg=error_msg,
+                                                  detailed_reason=stderr)
+            break
+
+    if ssh_tunnel_proc.poll() is not None:
+        stdout, stderr = ssh_tunnel_proc.communicate()
+        error_msg = 'Port forward failed'
+        if stdout:
+            error_msg += f'\n-- stdout --\n{stdout}\n'
+        raise exceptions.CommandError(returncode=ssh_tunnel_proc.returncode,
+                                      command=cmd_str,
+                                      error_msg=error_msg,
+                                      detailed_reason=stderr)
+    return ssh_tunnel_proc
+
+
 T = TypeVar('T')


-def invoke_skylet_with_retries(
-        handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
-        func: Callable[..., T]) -> T:
+def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
     """Generic helper for making Skylet gRPC requests.

     This method handles the common pattern of:
     1. Try the gRPC request
     2. If SSH tunnel is closed, recreate it and retry
     """
-    max_attempts =
+    max_attempts = 5
     backoff = common_utils.Backoff(initial_backoff=0.5)
     last_exception: Optional[Exception] = None

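`open_ssh_tunnel` above only considers the tunnel ready after reading an ack line from the subprocess, using a daemon thread to feed stdout into a queue so the main loop can also notice the process dying. A self-contained sketch of that readiness check, using a plain `echo`/`sleep` shell command in place of the real ssh or kubectl port-forward invocation:

```python
import queue
import subprocess
import threading
import time

ACK = 'tunnel-ready'  # stand-in for the _ACK_MESSAGE used in the diff
proc = subprocess.Popen(f'echo {ACK} && sleep 30',
                        shell=True,
                        stdout=subprocess.PIPE,
                        text=True)

lines: 'queue.Queue[str]' = queue.Queue()
# Reading on a daemon thread keeps the main loop free to poll the process.
threading.Thread(target=lambda: lines.put(proc.stdout.readline()),
                 daemon=True).start()

while proc.poll() is None:
    try:
        line = lines.get_nowait()
    except queue.Empty:
        time.sleep(0.1)
        continue
    if line.strip() == ACK:
        print('tunnel is up')
        break
else:
    # Loop exited without break: the process died before acking.
    raise RuntimeError('tunnel process exited before sending the ack')

proc.terminate()
```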
@@ -3523,26 +3907,46 @@ def invoke_skylet_with_retries(
             return func()
         except grpc.RpcError as e:
             last_exception = e
-
-                with ux_utils.print_exception_no_traceback():
-                    raise exceptions.SkyletInternalError(e.details())
-            elif e.code() == grpc.StatusCode.UNAVAILABLE:
-                recreate_tunnel = True
-                try:
-                    if handle.skylet_ssh_tunnel is not None:
-                        proc = psutil.Process(handle.skylet_ssh_tunnel.pid)
-                        if proc.is_running(
-                        ) and proc.status() != psutil.STATUS_ZOMBIE:
-                            recreate_tunnel = False
-                except psutil.NoSuchProcess:
-                    pass
-
-                if recreate_tunnel:
-                    handle.open_and_update_skylet_tunnel()
-
-                time.sleep(backoff.current_backoff())
-            else:
-                raise e
+            _handle_grpc_error(e, backoff.current_backoff())

-    raise RuntimeError(
-
+    raise RuntimeError(
+        f'Failed to invoke Skylet after {max_attempts} attempts: {last_exception}'
+    ) from last_exception
+
+
+def invoke_skylet_streaming_with_retries(
+        stream_func: Callable[..., Iterator[T]]) -> Iterator[T]:
+    """Generic helper for making Skylet streaming gRPC requests."""
+    max_attempts = 3
+    backoff = common_utils.Backoff(initial_backoff=0.5)
+    last_exception: Optional[Exception] = None
+
+    for _ in range(max_attempts):
+        try:
+            for response in stream_func():
+                yield response
+            return
+        except grpc.RpcError as e:
+            last_exception = e
+            _handle_grpc_error(e, backoff.current_backoff())
+
+    raise RuntimeError(
+        f'Failed to stream Skylet response after {max_attempts} attempts'
+    ) from last_exception
+
+
+def _handle_grpc_error(e: 'grpc.RpcError', current_backoff: float) -> None:
+    if e.code() == grpc.StatusCode.INTERNAL:
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.SkyletInternalError(e.details())
+    elif e.code() == grpc.StatusCode.UNAVAILABLE:
+        time.sleep(current_backoff)
+    elif e.code() == grpc.StatusCode.UNIMPLEMENTED or e.code(
+    ) == grpc.StatusCode.UNKNOWN:
+        # Handle backwards compatibility: old server doesn't implement this RPC.
+        # Let the caller fall back to legacy execution.
+        raise exceptions.SkyletMethodNotImplementedError(
+            f'gRPC method not implemented on server, falling back to legacy execution: {e.details()}'
+        )
+    else:
+        raise e