skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff compares the contents of two publicly released package versions, as published to one of the supported registries, and is provided for informational purposes only. It reflects the changes between the versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/backends/backend_utils.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
|
1
1
|
"""Util constants/functions for the backends."""
|
|
2
|
+
import asyncio
|
|
2
3
|
from datetime import datetime
|
|
3
4
|
import enum
|
|
4
5
|
import fnmatch
|
|
@@ -6,20 +7,24 @@ import hashlib
|
|
|
6
7
|
import os
|
|
7
8
|
import pathlib
|
|
8
9
|
import pprint
|
|
10
|
+
import queue as queue_lib
|
|
9
11
|
import re
|
|
10
12
|
import shlex
|
|
11
13
|
import subprocess
|
|
12
14
|
import sys
|
|
13
15
|
import tempfile
|
|
16
|
+
import threading
|
|
14
17
|
import time
|
|
15
18
|
import typing
|
|
16
|
-
from typing import (Any, Callable, Dict, List, Optional, Sequence,
|
|
17
|
-
TypeVar, Union)
|
|
19
|
+
from typing import (Any, Callable, Dict, Iterator, List, Optional, Sequence,
|
|
20
|
+
Set, Tuple, TypeVar, Union)
|
|
18
21
|
import uuid
|
|
19
22
|
|
|
23
|
+
import aiohttp
|
|
24
|
+
from aiohttp import ClientTimeout
|
|
25
|
+
from aiohttp import TCPConnector
|
|
20
26
|
import colorama
|
|
21
27
|
from packaging import version
|
|
22
|
-
import psutil
|
|
23
28
|
from typing_extensions import Literal
|
|
24
29
|
|
|
25
30
|
import sky
|
|
@@ -43,10 +48,12 @@ from sky.server.requests import requests as requests_lib
|
|
|
43
48
|
from sky.skylet import autostop_lib
|
|
44
49
|
from sky.skylet import constants
|
|
45
50
|
from sky.usage import usage_lib
|
|
51
|
+
from sky.utils import auth_utils
|
|
46
52
|
from sky.utils import cluster_utils
|
|
47
53
|
from sky.utils import command_runner
|
|
48
54
|
from sky.utils import common
|
|
49
55
|
from sky.utils import common_utils
|
|
56
|
+
from sky.utils import context as context_lib
|
|
50
57
|
from sky.utils import context_utils
|
|
51
58
|
from sky.utils import controller_utils
|
|
52
59
|
from sky.utils import env_options
|
|
@@ -60,6 +67,7 @@ from sky.utils import subprocess_utils
|
|
|
60
67
|
from sky.utils import tempstore
|
|
61
68
|
from sky.utils import timeline
|
|
62
69
|
from sky.utils import ux_utils
|
|
70
|
+
from sky.utils import volume as volume_utils
|
|
63
71
|
from sky.utils import yaml_utils
|
|
64
72
|
from sky.workspaces import core as workspaces_core
|
|
65
73
|
|
|
@@ -75,7 +83,6 @@ if typing.TYPE_CHECKING:
|
|
|
75
83
|
from sky import task as task_lib
|
|
76
84
|
from sky.backends import cloud_vm_ray_backend
|
|
77
85
|
from sky.backends import local_docker_backend
|
|
78
|
-
from sky.utils import volume as volume_lib
|
|
79
86
|
else:
|
|
80
87
|
yaml = adaptors_common.LazyImport('yaml')
|
|
81
88
|
requests = adaptors_common.LazyImport('requests')
|
|
@@ -107,8 +114,12 @@ _LAUNCHED_RESERVED_WORKER_PATTERN = re.compile(
|
|
|
107
114
|
# 10.133.0.5: ray.worker.default,
|
|
108
115
|
_LAUNCHING_IP_PATTERN = re.compile(
|
|
109
116
|
r'({}): ray[._]worker[._](?:default|reserved)'.format(IP_ADDR_REGEX))
|
|
117
|
+
SSH_CONNECTION_ERROR_PATTERN = re.compile(
|
|
118
|
+
r'^ssh:.*(timed out|connection refused)$', re.IGNORECASE)
|
|
110
119
|
_SSH_CONNECTION_TIMED_OUT_PATTERN = re.compile(r'^ssh:.*timed out$',
|
|
111
120
|
re.IGNORECASE)
|
|
121
|
+
K8S_PODS_NOT_FOUND_PATTERN = re.compile(r'.*(NotFound|pods .* not found).*',
|
|
122
|
+
re.IGNORECASE)
|
|
112
123
|
_RAY_CLUSTER_NOT_FOUND_MESSAGE = 'Ray cluster is not found'
|
|
113
124
|
WAIT_HEAD_NODE_IP_MAX_ATTEMPTS = 3
|
|
114
125
|
|
|
@@ -131,10 +142,24 @@ _CLUSTER_STATUS_CACHE_DURATION_SECONDS = 2
|
|
|
131
142
|
|
|
132
143
|
CLUSTER_FILE_MOUNTS_LOCK_TIMEOUT_SECONDS = 10
|
|
133
144
|
WORKSPACE_LOCK_TIMEOUT_SECONDS = 10
|
|
145
|
+
CLUSTER_TUNNEL_LOCK_TIMEOUT_SECONDS = 10.0
|
|
134
146
|
|
|
135
147
|
# Remote dir that holds our runtime files.
|
|
136
148
|
_REMOTE_RUNTIME_FILES_DIR = '~/.sky/.runtime_files'
|
|
137
149
|
|
|
150
|
+
# The maximum size of a command line arguments is 128 KB, i.e. the command
|
|
151
|
+
# executed with /bin/sh should be less than 128KB.
|
|
152
|
+
# https://github.com/torvalds/linux/blob/master/include/uapi/linux/binfmts.h
|
|
153
|
+
#
|
|
154
|
+
# If a user have very long run or setup commands, the generated command may
|
|
155
|
+
# exceed the limit, as we directly include scripts in job submission commands.
|
|
156
|
+
# If the command is too long, we instead write it to a file, rsync and execute
|
|
157
|
+
# it.
|
|
158
|
+
#
|
|
159
|
+
# We use 100KB as a threshold to be safe for other arguments that
|
|
160
|
+
# might be added during ssh.
|
|
161
|
+
_MAX_INLINE_SCRIPT_LENGTH = 100 * 1024
|
|
162
|
+
|
|
138
163
|
_ENDPOINTS_RETRY_MESSAGE = ('If the cluster was recently started, '
|
|
139
164
|
'please retry after a while.')
|
|
140
165
|
|
|
@@ -209,6 +234,21 @@ _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH = [
|
|
|
209
234
|
('provider', 'availability_zone'),
|
|
210
235
|
]
|
|
211
236
|
|
|
237
|
+
_ACK_MESSAGE = 'ack'
|
|
238
|
+
_FORWARDING_FROM_MESSAGE = 'Forwarding from'
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def is_command_length_over_limit(command: str) -> bool:
|
|
242
|
+
"""Check if the length of the command exceeds the limit.
|
|
243
|
+
|
|
244
|
+
We calculate the length of the command after quoting the command twice as
|
|
245
|
+
when it is executed by the CommandRunner, the command will be quoted twice
|
|
246
|
+
to ensure the correctness, which will add significant length to the command.
|
|
247
|
+
"""
|
|
248
|
+
|
|
249
|
+
quoted_length = len(shlex.quote(shlex.quote(command)))
|
|
250
|
+
return quoted_length > _MAX_INLINE_SCRIPT_LENGTH
|
|
251
|
+
|
|
212
252
|
|
|
213
253
|
def is_ip(s: str) -> bool:
|
|
214
254
|
"""Returns whether this string matches IP_ADDR_REGEX."""
|
|
@@ -535,7 +575,7 @@ def get_expirable_clouds(
|
|
|
535
575
|
# get all custom contexts
|
|
536
576
|
contexts = kubernetes_utils.get_custom_config_k8s_contexts()
|
|
537
577
|
# add remote_identity of each context if it exists
|
|
538
|
-
remote_identities = None
|
|
578
|
+
remote_identities: Optional[Union[str, List[Dict[str, str]]]] = None
|
|
539
579
|
for context in contexts:
|
|
540
580
|
context_remote_identity = skypilot_config.get_effective_region_config(
|
|
541
581
|
cloud='kubernetes',
|
|
@@ -546,9 +586,11 @@ def get_expirable_clouds(
|
|
|
546
586
|
if remote_identities is None:
|
|
547
587
|
remote_identities = []
|
|
548
588
|
if isinstance(context_remote_identity, str):
|
|
589
|
+
assert isinstance(remote_identities, list)
|
|
549
590
|
remote_identities.append(
|
|
550
591
|
{context: context_remote_identity})
|
|
551
592
|
elif isinstance(context_remote_identity, list):
|
|
593
|
+
assert isinstance(remote_identities, list)
|
|
552
594
|
remote_identities.extend(context_remote_identity)
|
|
553
595
|
# add global kubernetes remote identity if it exists, if not, add default
|
|
554
596
|
global_remote_identity = skypilot_config.get_effective_region_config(
|
|
@@ -560,8 +602,10 @@ def get_expirable_clouds(
|
|
|
560
602
|
if remote_identities is None:
|
|
561
603
|
remote_identities = []
|
|
562
604
|
if isinstance(global_remote_identity, str):
|
|
605
|
+
assert isinstance(remote_identities, list)
|
|
563
606
|
remote_identities.append({'*': global_remote_identity})
|
|
564
607
|
elif isinstance(global_remote_identity, list):
|
|
608
|
+
assert isinstance(remote_identities, list)
|
|
565
609
|
remote_identities.extend(global_remote_identity)
|
|
566
610
|
if remote_identities is None:
|
|
567
611
|
remote_identities = schemas.get_default_remote_identity(
|
|
@@ -589,6 +633,11 @@ def get_expirable_clouds(
|
|
|
589
633
|
return expirable_clouds
|
|
590
634
|
|
|
591
635
|
|
|
636
|
+
def _get_volume_name(path: str, cluster_name_on_cloud: str) -> str:
|
|
637
|
+
path_hash = hashlib.md5(path.encode()).hexdigest()[:6]
|
|
638
|
+
return f'{cluster_name_on_cloud}-{path_hash}'
|
|
639
|
+
|
|
640
|
+
|
|
592
641
|
# TODO: too many things happening here - leaky abstraction. Refactor.
|
|
593
642
|
@timeline.event
|
|
594
643
|
def write_cluster_config(
|
|
@@ -602,7 +651,7 @@ def write_cluster_config(
|
|
|
602
651
|
zones: Optional[List[clouds.Zone]] = None,
|
|
603
652
|
dryrun: bool = False,
|
|
604
653
|
keep_launch_fields_in_existing_config: bool = True,
|
|
605
|
-
volume_mounts: Optional[List['
|
|
654
|
+
volume_mounts: Optional[List['volume_utils.VolumeMount']] = None,
|
|
606
655
|
) -> Dict[str, str]:
|
|
607
656
|
"""Fills in cluster configuration templates and writes them out.
|
|
608
657
|
|
|
@@ -705,11 +754,15 @@ def write_cluster_config(
|
|
|
705
754
|
'is not supported by this cloud. Remove the config or set: '
|
|
706
755
|
'`remote_identity: LOCAL_CREDENTIALS`.')
|
|
707
756
|
if isinstance(cloud, clouds.Kubernetes):
|
|
708
|
-
|
|
757
|
+
allowed_contexts = skypilot_config.get_workspace_cloud(
|
|
758
|
+
'kubernetes').get('allowed_contexts', None)
|
|
759
|
+
if allowed_contexts is None:
|
|
760
|
+
allowed_contexts = skypilot_config.get_effective_region_config(
|
|
709
761
|
cloud='kubernetes',
|
|
710
762
|
region=None,
|
|
711
763
|
keys=('allowed_contexts',),
|
|
712
|
-
default_value=None)
|
|
764
|
+
default_value=None)
|
|
765
|
+
if allowed_contexts is None:
|
|
713
766
|
excluded_clouds.add(cloud)
|
|
714
767
|
else:
|
|
715
768
|
excluded_clouds.add(cloud)
|
|
@@ -733,7 +786,7 @@ def write_cluster_config(
|
|
|
733
786
|
assert k not in credentials, f'{k} already in credentials'
|
|
734
787
|
credentials[k] = v
|
|
735
788
|
|
|
736
|
-
private_key_path, _ =
|
|
789
|
+
private_key_path, _ = auth_utils.get_or_generate_keys()
|
|
737
790
|
auth_config = {'ssh_private_key': private_key_path}
|
|
738
791
|
region_name = resources_vars.get('region')
|
|
739
792
|
|
|
@@ -767,6 +820,55 @@ def write_cluster_config(
|
|
|
767
820
|
assert region_name in ssh_proxy_command_config, (
|
|
768
821
|
region_name, ssh_proxy_command_config)
|
|
769
822
|
ssh_proxy_command = ssh_proxy_command_config[region_name]
|
|
823
|
+
|
|
824
|
+
use_internal_ips = skypilot_config.get_effective_region_config(
|
|
825
|
+
cloud=str(cloud).lower(),
|
|
826
|
+
region=region.name,
|
|
827
|
+
keys=('use_internal_ips',),
|
|
828
|
+
default_value=False)
|
|
829
|
+
if isinstance(cloud, clouds.AWS):
|
|
830
|
+
# If the use_ssm flag is set to true, we use the ssm proxy command.
|
|
831
|
+
use_ssm = skypilot_config.get_effective_region_config(
|
|
832
|
+
cloud=str(cloud).lower(),
|
|
833
|
+
region=region.name,
|
|
834
|
+
keys=('use_ssm',),
|
|
835
|
+
default_value=None)
|
|
836
|
+
|
|
837
|
+
if use_ssm and ssh_proxy_command is not None:
|
|
838
|
+
raise exceptions.InvalidCloudConfigs(
|
|
839
|
+
'use_ssm is set to true, but ssh_proxy_command '
|
|
840
|
+
f'is already set to {ssh_proxy_command!r}. Please remove '
|
|
841
|
+
'ssh_proxy_command or set use_ssm to false.')
|
|
842
|
+
|
|
843
|
+
if use_internal_ips and ssh_proxy_command is None:
|
|
844
|
+
# Only if use_ssm is explicitly not set, we default to using SSM.
|
|
845
|
+
if use_ssm is None:
|
|
846
|
+
logger.warning(
|
|
847
|
+
f'{colorama.Fore.YELLOW}'
|
|
848
|
+
'use_internal_ips is set to true, '
|
|
849
|
+
'but ssh_proxy_command is not set. Defaulting to '
|
|
850
|
+
'using SSM. Specify ssh_proxy_command to use a different '
|
|
851
|
+
'https://docs.skypilot.co/en/latest/reference/config.html#'
|
|
852
|
+
f'aws.ssh_proxy_command.{colorama.Style.RESET_ALL}')
|
|
853
|
+
use_ssm = True
|
|
854
|
+
|
|
855
|
+
if use_ssm:
|
|
856
|
+
aws_profile = os.environ.get('AWS_PROFILE', None)
|
|
857
|
+
profile_str = f'--profile {aws_profile}' if aws_profile else ''
|
|
858
|
+
ip_address_filter = ('Name=private-ip-address,Values=%h'
|
|
859
|
+
if use_internal_ips else
|
|
860
|
+
'Name=ip-address,Values=%h')
|
|
861
|
+
get_instance_id_command = 'aws ec2 describe-instances ' + \
|
|
862
|
+
f'--region {region_name} --filters {ip_address_filter} ' + \
|
|
863
|
+
'--query \"Reservations[].Instances[].InstanceId\" ' + \
|
|
864
|
+
f'{profile_str} --output text'
|
|
865
|
+
ssm_proxy_command = 'aws ssm start-session --target ' + \
|
|
866
|
+
f'\"$({get_instance_id_command})\" ' + \
|
|
867
|
+
f'--region {region_name} {profile_str} ' + \
|
|
868
|
+
'--document-name AWS-StartSSHSession ' + \
|
|
869
|
+
'--parameters portNumber=%p'
|
|
870
|
+
ssh_proxy_command = ssm_proxy_command
|
|
871
|
+
region_name = 'ssm-session'
|
|
770
872
|
logger.debug(f'Using ssh_proxy_command: {ssh_proxy_command!r}')
|
|
771
873
|
|
|
772
874
|
# User-supplied global instance tags from ~/.sky/config.yaml.
|
|
@@ -783,12 +885,6 @@ def write_cluster_config(
|
|
|
783
885
|
if to_provision.labels:
|
|
784
886
|
labels.update(to_provision.labels)
|
|
785
887
|
|
|
786
|
-
# Dump the Ray ports to a file for Ray job submission
|
|
787
|
-
dump_port_command = (
|
|
788
|
-
f'{constants.SKY_PYTHON_CMD} -c \'import json, os; json.dump({constants.SKY_REMOTE_RAY_PORT_DICT_STR}, '
|
|
789
|
-
f'open(os.path.expanduser("{constants.SKY_REMOTE_RAY_PORT_FILE}"), "w", encoding="utf-8"))\''
|
|
790
|
-
)
|
|
791
|
-
|
|
792
888
|
# We disable conda auto-activation if the user has specified a docker image
|
|
793
889
|
# to use, which is likely to already have a conda environment activated.
|
|
794
890
|
conda_auto_activate = ('true' if to_provision.extract_docker_image() is None
|
|
@@ -804,14 +900,24 @@ def write_cluster_config(
|
|
|
804
900
|
cluster_name)
|
|
805
901
|
|
|
806
902
|
volume_mount_vars = []
|
|
903
|
+
ephemeral_volume_mount_vars = []
|
|
807
904
|
if volume_mounts is not None:
|
|
808
905
|
for vol in volume_mounts:
|
|
809
|
-
|
|
810
|
-
|
|
811
|
-
|
|
812
|
-
|
|
813
|
-
|
|
814
|
-
|
|
906
|
+
if vol.is_ephemeral:
|
|
907
|
+
volume_name = _get_volume_name(vol.path, cluster_name_on_cloud)
|
|
908
|
+
vol.volume_name = volume_name
|
|
909
|
+
vol.volume_config.cloud = repr(cloud)
|
|
910
|
+
vol.volume_config.region = region.name
|
|
911
|
+
vol.volume_config.name = volume_name
|
|
912
|
+
ephemeral_volume_mount_vars.append(vol.to_yaml_config())
|
|
913
|
+
else:
|
|
914
|
+
volume_info = volume_utils.VolumeInfo(
|
|
915
|
+
name=vol.volume_name,
|
|
916
|
+
path=vol.path,
|
|
917
|
+
volume_name_on_cloud=vol.volume_config.name_on_cloud,
|
|
918
|
+
volume_id_on_cloud=vol.volume_config.id_on_cloud,
|
|
919
|
+
)
|
|
920
|
+
volume_mount_vars.append(volume_info)
|
|
815
921
|
|
|
816
922
|
runcmd = skypilot_config.get_effective_region_config(
|
|
817
923
|
cloud=str(to_provision.cloud).lower(),
|
|
@@ -865,6 +971,9 @@ def write_cluster_config(
|
|
|
865
971
|
'{conda_auto_activate}',
|
|
866
972
|
conda_auto_activate).replace('{is_custom_docker}',
|
|
867
973
|
is_custom_docker),
|
|
974
|
+
# Currently only used by Slurm. For other clouds, it is
|
|
975
|
+
# already part of ray_skypilot_installation_commands
|
|
976
|
+
'setup_sky_dirs_commands': constants.SETUP_SKY_DIRS_COMMANDS,
|
|
868
977
|
'ray_skypilot_installation_commands':
|
|
869
978
|
(constants.RAY_SKYPILOT_INSTALLATION_COMMANDS.replace(
|
|
870
979
|
'{sky_wheel_hash}',
|
|
@@ -875,12 +984,14 @@ def write_cluster_config(
|
|
|
875
984
|
'{sky_wheel_hash}',
|
|
876
985
|
wheel_hash).replace('{cloud}',
|
|
877
986
|
str(cloud).lower()),
|
|
987
|
+
'copy_skypilot_templates_commands':
|
|
988
|
+
constants.COPY_SKYPILOT_TEMPLATES_COMMANDS,
|
|
878
989
|
# Port of Ray (GCS server).
|
|
879
990
|
# Ray's default port 6379 is conflicted with Redis.
|
|
880
991
|
'ray_port': constants.SKY_REMOTE_RAY_PORT,
|
|
881
992
|
'ray_dashboard_port': constants.SKY_REMOTE_RAY_DASHBOARD_PORT,
|
|
882
993
|
'ray_temp_dir': constants.SKY_REMOTE_RAY_TEMPDIR,
|
|
883
|
-
'dump_port_command':
|
|
994
|
+
'dump_port_command': instance_setup.DUMP_RAY_PORTS,
|
|
884
995
|
# Sky-internal constants.
|
|
885
996
|
'sky_ray_cmd': constants.SKY_RAY_CMD,
|
|
886
997
|
# pip install needs to have python env activated to make sure
|
|
@@ -917,9 +1028,10 @@ def write_cluster_config(
|
|
|
917
1028
|
|
|
918
1029
|
# Volume mounts
|
|
919
1030
|
'volume_mounts': volume_mount_vars,
|
|
1031
|
+
'ephemeral_volume_mounts': ephemeral_volume_mount_vars,
|
|
920
1032
|
|
|
921
|
-
# runcmd to
|
|
922
|
-
#
|
|
1033
|
+
# runcmd to run before any of the SkyPilot runtime setup commands.
|
|
1034
|
+
# This is currently only used by AWS and Kubernetes.
|
|
923
1035
|
'runcmd': runcmd,
|
|
924
1036
|
}),
|
|
925
1037
|
output_path=tmp_yaml_path)
|
|
@@ -974,9 +1086,9 @@ def write_cluster_config(
|
|
|
974
1086
|
with open(tmp_yaml_path, 'w', encoding='utf-8') as f:
|
|
975
1087
|
f.write(restored_yaml_content)
|
|
976
1088
|
|
|
977
|
-
# Read the
|
|
978
|
-
#
|
|
979
|
-
#
|
|
1089
|
+
# Read the cluster_name_on_cloud from the restored yaml. This is a hack to
|
|
1090
|
+
# make sure that launching on the same cluster across multiple users works
|
|
1091
|
+
# correctly. See #8232.
|
|
980
1092
|
yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
|
|
981
1093
|
config_dict['cluster_name_on_cloud'] = yaml_config['cluster_name']
|
|
982
1094
|
|
|
@@ -1025,17 +1137,21 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
|
|
|
1025
1137
|
"""
|
|
1026
1138
|
config = yaml_utils.read_yaml(tmp_yaml_path)
|
|
1027
1139
|
# Check the availability of the cloud type.
|
|
1028
|
-
if isinstance(
|
|
1140
|
+
if isinstance(
|
|
1141
|
+
cloud,
|
|
1142
|
+
(
|
|
1029
1143
|
clouds.AWS,
|
|
1030
1144
|
clouds.OCI,
|
|
1031
1145
|
clouds.SCP,
|
|
1146
|
+
# TODO(jwj): Handle Slurm-specific auth logic
|
|
1147
|
+
clouds.Slurm,
|
|
1032
1148
|
clouds.Vsphere,
|
|
1033
1149
|
clouds.Cudo,
|
|
1034
1150
|
clouds.Paperspace,
|
|
1035
1151
|
clouds.Azure,
|
|
1036
1152
|
clouds.DO,
|
|
1037
1153
|
clouds.Nebius,
|
|
1038
|
-
|
|
1154
|
+
)):
|
|
1039
1155
|
config = auth.configure_ssh_info(config)
|
|
1040
1156
|
elif isinstance(cloud, clouds.GCP):
|
|
1041
1157
|
config = auth.setup_gcp_authentication(config)
|
|
@@ -1053,6 +1169,12 @@ def _add_auth_to_cluster_config(cloud: clouds.Cloud, tmp_yaml_path: str):
|
|
|
1053
1169
|
config = auth.setup_fluidstack_authentication(config)
|
|
1054
1170
|
elif isinstance(cloud, clouds.Hyperbolic):
|
|
1055
1171
|
config = auth.setup_hyperbolic_authentication(config)
|
|
1172
|
+
elif isinstance(cloud, clouds.Shadeform):
|
|
1173
|
+
config = auth.setup_shadeform_authentication(config)
|
|
1174
|
+
elif isinstance(cloud, clouds.PrimeIntellect):
|
|
1175
|
+
config = auth.setup_primeintellect_authentication(config)
|
|
1176
|
+
elif isinstance(cloud, clouds.Seeweb):
|
|
1177
|
+
config = auth.setup_seeweb_authentication(config)
|
|
1056
1178
|
else:
|
|
1057
1179
|
assert False, cloud
|
|
1058
1180
|
yaml_utils.dump_yaml(tmp_yaml_path, config)
|
|
@@ -1155,7 +1277,6 @@ def _deterministic_cluster_yaml_hash(tmp_yaml_path: str) -> str:
|
|
|
1155
1277
|
Rather than constructing the whole byte sequence, which may be quite large,
|
|
1156
1278
|
we construct it incrementally by using hash.update() to add new bytes.
|
|
1157
1279
|
"""
|
|
1158
|
-
|
|
1159
1280
|
# Load the yaml contents so that we can directly remove keys.
|
|
1160
1281
|
yaml_config = yaml_utils.read_yaml(tmp_yaml_path)
|
|
1161
1282
|
for key_list in _RAY_YAML_KEYS_TO_REMOVE_FOR_HASH:
|
|
@@ -1738,6 +1859,32 @@ def check_network_connection():
|
|
|
1738
1859
|
'Network seems down.')
|
|
1739
1860
|
|
|
1740
1861
|
|
|
1862
|
+
async def async_check_network_connection():
|
|
1863
|
+
"""Check if the network connection is available.
|
|
1864
|
+
|
|
1865
|
+
Tolerates 3 retries as it is observed that connections can fail.
|
|
1866
|
+
Uses aiohttp for async HTTP requests.
|
|
1867
|
+
"""
|
|
1868
|
+
# Create a session with retry logic
|
|
1869
|
+
timeout = ClientTimeout(total=15)
|
|
1870
|
+
connector = TCPConnector(limit=1) # Limit to 1 connection at a time
|
|
1871
|
+
|
|
1872
|
+
async with aiohttp.ClientSession(timeout=timeout,
|
|
1873
|
+
connector=connector) as session:
|
|
1874
|
+
for i, ip in enumerate(_TEST_IP_LIST):
|
|
1875
|
+
try:
|
|
1876
|
+
async with session.head(ip) as response:
|
|
1877
|
+
if response.status < 400: # Any 2xx or 3xx status is good
|
|
1878
|
+
return
|
|
1879
|
+
except (aiohttp.ClientError, asyncio.TimeoutError) as e:
|
|
1880
|
+
if i == len(_TEST_IP_LIST) - 1:
|
|
1881
|
+
raise exceptions.NetworkError(
|
|
1882
|
+
'Could not refresh the cluster. '
|
|
1883
|
+
'Network seems down.') from e
|
|
1884
|
+
# If not the last IP, continue to try the next one
|
|
1885
|
+
continue
|
|
1886
|
+
|
|
1887
|
+
|
|
1741
1888
|
@timeline.event
|
|
1742
1889
|
def check_owner_identity(cluster_name: str) -> None:
|
|
1743
1890
|
"""Check if current user is the same as the user who created the cluster.
|
|
@@ -1750,9 +1897,18 @@ def check_owner_identity(cluster_name: str) -> None:
|
|
|
1750
1897
|
"""
|
|
1751
1898
|
if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
|
|
1752
1899
|
return
|
|
1753
|
-
record = global_user_state.get_cluster_from_name(cluster_name
|
|
1900
|
+
record = global_user_state.get_cluster_from_name(cluster_name,
|
|
1901
|
+
include_user_info=False,
|
|
1902
|
+
summary_response=True)
|
|
1754
1903
|
if record is None:
|
|
1755
1904
|
return
|
|
1905
|
+
_check_owner_identity_with_record(cluster_name, record)
|
|
1906
|
+
|
|
1907
|
+
|
|
1908
|
+
def _check_owner_identity_with_record(cluster_name: str,
|
|
1909
|
+
record: Dict[str, Any]) -> None:
|
|
1910
|
+
if env_options.Options.SKIP_CLOUD_IDENTITY_CHECK.get():
|
|
1911
|
+
return
|
|
1756
1912
|
handle = record['handle']
|
|
1757
1913
|
if not isinstance(handle, backends.CloudVmRayResourceHandle):
|
|
1758
1914
|
return
|
|
@@ -1837,8 +1993,10 @@ def tag_filter_for_cluster(cluster_name: str) -> Dict[str, str]:
|
|
|
1837
1993
|
}
|
|
1838
1994
|
|
|
1839
1995
|
|
|
1996
|
+
@context_utils.cancellation_guard
|
|
1840
1997
|
def _query_cluster_status_via_cloud_api(
|
|
1841
|
-
handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle'
|
|
1998
|
+
handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
|
|
1999
|
+
retry_if_missing: bool,
|
|
1842
2000
|
) -> List[Tuple[status_lib.ClusterStatus, Optional[str]]]:
|
|
1843
2001
|
"""Returns the status of the cluster as a list of tuples corresponding
|
|
1844
2002
|
to the node status and an optional reason string for said status.
|
|
@@ -1865,8 +2023,11 @@ def _query_cluster_status_via_cloud_api(
|
|
|
1865
2023
|
cloud_name = repr(handle.launched_resources.cloud)
|
|
1866
2024
|
try:
|
|
1867
2025
|
node_status_dict = provision_lib.query_instances(
|
|
1868
|
-
cloud_name,
|
|
1869
|
-
|
|
2026
|
+
cloud_name,
|
|
2027
|
+
cluster_name,
|
|
2028
|
+
cluster_name_on_cloud,
|
|
2029
|
+
provider_config,
|
|
2030
|
+
retry_if_missing=retry_if_missing)
|
|
1870
2031
|
logger.debug(f'Querying {cloud_name} cluster '
|
|
1871
2032
|
f'{cluster_name_in_hint} '
|
|
1872
2033
|
f'status:\n{pprint.pformat(node_status_dict)}')
|
|
@@ -2044,7 +2205,12 @@ def check_can_clone_disk_and_override_task(
|
|
|
2044
2205
|
return task, handle
|
|
2045
2206
|
|
|
2046
2207
|
|
|
2047
|
-
def _update_cluster_status(
|
|
2208
|
+
def _update_cluster_status(
|
|
2209
|
+
cluster_name: str,
|
|
2210
|
+
record: Dict[str, Any],
|
|
2211
|
+
retry_if_missing: bool,
|
|
2212
|
+
include_user_info: bool = True,
|
|
2213
|
+
summary_response: bool = False) -> Optional[Dict[str, Any]]:
|
|
2048
2214
|
"""Update the cluster status.
|
|
2049
2215
|
|
|
2050
2216
|
The cluster status is updated by checking ray cluster and real status from
|
|
@@ -2071,9 +2237,6 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2071
2237
|
fetched from the cloud provider or there are leaked nodes causing
|
|
2072
2238
|
the node number larger than expected.
|
|
2073
2239
|
"""
|
|
2074
|
-
record = global_user_state.get_cluster_from_name(cluster_name)
|
|
2075
|
-
if record is None:
|
|
2076
|
-
return None
|
|
2077
2240
|
handle = record['handle']
|
|
2078
2241
|
if handle.cluster_yaml is None:
|
|
2079
2242
|
# Remove cluster from db since this cluster does not have a config file
|
|
@@ -2092,7 +2255,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2092
2255
|
return record
|
|
2093
2256
|
cluster_name = handle.cluster_name
|
|
2094
2257
|
|
|
2095
|
-
node_statuses = _query_cluster_status_via_cloud_api(
|
|
2258
|
+
node_statuses = _query_cluster_status_via_cloud_api(
|
|
2259
|
+
handle, retry_if_missing=retry_if_missing)
|
|
2096
2260
|
|
|
2097
2261
|
all_nodes_up = (all(status[0] == status_lib.ClusterStatus.UP
|
|
2098
2262
|
for status in node_statuses) and
|
|
@@ -2140,6 +2304,11 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2140
2304
|
total_nodes = handle.launched_nodes * handle.num_ips_per_node
|
|
2141
2305
|
|
|
2142
2306
|
cloud_name = repr(handle.launched_resources.cloud).lower()
|
|
2307
|
+
# Initialize variables in case all retries fail
|
|
2308
|
+
ready_head = 0
|
|
2309
|
+
ready_workers = 0
|
|
2310
|
+
output = ''
|
|
2311
|
+
stderr = ''
|
|
2143
2312
|
for i in range(5):
|
|
2144
2313
|
try:
|
|
2145
2314
|
ready_head, ready_workers, output, stderr = (
|
|
@@ -2228,7 +2397,12 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2228
2397
|
# remain healthy for a while before the cloud completely preempts the VMs.
|
|
2229
2398
|
# We have mitigated this by again first querying the VM state from the cloud
|
|
2230
2399
|
# provider.
|
|
2231
|
-
|
|
2400
|
+
cloud = handle.launched_resources.cloud
|
|
2401
|
+
|
|
2402
|
+
# For Slurm, skip Ray health check since it doesn't use Ray.
|
|
2403
|
+
should_check_ray = cloud is not None and cloud.uses_ray()
|
|
2404
|
+
if all_nodes_up and (not should_check_ray or
|
|
2405
|
+
run_ray_status_to_check_ray_cluster_healthy()):
|
|
2232
2406
|
# NOTE: all_nodes_up calculation is fast due to calling cloud CLI;
|
|
2233
2407
|
# run_ray_status_to_check_all_nodes_up() is slow due to calling `ray get
|
|
2234
2408
|
# head-ip/worker-ips`.
|
|
@@ -2240,12 +2414,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2240
2414
|
'All nodes up; SkyPilot runtime healthy.',
|
|
2241
2415
|
global_user_state.ClusterEventType.STATUS_CHANGE,
|
|
2242
2416
|
nop_if_duplicate=True)
|
|
2243
|
-
global_user_state.add_or_update_cluster(
|
|
2244
|
-
|
|
2245
|
-
|
|
2246
|
-
|
|
2247
|
-
|
|
2248
|
-
|
|
2417
|
+
global_user_state.add_or_update_cluster(
|
|
2418
|
+
cluster_name,
|
|
2419
|
+
handle,
|
|
2420
|
+
requested_resources=None,
|
|
2421
|
+
ready=True,
|
|
2422
|
+
is_launch=False,
|
|
2423
|
+
existing_cluster_hash=record['cluster_hash'])
|
|
2424
|
+
return global_user_state.get_cluster_from_name(
|
|
2425
|
+
cluster_name,
|
|
2426
|
+
include_user_info=include_user_info,
|
|
2427
|
+
summary_response=summary_response)
|
|
2249
2428
|
|
|
2250
2429
|
# All cases below are transitioning the cluster to non-UP states.
|
|
2251
2430
|
launched_resources = handle.launched_resources.assert_launchable()
|
|
@@ -2262,7 +2441,8 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2262
2441
|
# and check again. This is a best-effort leak prevention check.
|
|
2263
2442
|
# See https://github.com/skypilot-org/skypilot/issues/4431.
|
|
2264
2443
|
time.sleep(_LAUNCH_DOUBLE_CHECK_DELAY)
|
|
2265
|
-
node_statuses = _query_cluster_status_via_cloud_api(
|
|
2444
|
+
node_statuses = _query_cluster_status_via_cloud_api(
|
|
2445
|
+
handle, retry_if_missing=False)
|
|
2266
2446
|
# Note: even if all the node_statuses are UP now, we will still
|
|
2267
2447
|
# consider this cluster abnormal, and its status will be INIT.
|
|
2268
2448
|
|
|
@@ -2450,12 +2630,17 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2450
2630
|
global_user_state.ClusterEventType.STATUS_CHANGE,
|
|
2451
2631
|
nop_if_duplicate=True,
|
|
2452
2632
|
duplicate_regex=init_reason_regex)
|
|
2453
|
-
global_user_state.add_or_update_cluster(
|
|
2454
|
-
|
|
2455
|
-
|
|
2456
|
-
|
|
2457
|
-
|
|
2458
|
-
|
|
2633
|
+
global_user_state.add_or_update_cluster(
|
|
2634
|
+
cluster_name,
|
|
2635
|
+
handle,
|
|
2636
|
+
requested_resources=None,
|
|
2637
|
+
ready=False,
|
|
2638
|
+
is_launch=False,
|
|
2639
|
+
existing_cluster_hash=record['cluster_hash'])
|
|
2640
|
+
return global_user_state.get_cluster_from_name(
|
|
2641
|
+
cluster_name,
|
|
2642
|
+
include_user_info=include_user_info,
|
|
2643
|
+
summary_response=summary_response)
|
|
2459
2644
|
# Now is_abnormal is False: either node_statuses is empty or all nodes are
|
|
2460
2645
|
# STOPPED.
|
|
2461
2646
|
verb = 'terminated' if to_terminate else 'stopped'
|
|
@@ -2470,7 +2655,10 @@ def _update_cluster_status(cluster_name: str) -> Optional[Dict[str, Any]]:
|
|
|
2470
2655
|
nop_if_duplicate=True,
|
|
2471
2656
|
)
|
|
2472
2657
|
backend.post_teardown_cleanup(handle, terminate=to_terminate, purge=False)
|
|
2473
|
-
return global_user_state.get_cluster_from_name(
|
|
2658
|
+
return global_user_state.get_cluster_from_name(
|
|
2659
|
+
cluster_name,
|
|
2660
|
+
include_user_info=include_user_info,
|
|
2661
|
+
summary_response=summary_response)
|
|
2474
2662
|
|
|
2475
2663
|
|
|
2476
2664
|
def _must_refresh_cluster_status(
|
|
@@ -2492,12 +2680,14 @@ def _must_refresh_cluster_status(
|
|
|
2492
2680
|
|
|
2493
2681
|
|
|
2494
2682
|
def refresh_cluster_record(
|
|
2495
|
-
|
|
2496
|
-
|
|
2497
|
-
|
|
2498
|
-
|
|
2499
|
-
|
|
2500
|
-
|
|
2683
|
+
cluster_name: str,
|
|
2684
|
+
*,
|
|
2685
|
+
force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
|
|
2686
|
+
cluster_lock_already_held: bool = False,
|
|
2687
|
+
cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
|
|
2688
|
+
include_user_info: bool = True,
|
|
2689
|
+
summary_response: bool = False,
|
|
2690
|
+
retry_if_missing: bool = True) -> Optional[Dict[str, Any]]:
|
|
2501
2691
|
"""Refresh the cluster, and return the possibly updated record.
|
|
2502
2692
|
|
|
2503
2693
|
The function will update the cached cluster status in the global state. For
|
|
@@ -2514,14 +2704,20 @@ def refresh_cluster_record(
|
|
|
2514
2704
|
_CLUSTER_STATUS_CACHE_DURATION_SECONDS old, and one of:
|
|
2515
2705
|
1. the cluster is a spot cluster, or
|
|
2516
2706
|
2. cluster autostop is set and the cluster is not STOPPED.
|
|
2517
|
-
|
|
2518
|
-
|
|
2519
|
-
|
|
2707
|
+
cluster_lock_already_held: Whether the caller is already holding the
|
|
2708
|
+
per-cluster lock. You MUST NOT set this to True if the caller does not
|
|
2709
|
+
already hold the lock. If True, we will not acquire the lock before
|
|
2710
|
+
updating the status. Failing to hold the lock while updating the
|
|
2711
|
+
status can lead to correctness issues - e.g. an launch in-progress may
|
|
2712
|
+
appear to be DOWN incorrectly. Even if this is set to False, the lock
|
|
2713
|
+
may not be acquired if the status does not need to be refreshed.
|
|
2520
2714
|
cluster_status_lock_timeout: The timeout to acquire the per-cluster
|
|
2521
2715
|
lock. If timeout, the function will use the cached status. If the
|
|
2522
2716
|
value is <0, do not timeout (wait for the lock indefinitely). By
|
|
2523
2717
|
default, this is set to CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS. Warning:
|
|
2524
2718
|
if correctness is required, you must set this to -1.
|
|
2719
|
+
retry_if_missing: Whether to retry the call to the cloud api if the
|
|
2720
|
+
cluster is not found when querying the live status on the cloud.
|
|
2525
2721
|
|
|
2526
2722
|
Returns:
|
|
2527
2723
|
If the cluster is terminated or does not exist, return None.
|
|
@@ -2537,17 +2733,20 @@ def refresh_cluster_record(
|
|
|
2537
2733
|
the node number larger than expected.
|
|
2538
2734
|
"""
|
|
2539
2735
|
|
|
2540
|
-
|
|
2736
|
+
ctx = context_lib.get()
|
|
2737
|
+
record = global_user_state.get_cluster_from_name(
|
|
2738
|
+
cluster_name,
|
|
2739
|
+
include_user_info=include_user_info,
|
|
2740
|
+
summary_response=summary_response)
|
|
2541
2741
|
if record is None:
|
|
2542
2742
|
return None
|
|
2543
2743
|
# TODO(zhwu, 05/20): switch to the specific workspace to make sure we are
|
|
2544
2744
|
# using the correct cloud credentials.
|
|
2545
2745
|
workspace = record.get('workspace', constants.SKYPILOT_DEFAULT_WORKSPACE)
|
|
2546
2746
|
with skypilot_config.local_active_workspace_ctx(workspace):
|
|
2547
|
-
check_owner_identity
|
|
2548
|
-
|
|
2549
|
-
|
|
2550
|
-
return record
|
|
2747
|
+
# check_owner_identity returns if the record handle is
|
|
2748
|
+
# not a CloudVmRayResourceHandle
|
|
2749
|
+
_check_owner_identity_with_record(cluster_name, record)
|
|
2551
2750
|
|
|
2552
2751
|
# The loop logic allows us to notice if the status was updated in the
|
|
2553
2752
|
# global_user_state by another process and stop trying to get the lock.
|
|
@@ -2556,12 +2755,18 @@ def refresh_cluster_record(
|
|
|
2556
2755
|
|
|
2557
2756
|
# Loop until we have an up-to-date status or until we acquire the lock.
|
|
2558
2757
|
while True:
|
|
2758
|
+
# Check if the context is canceled.
|
|
2759
|
+
if ctx is not None and ctx.is_canceled():
|
|
2760
|
+
raise asyncio.CancelledError()
|
|
2559
2761
|
# Check to see if we can return the cached status.
|
|
2560
2762
|
if not _must_refresh_cluster_status(record, force_refresh_statuses):
|
|
2561
2763
|
return record
|
|
2562
2764
|
|
|
2563
|
-
if
|
|
2564
|
-
return _update_cluster_status(cluster_name
|
|
2765
|
+
if cluster_lock_already_held:
|
|
2766
|
+
return _update_cluster_status(cluster_name, record,
|
|
2767
|
+
retry_if_missing,
|
|
2768
|
+
include_user_info,
|
|
2769
|
+
summary_response)
|
|
2565
2770
|
|
|
2566
2771
|
# Try to acquire the lock so we can fetch the status.
|
|
2567
2772
|
try:
|
|
@@ -2569,12 +2774,17 @@ def refresh_cluster_record(
|
|
|
2569
2774
|
# Check the cluster status again, since it could have been
|
|
2570
2775
|
# updated between our last check and acquiring the lock.
|
|
2571
2776
|
record = global_user_state.get_cluster_from_name(
|
|
2572
|
-
cluster_name
|
|
2777
|
+
cluster_name,
|
|
2778
|
+
include_user_info=include_user_info,
|
|
2779
|
+
summary_response=summary_response)
|
|
2573
2780
|
if record is None or not _must_refresh_cluster_status(
|
|
2574
2781
|
record, force_refresh_statuses):
|
|
2575
2782
|
return record
|
|
2576
2783
|
# Update and return the cluster status.
|
|
2577
|
-
return _update_cluster_status(cluster_name
|
|
2784
|
+
return _update_cluster_status(cluster_name, record,
|
|
2785
|
+
retry_if_missing,
|
|
2786
|
+
include_user_info,
|
|
2787
|
+
summary_response)
|
|
2578
2788
|
|
|
2579
2789
|
except locks.LockTimeout:
|
|
2580
2790
|
# lock.acquire() will throw a Timeout exception if the lock is not
|
|
@@ -2592,10 +2802,13 @@ def refresh_cluster_record(
|
|
|
2592
2802
|
'Refreshing status: Failed get the lock for cluster '
|
|
2593
2803
|
f'{cluster_name!r}. Using the cached status.')
|
|
2594
2804
|
return record
|
|
2595
|
-
time.sleep(
|
|
2805
|
+
time.sleep(lock.poll_interval)
|
|
2596
2806
|
|
|
2597
2807
|
# Refresh for next loop iteration.
|
|
2598
|
-
record = global_user_state.get_cluster_from_name(
|
|
2808
|
+
record = global_user_state.get_cluster_from_name(
|
|
2809
|
+
cluster_name,
|
|
2810
|
+
include_user_info=include_user_info,
|
|
2811
|
+
summary_response=summary_response)
|
|
2599
2812
|
if record is None:
|
|
2600
2813
|
return None
|
|
2601
2814
|
|
|
@@ -2606,8 +2819,9 @@ def refresh_cluster_status_handle(
|
|
|
2606
2819
|
cluster_name: str,
|
|
2607
2820
|
*,
|
|
2608
2821
|
force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]] = None,
|
|
2609
|
-
|
|
2610
|
-
cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS
|
|
2822
|
+
cluster_lock_already_held: bool = False,
|
|
2823
|
+
cluster_status_lock_timeout: int = CLUSTER_STATUS_LOCK_TIMEOUT_SECONDS,
|
|
2824
|
+
retry_if_missing: bool = True,
|
|
2611
2825
|
) -> Tuple[Optional[status_lib.ClusterStatus],
|
|
2612
2826
|
Optional[backends.ResourceHandle]]:
|
|
2613
2827
|
"""Refresh the cluster, and return the possibly updated status and handle.
|
|
@@ -2619,8 +2833,11 @@ def refresh_cluster_status_handle(
     record = refresh_cluster_record(
         cluster_name,
         force_refresh_statuses=force_refresh_statuses,
-
-        cluster_status_lock_timeout=cluster_status_lock_timeout
+        cluster_lock_already_held=cluster_lock_already_held,
+        cluster_status_lock_timeout=cluster_status_lock_timeout,
+        include_user_info=False,
+        summary_response=True,
+        retry_if_missing=retry_if_missing)
     if record is None:
         return None, None
     return record['status'], record['handle']
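The new `cluster_lock_already_held` flag implements a common locking pattern: the function either assumes the caller already holds the per-cluster lock, or acquires it itself. A minimal self-contained sketch of that pattern, with a hypothetical in-memory lock table and state store standing in for SkyPilot's cluster status lock and `global_user_state`:

```python
import threading
from typing import Dict, Optional

# Hypothetical per-cluster lock table and state store; these are stand-ins,
# not SkyPilot's real API.
_CLUSTER_LOCKS: Dict[str, threading.Lock] = {}
_CLUSTER_STATE: Dict[str, dict] = {'demo': {'status': 'INIT', 'handle': None}}


def _get_lock(cluster_name: str) -> threading.Lock:
    return _CLUSTER_LOCKS.setdefault(cluster_name, threading.Lock())


def refresh_record(cluster_name: str,
                   lock_already_held: bool = False) -> Optional[dict]:
    """Refreshes a record; skips locking when the caller already holds it."""
    record = _CLUSTER_STATE.get(cluster_name)
    if record is None:
        return None
    if lock_already_held:
        # Caller is responsible for holding the lock; refresh directly.
        record['status'] = 'UP'
        return record
    with _get_lock(cluster_name):
        record['status'] = 'UP'
        return record


if __name__ == '__main__':
    # Normal path: the function acquires the lock itself.
    print(refresh_record('demo'))
    # Caller-held path: acquire the lock once, then call with the flag set.
    with _get_lock('demo'):
        print(refresh_record('demo', lock_already_held=True))
```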
@@ -2671,7 +2888,9 @@ def check_cluster_available(
         exceptions.CloudUserIdentityError: if we fail to get the current user
             identity.
     """
-    record = global_user_state.get_cluster_from_name(cluster_name
+    record = global_user_state.get_cluster_from_name(cluster_name,
+                                                     include_user_info=False,
+                                                     summary_response=True)
     if dryrun:
         assert record is not None, cluster_name
         return record['handle']
@@ -2858,7 +3077,8 @@ def is_controller_accessible(
             f'fatal, but {controller_name} commands/calls may hang or return '
             'stale information, when the controller is not up.\n'
             f' Details: {common_utils.format_exception(e, use_bracket=True)}')
-    record = global_user_state.get_cluster_from_name(
+    record = global_user_state.get_cluster_from_name(
+        cluster_name, include_user_info=False, summary_response=True)
     if record is not None:
         controller_status, handle = record['status'], record['handle']
         # We check the connection even if the cluster has a cached status UP
@@ -2915,22 +3135,96 @@ class CloudFilter(enum.Enum):
     LOCAL = 'local'
 
 
-def _get_glob_clusters(
+def _get_glob_clusters(
+        clusters: List[str],
+        silent: bool = False,
+        workspaces_filter: Optional[Dict[str, Any]] = None) -> List[str]:
     """Returns a list of clusters that match the glob pattern."""
     glob_clusters = []
     for cluster in clusters:
-        glob_cluster = global_user_state.get_glob_cluster_names(
+        glob_cluster = global_user_state.get_glob_cluster_names(
+            cluster, workspaces_filter=workspaces_filter)
         if len(glob_cluster) == 0 and not silent:
             logger.info(f'Cluster {cluster} not found.')
         glob_clusters.extend(glob_cluster)
     return list(set(glob_clusters))
 
 
+def _refresh_cluster(
+        cluster_name: str,
+        force_refresh_statuses: Optional[Set[status_lib.ClusterStatus]],
+        include_user_info: bool = True,
+        summary_response: bool = False) -> Optional[Dict[str, Any]]:
+    try:
+        record = refresh_cluster_record(
+            cluster_name,
+            force_refresh_statuses=force_refresh_statuses,
+            cluster_lock_already_held=False,
+            include_user_info=include_user_info,
+            summary_response=summary_response)
+    except (exceptions.ClusterStatusFetchingError,
+            exceptions.CloudUserIdentityError,
+            exceptions.ClusterOwnerIdentityMismatchError) as e:
+        # Do not fail the entire refresh process. The caller will
+        # handle the 'UNKNOWN' status, and collect the errors into
+        # a table.
+        record = {'status': 'UNKNOWN', 'error': e}
+    return record
+
+
+def refresh_cluster_records() -> None:
+    """Refreshes the status of all clusters, except managed clusters.
+
+    Used by the background status refresh daemon.
+    This function is a stripped-down version of get_clusters, with only the
+    bare bones refresh logic.
+
+    Returns:
+        None
+
+    Raises:
+        None
+    """
+    # We force to exclude managed clusters to avoid multiple sources
+    # manipulating them. For example, SkyServe assumes the replica manager
+    # is the only source of truth for the cluster status.
+    cluster_names = set(
+        global_user_state.get_cluster_names(exclude_managed_clusters=True))
+
+    # TODO(syang): we should try not to leak
+    # request info in backend_utils.py.
+    # Refactor this to use some other info to
+    # determine if a launch is in progress.
+    cluster_names_with_launch_request = {
+        request.cluster_name for request in requests_lib.get_request_tasks(
+            req_filter=requests_lib.RequestTaskFilter(
+                status=[requests_lib.RequestStatus.RUNNING],
+                include_request_names=['sky.launch'],
+                fields=['cluster_name']))
+    }
+    cluster_names_without_launch_request = (cluster_names -
+                                            cluster_names_with_launch_request)
+
+    def _refresh_cluster_record(cluster_name):
+        return _refresh_cluster(cluster_name,
+                                force_refresh_statuses=set(
+                                    status_lib.ClusterStatus),
+                                include_user_info=False,
+                                summary_response=True)
+
+    if len(cluster_names_without_launch_request) > 0:
+        # Do not refresh the clusters that have an active launch request.
+        subprocess_utils.run_in_parallel(_refresh_cluster_record,
+                                         cluster_names_without_launch_request)
+
+
 def get_clusters(
     refresh: common.StatusRefreshMode,
     cluster_names: Optional[Union[str, List[str]]] = None,
     all_users: bool = True,
     include_credentials: bool = False,
+    summary_response: bool = False,
+    include_handle: bool = True,
     # Internal only:
     # pylint: disable=invalid-name
     _include_is_managed: bool = False,
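The new `refresh_cluster_records()` is the periodic entry point for the background status refresh daemon: it drops clusters with an in-flight `sky.launch` request and refreshes the rest in parallel. A minimal sketch of that loop shape, with hypothetical `list_clusters` / `list_launching` / `refresh_one` helpers standing in for the SkyPilot internals:

```python
import concurrent.futures
import time
from typing import Set


def list_clusters() -> Set[str]:
    """Hypothetical: names of all non-managed clusters."""
    return {'dev', 'train-1', 'train-2'}


def list_launching() -> Set[str]:
    """Hypothetical: clusters with an active launch request."""
    return {'train-2'}


def refresh_one(name: str) -> None:
    """Hypothetical: refresh a single cluster's cached status."""
    print(f'refreshed {name}')


def refresh_all_once() -> None:
    # Skip clusters that are currently being launched; the launch request
    # owns their status until it completes.
    to_refresh = list_clusters() - list_launching()
    if not to_refresh:
        return
    with concurrent.futures.ThreadPoolExecutor() as pool:
        list(pool.map(refresh_one, sorted(to_refresh)))


if __name__ == '__main__':
    # A daemon would call this on a timer; run a single iteration here.
    refresh_all_once()
    time.sleep(0)
```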
@@ -2958,6 +3252,23 @@ def get_clusters(
         A list of cluster records. If the cluster does not exist or has been
         terminated, the record will be omitted from the returned list.
     """
+    accessible_workspaces = workspaces_core.get_workspaces()
+    if cluster_names is not None:
+        if isinstance(cluster_names, str):
+            cluster_names = [cluster_names]
+        non_glob_cluster_names = []
+        glob_cluster_names = []
+        for cluster_name in cluster_names:
+            if ux_utils.is_glob_pattern(cluster_name):
+                glob_cluster_names.append(cluster_name)
+            else:
+                non_glob_cluster_names.append(cluster_name)
+        cluster_names = non_glob_cluster_names
+        if glob_cluster_names:
+            cluster_names += _get_glob_clusters(
+                glob_cluster_names,
+                silent=True,
+                workspaces_filter=accessible_workspaces)
 
     exclude_managed_clusters = False
     if not (_include_is_managed or env_options.Options.SHOW_DEBUG_INFO.get()):
@@ -2965,34 +3276,24 @@ def get_clusters(
     user_hashes_filter = None
     if not all_users:
         user_hashes_filter = {common_utils.get_current_user().id}
-    accessible_workspaces = workspaces_core.get_workspaces()
-
     records = global_user_state.get_clusters(
         exclude_managed_clusters=exclude_managed_clusters,
         user_hashes_filter=user_hashes_filter,
-        workspaces_filter=accessible_workspaces
+        workspaces_filter=accessible_workspaces,
+        cluster_names=cluster_names,
+        summary_response=summary_response)
 
     yellow = colorama.Fore.YELLOW
     bright = colorama.Style.BRIGHT
     reset = colorama.Style.RESET_ALL
 
     if cluster_names is not None:
-
-
-
-
-
-        for cluster_name in cluster_names:
-            for record in records:
-                if record['name'] == cluster_name:
-                    new_records.append(record)
-                    break
-            else:
-                not_exist_cluster_names.append(cluster_name)
-        if not_exist_cluster_names:
-            clusters_str = ', '.join(not_exist_cluster_names)
+        record_names = {record['name'] for record in records}
+        not_found_clusters = ux_utils.get_non_matched_query(
+            cluster_names, record_names)
+        if not_found_clusters:
+            clusters_str = ', '.join(not_found_clusters)
             logger.info(f'Cluster(s) not found: {bright}{clusters_str}{reset}.')
-        records = new_records
 
     def _get_records_with_handle(
             records: List[Optional[Dict[str, Any]]]) -> List[Dict[str, Any]]:
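The rewritten not-found reporting replaces the nested loops with a set lookup: collect the names of the returned records, then report any queried name that has no match. A plain-set sketch of that logic (the helper below is a hypothetical stand-in for the `ux_utils.get_non_matched_query` call used above, assuming it returns the queried names without a matching record):

```python
from typing import Iterable, List, Set


def non_matched_queries(queries: Iterable[str],
                        found_names: Set[str]) -> List[str]:
    """Returns the queried names that did not match any record.

    Preserves the order in which the names were queried.
    """
    return [name for name in queries if name not in found_names]


if __name__ == '__main__':
    records = [{'name': 'dev'}, {'name': 'train-1'}]
    record_names = {record['name'] for record in records}
    print(non_matched_queries(['dev', 'train-2'], record_names))
    # -> ['train-2'], which would be reported as "Cluster(s) not found".
```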
@@ -3002,17 +3303,18 @@ def get_clusters(
            if record is not None and record['handle'] is not None
        ]
 
-    def
+    def _update_records_with_handle_info(
            records: List[Optional[Dict[str, Any]]]) -> None:
        """Add resource str to record"""
        for record in _get_records_with_handle(records):
            handle = record['handle']
-
-
-                handle,
-            record[
-
-
+            resource_str_simple, resource_str_full = (
+                resources_utils.get_readable_resources_repr(
+                    handle, simplified_only=False))
+            record['resources_str'] = resource_str_simple
+            record['resources_str_full'] = resource_str_full
+            if not summary_response:
+                record['cluster_name_on_cloud'] = handle.cluster_name_on_cloud
 
     def _update_records_with_credentials(
            records: List[Optional[Dict[str, Any]]]) -> None:
@@ -3036,9 +3338,17 @@ def get_clusters(
                expanded_private_key_path = os.path.expanduser(
                    ssh_private_key_path)
                if not os.path.exists(expanded_private_key_path):
-
+                    success = auth_utils.create_ssh_key_files_from_db(
+                        ssh_private_key_path)
+                    if not success:
+                        # If the ssh key files are not found, we do not
+                        # update the record with credentials.
+                        logger.debug(
+                            f'SSH keys not found for cluster {record["name"]} '
+                            f'at key path {ssh_private_key_path}')
+                        continue
            else:
-                private_key_path, _ =
+                private_key_path, _ = auth_utils.get_or_generate_keys()
                expanded_private_key_path = os.path.expanduser(private_key_path)
                if expanded_private_key_path in cached_private_keys:
                    credential['ssh_private_key_content'] = cached_private_keys[
@@ -3052,7 +3362,7 @@ def get_clusters(
            record['credentials'] = credential
 
     def _update_records_with_resources(
-
+            records: List[Optional[Dict[str, Any]]],) -> None:
        """Add the resources to the record."""
        for record in _get_records_with_handle(records):
            handle = record['handle']
@@ -3070,9 +3380,11 @@ def get_clusters(
            record['accelerators'] = (
                f'{handle.launched_resources.accelerators}'
                if handle.launched_resources.accelerators else None)
+            if not include_handle:
+                record.pop('handle', None)
 
-    # Add
-
+    # Add handle info to the records
+    _update_records_with_handle_info(records)
     if include_credentials:
         _update_records_with_credentials(records)
     if refresh == common.StatusRefreshMode.NONE:
@@ -3093,65 +3405,76 @@ def get_clusters(
    else:
        force_refresh_statuses = None
 
-    def
-
-
-
-
-
-
-
-
-
-        if len(request) > 0:
-            # There is an active launch request on the cluster,
-            # so we don't want to update the cluster status until
-            # the request is completed.
-            logger.debug(f'skipping refresh for cluster {cluster_name} '
-                         'as there is an active launch request')
-            return global_user_state.get_cluster_from_name(cluster_name)
-        try:
-            record = refresh_cluster_record(
-                cluster_name,
-                force_refresh_statuses=force_refresh_statuses,
-                acquire_per_cluster_status_lock=True)
-            _update_records_with_resources_str([record])
+    def _refresh_cluster_record(cluster_name):
+        record = _refresh_cluster(cluster_name,
+                                  force_refresh_statuses=force_refresh_statuses,
+                                  include_user_info=True,
+                                  summary_response=summary_response)
+        # record may be None if the cluster is deleted during refresh,
+        # e.g. all the Pods of a cluster on Kubernetes have been
+        # deleted before refresh.
+        if record is not None and 'error' not in record:
+            _update_records_with_handle_info([record])
            if include_credentials:
                _update_records_with_credentials([record])
-
-                exceptions.CloudUserIdentityError,
-                exceptions.ClusterOwnerIdentityMismatchError) as e:
-            # Do not fail the entire refresh process. The caller will
-            # handle the 'UNKNOWN' status, and collect the errors into
-            # a table.
-            record = {'status': 'UNKNOWN', 'error': e}
-        progress.update(task, advance=1)
+        progress.update(task, advance=1)
        return record
 
    cluster_names = [record['name'] for record in records]
+    # TODO(syang): we should try not to leak
+    # request info in backend_utils.py.
+    # Refactor this to use some other info to
+    # determine if a launch is in progress.
+    cluster_names_with_launch_request = {
+        request.cluster_name for request in requests_lib.get_request_tasks(
+            req_filter=requests_lib.RequestTaskFilter(
+                status=[requests_lib.RequestStatus.RUNNING],
+                include_request_names=['sky.launch'],
+                cluster_names=cluster_names,
+                fields=['cluster_name']))
+    }
+    # Preserve the index of the cluster name as it appears on "records"
+    cluster_names_without_launch_request = [
+        (i, cluster_name)
+        for i, cluster_name in enumerate(cluster_names)
+        if cluster_name not in cluster_names_with_launch_request
+    ]
+    # for clusters that have an active launch request, we do not refresh the status
    updated_records = []
-    if len(
+    if len(cluster_names_without_launch_request) > 0:
        with progress:
            updated_records = subprocess_utils.run_in_parallel(
-
-
+                _refresh_cluster_record, [
+                    cluster_name
+                    for _, cluster_name in cluster_names_without_launch_request
+                ])
+    # Preserve the index of the cluster name as it appears on "records"
+    # before filtering for clusters being launched.
+    updated_records_dict: Dict[int, Optional[Dict[str, Any]]] = {
+        cluster_names_without_launch_request[i][0]: updated_records[i]
+        for i in range(len(cluster_names_without_launch_request))
+    }
    # Show information for removed clusters.
    kept_records = []
    autodown_clusters, remaining_clusters, failed_clusters = [], [], []
    for i, record in enumerate(records):
-        if
+        if i not in updated_records_dict:
+            # record was not refreshed, keep the original record
+            kept_records.append(record)
+            continue
+        updated_record = updated_records_dict[i]
+        if updated_record is None:
            if record['to_down']:
-                autodown_clusters.append(
+                autodown_clusters.append(record['name'])
            else:
-                remaining_clusters.append(
-        elif
-            failed_clusters.append(
-                (cluster_names[i], updated_records[i]['error']))
+                remaining_clusters.append(record['name'])
+        elif updated_record['status'] == 'UNKNOWN':
+            failed_clusters.append((record['name'], updated_record['error']))
            # Keep the original record if the status is unknown,
            # so that the user can still see the cluster.
            kept_records.append(record)
        else:
-            kept_records.append(
+            kept_records.append(updated_record)
 
    if autodown_clusters:
        plural = 's' if len(autodown_clusters) > 1 else ''
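The bookkeeping above keeps the original index of each record while only a subset of clusters is refreshed in parallel, so the refreshed results can be merged back positionally and skipped or vanished clusters are handled explicitly. A self-contained sketch of that merge pattern (the `fake_refresh` worker and the record shapes are hypothetical):

```python
import concurrent.futures
from typing import Any, Dict, List, Optional


def fake_refresh(name: str) -> Optional[Dict[str, Any]]:
    """Hypothetical worker: returns an updated record, or None if gone."""
    return None if name == 'gone' else {'name': name, 'status': 'UP'}


records: List[Dict[str, Any]] = [
    {'name': 'a', 'status': 'INIT'},
    {'name': 'launching', 'status': 'INIT'},  # skipped: launch in progress
    {'name': 'gone', 'status': 'INIT'},
]
skip = {'launching'}

# Keep (original_index, name) pairs so results can be mapped back.
to_refresh = [(i, r['name']) for i, r in enumerate(records)
              if r['name'] not in skip]

with concurrent.futures.ThreadPoolExecutor() as pool:
    refreshed = list(pool.map(fake_refresh, [n for _, n in to_refresh]))

# Map: index in `records` -> refreshed record (possibly None).
refreshed_by_index = {to_refresh[j][0]: refreshed[j]
                      for j in range(len(to_refresh))}

kept = []
for i, record in enumerate(records):
    if i not in refreshed_by_index:
        kept.append(record)          # not refreshed: keep the original
    elif refreshed_by_index[i] is None:
        pass                         # cluster disappeared during refresh
    else:
        kept.append(refreshed_by_index[i])

# [{'name': 'a', 'status': 'UP'}, {'name': 'launching', 'status': 'INIT'}]
print(kept)
```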
@@ -3352,13 +3675,8 @@ def check_stale_runtime_on_remote(returncode: int, stderr: str,
    `stderr`. Typically due to the local client version just got updated, and
    the remote runtime is an older version.
    """
-    pattern = re.compile(r'AttributeError: module \'sky\.(.*)\' has no '
-                         r'attribute \'(.*)\'')
    if returncode != 0:
-
-        # the remote cluster. Remove this after 0.10.0 is released.
-        attribute_error = re.findall(pattern, stderr)
-        if attribute_error or 'SkyPilot runtime is too old' in stderr:
+        if 'SkyPilot runtime is too old' in stderr:
            with ux_utils.print_exception_no_traceback():
                raise RuntimeError(
                    f'{colorama.Fore.RED}SkyPilot runtime needs to be updated '
@@ -3502,19 +3820,126 @@ def workspace_lock_id(workspace_name: str) -> str:
    return f'{workspace_name}_workspace'
 
 
+def cluster_tunnel_lock_id(cluster_name: str) -> str:
+    """Get the lock ID for cluster tunnel operations."""
+    return f'{cluster_name}_ssh_tunnel'
+
+
+def open_ssh_tunnel(head_runner: Union[command_runner.SSHCommandRunner,
+                                       command_runner.KubernetesCommandRunner],
+                    port_forward: Tuple[int, int]) -> subprocess.Popen:
+    local_port, remote_port = port_forward
+    if isinstance(head_runner, command_runner.SSHCommandRunner):
+        # Disabling ControlMaster makes things easier to reason about
+        # with respect to resource management/ownership,
+        # as killing the process will close the tunnel too.
+        head_runner.disable_control_master = True
+        head_runner.port_forward_execute_remote_command = True
+
+    # The default connect_timeout of 1s is too short for
+    # connecting to clusters using a jump server.
+    # We use NON_INTERACTIVE mode to avoid allocating a pseudo-tty,
+    # which is counted towards non-idleness.
+    cmd: List[str] = head_runner.port_forward_command(
+        [(local_port, remote_port)],
+        connect_timeout=5,
+        ssh_mode=command_runner.SshMode.NON_INTERACTIVE)
+    if isinstance(head_runner, command_runner.SSHCommandRunner):
+        # cat so the command doesn't exit until we kill it
+        cmd += [f'"echo {_ACK_MESSAGE} && cat"']
+    cmd_str = ' '.join(cmd)
+    logger.debug(f'Running port forward command: {cmd_str}')
+    ssh_tunnel_proc = subprocess.Popen(cmd_str,
+                                       shell=True,
+                                       stdin=subprocess.PIPE,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.PIPE,
+                                       start_new_session=True,
+                                       text=True)
+    # Wait until we receive an ack from the remote cluster or
+    # the SSH connection times out.
+    queue: queue_lib.Queue = queue_lib.Queue()
+    stdout_thread = threading.Thread(
+        target=lambda queue, stdout: queue.put(stdout.readline()),
+        args=(queue, ssh_tunnel_proc.stdout),
+        daemon=True)
+    stdout_thread.start()
+    while ssh_tunnel_proc.poll() is None:
+        try:
+            ack = queue.get_nowait()
+        except queue_lib.Empty:
+            ack = None
+            time.sleep(0.1)
+            continue
+        assert ack is not None
+        if isinstance(
+                head_runner,
+                command_runner.SSHCommandRunner) and ack == f'{_ACK_MESSAGE}\n':
+            break
+        elif isinstance(head_runner, command_runner.KubernetesCommandRunner
+                       ) and _FORWARDING_FROM_MESSAGE in ack:
+            # On kind clusters, this error occurs if we make a request
+            # immediately after the port-forward is established on a new pod:
+            # "Unhandled Error" err="an error occurred forwarding ... -> 46590:
+            # failed to execute portforward in network namespace
+            # "/var/run/netns/cni-...": failed to connect to localhost:46590
+            # inside namespace "...", IPv4: dial tcp4 127.0.0.1:46590:
+            # connect: connection refused
+            # So we need to poll the port on the pod to check if it is open.
+            # We did not observe this with real Kubernetes clusters.
+            timeout = 5
+            port_check_cmd = (
+                # We install netcat in our ray-node container,
+                # so we can use it here.
+                # (See kubernetes-ray.yml.j2)
+                f'end=$((SECONDS+{timeout})); '
+                f'while ! nc -z -w 1 localhost {remote_port}; do '
+                'if (( SECONDS >= end )); then exit 1; fi; '
+                'sleep 0.1; '
+                'done')
+            returncode, stdout, stderr = head_runner.run(port_check_cmd,
+                                                         require_outputs=True,
+                                                         stream_logs=False)
+            if returncode != 0:
+                try:
+                    ssh_tunnel_proc.terminate()
+                    ssh_tunnel_proc.wait(timeout=5)
+                except subprocess.TimeoutExpired:
+                    ssh_tunnel_proc.kill()
+                    ssh_tunnel_proc.wait()
+                finally:
+                    error_msg = (f'Failed to check remote port {remote_port}')
+                    if stdout:
+                        error_msg += f'\n-- stdout --\n{stdout}\n'
+                    raise exceptions.CommandError(returncode=returncode,
+                                                  command=cmd_str,
+                                                  error_msg=error_msg,
+                                                  detailed_reason=stderr)
+            break
+
+    if ssh_tunnel_proc.poll() is not None:
+        stdout, stderr = ssh_tunnel_proc.communicate()
+        error_msg = 'Port forward failed'
+        if stdout:
+            error_msg += f'\n-- stdout --\n{stdout}\n'
+        raise exceptions.CommandError(returncode=ssh_tunnel_proc.returncode,
+                                      command=cmd_str,
+                                      error_msg=error_msg,
+                                      detailed_reason=stderr)
+    return ssh_tunnel_proc
+
+
 T = TypeVar('T')
 
 
-def invoke_skylet_with_retries(
-        handle: 'cloud_vm_ray_backend.CloudVmRayResourceHandle',
-        func: Callable[..., T]) -> T:
+def invoke_skylet_with_retries(func: Callable[..., T]) -> T:
    """Generic helper for making Skylet gRPC requests.
 
    This method handles the common pattern of:
    1. Try the gRPC request
    2. If SSH tunnel is closed, recreate it and retry
    """
-    max_attempts =
+    max_attempts = 5
    backoff = common_utils.Backoff(initial_backoff=0.5)
    last_exception: Optional[Exception] = None
 
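The new `open_ssh_tunnel` uses an "echo ACK && cat" handshake: the tunnel command prints a sentinel once the connection is up and then blocks, while a background thread feeds its stdout into a queue so the parent can poll the process and time out instead of blocking on a read. A self-contained sketch of that handshake, with a plain Python subprocess standing in for the SSH command:

```python
import queue
import subprocess
import sys
import threading
import time

ACK = 'SKYPILOT_ACK'

# Stand-in for the real tunnel command: print an ack, then block like `cat`.
proc = subprocess.Popen(
    [sys.executable, '-c',
     f'import time; print("{ACK}", flush=True); time.sleep(60)'],
    stdout=subprocess.PIPE,
    text=True)

# Read the first stdout line on a daemon thread so the loop below can poll
# the process and enforce a deadline instead of blocking on readline().
lines: queue.Queue = queue.Queue()
threading.Thread(target=lambda: lines.put(proc.stdout.readline()),
                 daemon=True).start()

deadline = time.time() + 5
while proc.poll() is None and time.time() < deadline:
    try:
        line = lines.get_nowait()
    except queue.Empty:
        time.sleep(0.1)
        continue
    if line.strip() == ACK:
        print('tunnel is up')  # at this point the forward would be usable
        break
else:
    raise RuntimeError('tunnel failed to come up')

proc.terminate()  # killing the process tears the tunnel down
proc.wait()
```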
@@ -3523,26 +3948,46 @@ def invoke_skylet_with_retries(
            return func()
        except grpc.RpcError as e:
            last_exception = e
-
-
-
-
-
-
-            if handle.skylet_ssh_tunnel is not None:
-                proc = psutil.Process(handle.skylet_ssh_tunnel.pid)
-                if proc.is_running(
-                ) and proc.status() != psutil.STATUS_ZOMBIE:
-                    recreate_tunnel = False
-            except psutil.NoSuchProcess:
-                pass
-
-            if recreate_tunnel:
-                handle.open_and_update_skylet_tunnel()
-
-            time.sleep(backoff.current_backoff())
-            else:
-                raise e
+            _handle_grpc_error(e, backoff.current_backoff())
+
+    raise RuntimeError(
+        f'Failed to invoke Skylet after {max_attempts} attempts: {last_exception}'
+    ) from last_exception
+
 
-
-
+def invoke_skylet_streaming_with_retries(
+        stream_func: Callable[..., Iterator[T]]) -> Iterator[T]:
+    """Generic helper for making Skylet streaming gRPC requests."""
+    max_attempts = 3
+    backoff = common_utils.Backoff(initial_backoff=0.5)
+    last_exception: Optional[Exception] = None
+
+    for _ in range(max_attempts):
+        try:
+            for response in stream_func():
+                yield response
+            return
+        except grpc.RpcError as e:
+            last_exception = e
+            _handle_grpc_error(e, backoff.current_backoff())
+
+    raise RuntimeError(
+        f'Failed to stream Skylet response after {max_attempts} attempts'
+    ) from last_exception
+
+
+def _handle_grpc_error(e: 'grpc.RpcError', current_backoff: float) -> None:
+    if e.code() == grpc.StatusCode.INTERNAL:
+        with ux_utils.print_exception_no_traceback():
+            raise exceptions.SkyletInternalError(e.details())
+    elif e.code() == grpc.StatusCode.UNAVAILABLE:
+        time.sleep(current_backoff)
+    elif e.code() == grpc.StatusCode.UNIMPLEMENTED or e.code(
+    ) == grpc.StatusCode.UNKNOWN:
+        # Handle backwards compatibility: old server doesn't implement this RPC.
+        # Let the caller fall back to legacy execution.
+        raise exceptions.SkyletMethodNotImplementedError(
+            f'gRPC method not implemented on server, falling back to legacy execution: {e.details()}'
+        )
+    else:
+        raise e