skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,952 @@
+"""SSH-based Kubernetes Cluster Deployment Script"""
+# pylint: disable=line-too-long
+import base64
+import concurrent.futures as cf
+import os
+import re
+import shlex
+import shutil
+import tempfile
+from typing import List, Optional
+
+import colorama
+import yaml
+
+from sky import sky_logging
+from sky.ssh_node_pools import constants
+from sky.ssh_node_pools import utils as ssh_utils
+from sky.ssh_node_pools.deploy import tunnel_utils
+from sky.ssh_node_pools.deploy import utils as deploy_utils
+from sky.utils import rich_utils
+from sky.utils import ux_utils
+
+RESET_ALL = colorama.Style.RESET_ALL
+
+# Get the directory of this script
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+logger = sky_logging.init_logger(__name__)
+
+
+def progress_message(message):
+    """Show a progress message."""
+    logger.info(f'{colorama.Fore.YELLOW}➜ {message}{RESET_ALL}')
+
+
+def success_message(message):
+    """Show a success message."""
+    logger.info(f'{colorama.Fore.GREEN}✔ {message}{RESET_ALL}')
+
+
+def force_update_status(message):
+    """Force update rich spinner status."""
+    rich_utils.force_update_status(ux_utils.spinner_message(message))
+
+
+def run(cleanup: bool = False,
+        infra: Optional[str] = None,
+        kubeconfig_path: str = constants.DEFAULT_KUBECONFIG_PATH):
+    """Deploy a Kubernetes cluster on SSH targets.
+
+    This function reads ~/.sky/ssh_node_pools.yaml and uses it to deploy a
+    Kubernetes cluster on the specified machines.
+
+    Args:
+        cleanup: Whether to clean up the cluster instead of deploying.
+        infra: Name of the cluster in ssh_node_pools.yaml to use.
+            If None, the first cluster in the file will be used.
+        kubeconfig_path: Path to save the Kubernetes configuration file.
+            If None, the default ~/.kube/config will be used.
+    """
+    deploy_utils.check_ssh_cluster_dependencies()
+    action = 'Cleanup' if cleanup else 'Deployment'
+    msg_str = f'Initializing SSH Node Pools {action}...'
+
+    with rich_utils.safe_status(ux_utils.spinner_message(msg_str)):
+        try:
+            deploy_multiple_clusters(infra=infra,
+                                     cleanup=cleanup,
+                                     kubeconfig_path=kubeconfig_path)
+        except Exception as e:  # pylint: disable=broad-except
+            logger.error(str(e))
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError(
+                    'Failed to deploy SkyPilot on some Node Pools.') from e
+
+    # Add empty line for ux-purposes.
+    logger.info('')
+    if cleanup:
+        logger.info(
+            ux_utils.finishing_message(
+                '🎉 SSH Node Pools cleaned up successfully.'))
+    else:
+        logger.info(
+            ux_utils.finishing_message(
+                '🎉 SSH Node Pools set up successfully. ',
+                follow_up_message=(
+                    f'Run `{colorama.Style.BRIGHT}'
+                    f'sky check ssh'
+                    f'{colorama.Style.RESET_ALL}` to verify access, '
+                    f'`{colorama.Style.BRIGHT}sky launch --infra ssh'
+                    f'{colorama.Style.RESET_ALL}` to launch a cluster.')))
+
+
+def deploy_multiple_clusters(
+        infra: Optional[str],
+        ssh_node_pools_file: str = constants.DEFAULT_SSH_NODE_POOLS_PATH,
+        kubeconfig_path: str = constants.DEFAULT_KUBECONFIG_PATH,
+        cleanup: bool = True):
+
+    kubeconfig_path = kubeconfig_path or constants.DEFAULT_KUBECONFIG_PATH
+    kubeconfig_path = os.path.expanduser(kubeconfig_path)
+
+    failed_clusters = []
+    successful_clusters = []
+
+    # Using YAML configuration
+    targets = ssh_utils.load_ssh_targets(ssh_node_pools_file)
+    clusters_config = ssh_utils.get_cluster_config(
+        targets, infra, file_path=ssh_node_pools_file)
+
+    # Print information about clusters being processed
+    num_clusters = len(clusters_config)
+    cluster_names = list(clusters_config.keys())
+    cluster_info = f'Found {num_clusters} Node Pool{"s" if num_clusters > 1 else ""}: {", ".join(cluster_names)}'
+    logger.info(f'{colorama.Fore.CYAN}{cluster_info}{RESET_ALL}')
+
+    # Process each cluster
+    for cluster_name, cluster_config in clusters_config.items():
+        try:
+            action = 'Cleaning up' if cleanup else 'Deploying'
+            force_update_status(f'{action} Node Pool: {cluster_name}')
+            hosts_info = ssh_utils.prepare_hosts_info(cluster_name,
+                                                      cluster_config)
+
+            if not hosts_info:
+                logger.warning(
+                    f'{colorama.Fore.RED}Error: No valid hosts found '
+                    f'for cluster {cluster_name!r}. Skipping.{RESET_ALL}')
+                continue
+
+            context_name = f'ssh-{cluster_name}'
+
+            # Check cluster history
+            os.makedirs(constants.NODE_POOLS_INFO_DIR, exist_ok=True)
+            history_yaml_file = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                             f'{context_name}-history.yaml')
+
+            history = None
+            if os.path.exists(history_yaml_file):
+                logger.debug(f'Loading history from {history_yaml_file}')
+                with open(history_yaml_file, 'r', encoding='utf-8') as f:
+                    history = yaml.safe_load(f)
+            else:
+                logger.debug(f'No history found for {context_name}.')
+
+            history_workers_info = None
+            history_worker_nodes = None
+            history_use_ssh_config = None
+            # Do not support changing anything besides hosts for now
+            if history is not None:
+                for key in ['user', 'identity_file', 'password']:
+                    if not cleanup and history.get(key) != cluster_config.get(
+                            key):
+                        raise ValueError(
+                            f'Cluster configuration has changed for field {key!r}. '
+                            f'Previous value: {history.get(key)}, '
+                            f'Current value: {cluster_config.get(key)}')
+                history_hosts_info = ssh_utils.prepare_hosts_info(
+                    cluster_name, history)
+                if not cleanup and history_hosts_info[0] != hosts_info[0]:
+                    raise ValueError(
+                        f'Cluster configuration has changed for master node. '
+                        f'Previous value: {history_hosts_info[0]}, '
+                        f'Current value: {hosts_info[0]}')
+                history_workers_info = history_hosts_info[1:] if len(
+                    history_hosts_info) > 1 else []
+                history_worker_nodes = [h['ip'] for h in history_workers_info]
+                history_use_ssh_config = [
+                    h.get('use_ssh_config', False) for h in history_workers_info
+                ]
+
+            # Use the first host as the head node and the rest as worker nodes
+            head_host = hosts_info[0]
+            worker_hosts = hosts_info[1:] if len(hosts_info) > 1 else []
+
+            head_node = head_host['ip']
+            worker_nodes = [h['ip'] for h in worker_hosts]
+            ssh_user = head_host['user']
+            ssh_key = head_host['identity_file']
+            head_use_ssh_config = head_host.get('use_ssh_config', False)
+            worker_use_ssh_config = [
+                h.get('use_ssh_config', False) for h in worker_hosts
+            ]
+            password = head_host['password']
+
+            # Deploy this cluster
+            unsuccessful_workers = deploy_single_cluster(
+                cluster_name,
+                head_node,
+                worker_nodes,
+                ssh_user,
+                ssh_key,
+                context_name,
+                password,
+                head_use_ssh_config,
+                worker_use_ssh_config,
+                kubeconfig_path,
+                cleanup,
+                worker_hosts=worker_hosts,
+                history_worker_nodes=history_worker_nodes,
+                history_workers_info=history_workers_info,
+                history_use_ssh_config=history_use_ssh_config)
+
+            if not cleanup:
+                successful_hosts = []
+                for host in cluster_config['hosts']:
+                    if isinstance(host, str):
+                        host_node = host
+                    else:
+                        host_node = host['ip']
+                    if host_node not in unsuccessful_workers:
+                        successful_hosts.append(host)
+                cluster_config['hosts'] = successful_hosts
+                with open(history_yaml_file, 'w', encoding='utf-8') as f:
+                    logger.debug(f'Writing history to {history_yaml_file}')
+                    yaml.dump(cluster_config, f)
+
+            action = 'cleanup' if cleanup else 'deployment'
+            logger.info(
+                f'{colorama.Fore.CYAN}Completed {action} for cluster: {cluster_name}{colorama.Style.RESET_ALL}'
+            )
+            successful_clusters.append(cluster_name)
+        except Exception as e:  # pylint: disable=broad-except
+            reason = str(e)
+            failed_clusters.append((cluster_name, reason))
+            action = 'cleaning' if cleanup else 'deploying'
+            logger.debug(
+                f'Error {action} SSH Node Pool `{cluster_name}`: {reason}')
+
+    if failed_clusters:
+        action = 'clean' if cleanup else 'deploy'
+        msg = f'{colorama.Fore.GREEN}Successfully {action}ed {len(successful_clusters)} cluster(s) ({", ".join(successful_clusters)}). {RESET_ALL}'
+        msg += f'{colorama.Fore.RED}Failed to {action} {len(failed_clusters)} cluster(s): {RESET_ALL}'
+        for cluster_name, reason in failed_clusters:
+            msg += f'\n {cluster_name}: {reason}'
+        raise RuntimeError(msg)
+
+
+def deploy_single_cluster(cluster_name,
+                          head_node,
+                          worker_nodes,
+                          ssh_user,
+                          ssh_key,
+                          context_name,
+                          password,
+                          head_use_ssh_config,
+                          worker_use_ssh_config,
+                          kubeconfig_path,
+                          cleanup,
+                          worker_hosts=None,
+                          history_worker_nodes=None,
+                          history_workers_info=None,
+                          history_use_ssh_config=None) -> List[str]:
+    """Deploy or clean up a single Kubernetes cluster.
+
+    Returns: List of unsuccessful worker nodes.
+    """
+    history_yaml_file = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                     f'{context_name}-history.yaml')
+    cert_file_path = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                  f'{context_name}-cert.pem')
+    key_file_path = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                 f'{context_name}-key.pem')
+    tunnel_log_file_path = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                        f'{context_name}-tunnel.log')
+
+    # Generate the askpass block if password is provided
+    askpass_block = create_askpass_script(password)
+
+    # Token for k3s
+    # TODO (kyuds): make this configurable?
+    k3s_token = constants.K3S_TOKEN
+
+    # Pre-flight checks
+    logger.info(f'Checking SSH connection to head node ({head_node})...')
+    result = deploy_utils.run_remote(
+        head_node,
+        f'echo \'SSH connection successful ({head_node})\'',
+        ssh_user,
+        ssh_key,
+        use_ssh_config=head_use_ssh_config)
+    if result is None:
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(
+                f'Failed to SSH to head node ({head_node}). '
+                f'Please check the SSH configuration and logs for more details.'
+            )
+    elif result.startswith('SSH connection successful'):
+        success_message(f'SSH connection established to head node {head_node}.')
+
+    # Checking history
+    history_exists = (history_worker_nodes is not None and
+                      history_workers_info is not None and
+                      history_use_ssh_config is not None)
+
+    # Cleanup history worker nodes
+    worker_nodes_to_cleanup = []
+    remove_worker_cmds = []
+    if history_exists:
+        for history_node, history_info, use_ssh_config in zip(
+                history_worker_nodes, history_workers_info,
+                history_use_ssh_config):
+            if worker_hosts is not None and history_info not in worker_hosts:
+                logger.debug(
+                    f'Worker node {history_node} not found in YAML config. '
+                    'Removing from history...')
+                worker_nodes_to_cleanup.append(
+                    dict(
+                        node=history_node,
+                        user=ssh_user
+                        if history_info is None else history_info['user'],
+                        ssh_key=ssh_key if history_info is None else
+                        history_info['identity_file'],
+                        askpass_block=(askpass_block if history_info is None
+                                       else create_askpass_script(
+                                           history_info['password'])),
+                        use_ssh_config=use_ssh_config,
+                    ))
+                remove_worker_cmds.append(
+                    f'kubectl delete node -l skypilot-ip={history_node}')
+    # If this is a create operation and there exists some stale log,
+    # cleanup the log for a new file to store new logs.
+    if not cleanup and os.path.exists(tunnel_log_file_path):
+        os.remove(tunnel_log_file_path)
+
+    # If --cleanup flag is set, uninstall k3s and exit
+    if cleanup:
+        # Pickup all nodes
+        worker_nodes_to_cleanup.clear()
+        for node, info, use_ssh_config in zip(worker_nodes, worker_hosts,
+                                              worker_use_ssh_config):
+            worker_nodes_to_cleanup.append(
+                dict(
+                    node=node,
+                    user=ssh_user if info is None else info['user'],
+                    ssh_key=ssh_key if info is None else info['identity_file'],
+                    askpass_block=(askpass_block if info is None else
+                                   create_askpass_script(info['password'])),
+                    use_ssh_config=use_ssh_config,
+                ))
+
+        # Clean up head node
+        cleanup_node(head_node,
+                     ssh_user,
+                     ssh_key,
+                     askpass_block,
+                     use_ssh_config=head_use_ssh_config,
+                     is_worker=False)
+    # Clean up worker nodes
+    force_update_status(f'Cleaning up worker nodes [{cluster_name}]')
+    with cf.ThreadPoolExecutor() as executor:
+        executor.map(lambda kwargs: cleanup_node(**kwargs),
+                     worker_nodes_to_cleanup)
+
+    with cf.ThreadPoolExecutor() as executor:
+        executor.map(lambda cmd: deploy_utils.run_command(cmd, shell=True),
+                     remove_worker_cmds)
+
+    if cleanup:
+        # Remove the context from local kubeconfig if it exists
+        if os.path.isfile(kubeconfig_path):
+            logger.debug(
+                f'Removing context {context_name!r} from local kubeconfig...')
+            deploy_utils.run_command(
+                ['kubectl', 'config', 'delete-context', context_name],
+                shell=False,
+                silent=True)
+            deploy_utils.run_command(
+                ['kubectl', 'config', 'delete-cluster', context_name],
+                shell=False,
+                silent=True)
+            deploy_utils.run_command(
+                ['kubectl', 'config', 'delete-user', context_name],
+                shell=False,
+                silent=True)
+
+            # Update the current context to the first available context
+            contexts = deploy_utils.run_command([
+                'kubectl', 'config', 'view', '-o',
+                'jsonpath=\'{.contexts[0].name}\''
+            ],
+                                                shell=False,
+                                                silent=True)
+            if contexts:
+                deploy_utils.run_command(
+                    ['kubectl', 'config', 'use-context', contexts],
+                    shell=False,
+                    silent=True)
+            else:
+                # If no context is available, simply unset the current context
+                deploy_utils.run_command(
+                    ['kubectl', 'config', 'unset', 'current-context'],
+                    shell=False,
+                    silent=True)
+
+            logger.debug(
+                f'Context {context_name!r} removed from local kubeconfig.')
+
+        for file in [history_yaml_file, cert_file_path, key_file_path]:
+            if os.path.exists(file):
+                os.remove(file)
+
+        # Clean up SSH tunnel after clean up kubeconfig, because the kubectl
+        # will restart the ssh tunnel if it's not running.
+        tunnel_utils.cleanup_kubectl_ssh_tunnel(cluster_name, context_name)
+
+        success_message(f'Node Pool `{cluster_name}` cleaned up successfully.')
+        return []
+
+    logger.debug('Checking TCP Forwarding Options...')
+    cmd = (
+        'if [ "$(sudo sshd -T | grep allowtcpforwarding)" = "allowtcpforwarding yes" ]; then '
+        f'echo "TCP Forwarding already enabled on head node ({head_node})."; '
+        'else '
+        'sudo sed -i \'s/^#\?\s*AllowTcpForwarding.*/AllowTcpForwarding yes/\' '  # pylint: disable=anomalous-backslash-in-string
+        '/etc/ssh/sshd_config && sudo systemctl restart sshd && '
+        f'echo "Successfully enabled TCP Forwarding on head node ({head_node})."; '
+        'fi')
+    result = deploy_utils.run_remote(head_node,
+                                     shlex.quote(cmd),
+                                     ssh_user,
+                                     ssh_key,
+                                     use_ssh_config=head_use_ssh_config,
+                                     use_shell=True)
+    if result is None:
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(
+                f'Failed to setup TCP forwarding on head node ({head_node}). '
+                f'Please check the SSH configuration.')
+
+    # Get effective IP for master node if using SSH config - needed for workers to connect
+    if head_use_ssh_config:
+        effective_master_ip = deploy_utils.get_effective_host_ip(head_node)
+        logger.info(f'{colorama.Fore.GREEN}Resolved head node {head_node} '
+                    f'to {effective_master_ip} from SSH config{RESET_ALL}')
+    else:
+        effective_master_ip = head_node
+
+    # Step 1: Install k3s on the head node
+    # Check if head node has a GPU
+    install_gpu = False
+    force_update_status(
+        f'Deploying SkyPilot runtime on head node ({head_node}).')
+    cmd = f"""
+        {askpass_block}
+        curl -sfL https://get.k3s.io | K3S_TOKEN={k3s_token} K3S_NODE_NAME={head_node} sudo -E -A sh - &&
+        mkdir -p ~/.kube &&
+        sudo -A cp /etc/rancher/k3s/k3s.yaml ~/.kube/config &&
+        sudo -A chown $(id -u):$(id -g) ~/.kube/config &&
+        for i in {{1..3}}; do
+            if kubectl wait --for=condition=ready node --all --timeout=2m --kubeconfig ~/.kube/config; then
+                break
+            else
+                echo 'Waiting for nodes to be ready...'
+                sleep 5
+            fi
+        done
+        if [ $i -eq 3 ]; then
+            echo 'Failed to wait for nodes to be ready after 3 attempts'
+            exit 1
+        fi
+        """
+    result = deploy_utils.run_remote(head_node,
+                                     cmd,
+                                     ssh_user,
+                                     ssh_key,
+                                     use_ssh_config=head_use_ssh_config)
+    if result is None:
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(
+                f'Failed to deploy K3s on head node ({head_node}).')
+    success_message(
+        f'SkyPilot runtime successfully deployed on head node ({head_node}).')
+
+    # Check if head node has a GPU
+    install_gpu = False
+    if deploy_utils.check_gpu(head_node,
+                              ssh_user,
+                              ssh_key,
+                              use_ssh_config=head_use_ssh_config,
+                              is_head=True):
+        install_gpu = True
+
+    # Fetch the head node's internal IP (this will be passed to worker nodes)
+    master_addr = deploy_utils.run_remote(head_node,
+                                          'hostname -I | awk \'{print $1}\'',
+                                          ssh_user,
+                                          ssh_key,
+                                          use_ssh_config=head_use_ssh_config)
+    if master_addr is None:
+        with ux_utils.print_exception_no_traceback():
+            raise RuntimeError(f'Failed to SSH to head node ({head_node}). '
+                               f'Please check the SSH configuration.')
+    logger.debug(f'Master node internal IP: {master_addr}')
+
+    # Step 2: Install k3s on worker nodes and join them to the master node
+    def deploy_worker(args):
+        (i, node, worker_hosts, history_workers_info, ssh_user, ssh_key,
+         askpass_block, worker_use_ssh_config, master_addr, k3s_token) = args
+
+        # If using YAML config with specific worker info
+        if worker_hosts and i < len(worker_hosts):
+            if history_workers_info is not None and worker_hosts[
+                    i] in history_workers_info:
+                logger.info(
+                    f'{colorama.Style.DIM}✔ SkyPilot runtime already deployed on worker node {node}. '
+                    f'Skipping...{RESET_ALL}')
+                return node, True, False
+            worker_user = worker_hosts[i]['user']
+            worker_key = worker_hosts[i]['identity_file']
+            worker_password = worker_hosts[i]['password']
+            worker_askpass = create_askpass_script(worker_password)
+            worker_config = worker_use_ssh_config[i]
+        else:
+            worker_user = ssh_user
+            worker_key = ssh_key
+            worker_askpass = askpass_block
+            worker_config = worker_use_ssh_config[i]
+
+        return start_agent_node(node,
+                                master_addr,
+                                k3s_token,
+                                worker_user,
+                                worker_key,
+                                worker_askpass,
+                                use_ssh_config=worker_config)
+
+    unsuccessful_workers = []
+
+    # Deploy workers in parallel using thread pool
+    force_update_status(
+        f'Deploying SkyPilot runtime on worker nodes [{cluster_name}]')
+    with cf.ThreadPoolExecutor() as executor:
+        futures = []
+        for i, node in enumerate(worker_nodes):
+            args = (i, node, worker_hosts, history_workers_info, ssh_user,
+                    ssh_key, askpass_block, worker_use_ssh_config, master_addr,
+                    k3s_token)
+            futures.append(executor.submit(deploy_worker, args))
+
+        # Check if worker node has a GPU
+        for future in cf.as_completed(futures):
+            node, suc, has_gpu = future.result()
+            install_gpu = install_gpu or has_gpu
+            if not suc:
+                unsuccessful_workers.append(node)
+
+    # Step 3: Configure local kubectl to connect to the cluster
+    force_update_status(f'Setting up SkyPilot configuration [{cluster_name}]')
+
+    # Create temporary directory for kubeconfig operations
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_kubeconfig = os.path.join(temp_dir, 'kubeconfig')
+
+        # Get the kubeconfig from remote server
+        if head_use_ssh_config:
+            scp_cmd = ['scp', head_node + ':~/.kube/config', temp_kubeconfig]
+        else:
+            scp_cmd = [
+                'scp', '-o', 'StrictHostKeyChecking=no', '-o',
+                'IdentitiesOnly=yes', '-i', ssh_key,
+                f'{ssh_user}@{head_node}:~/.kube/config', temp_kubeconfig
+            ]
+        deploy_utils.run_command(scp_cmd, shell=False)
+
+        # Create the directory for the kubeconfig file if it doesn't exist
+        deploy_utils.ensure_directory_exists(kubeconfig_path)
+
+        # Create empty kubeconfig if it doesn't exist
+        if not os.path.isfile(kubeconfig_path):
+            open(kubeconfig_path, 'a', encoding='utf-8').close()
+
+        # Modify the temporary kubeconfig to update server address and context name
+        modified_config = os.path.join(temp_dir, 'modified_config')
+        with open(temp_kubeconfig, 'r', encoding='utf-8') as f_in:
+            with open(modified_config, 'w', encoding='utf-8') as f_out:
+                in_cluster = False
+                in_user = False
+                client_cert_data = None
+                client_key_data = None
+
+                for line in f_in:
+                    if 'clusters:' in line:
+                        in_cluster = True
+                        in_user = False
+                    elif 'users:' in line:
+                        in_cluster = False
+                        in_user = True
+                    elif 'contexts:' in line:
+                        in_cluster = False
+                        in_user = False
+
+                    # Skip certificate authority data in cluster section
+                    if in_cluster and 'certificate-authority-data:' in line:
+                        continue
+                    # Skip client certificate data in user section but extract it
+                    elif in_user and 'client-certificate-data:' in line:
+                        client_cert_data = line.split(':', 1)[1].strip()
+                        continue
+                    # Skip client key data in user section but extract it
+                    elif in_user and 'client-key-data:' in line:
+                        client_key_data = line.split(':', 1)[1].strip()
+                        continue
+                    elif in_cluster and 'server:' in line:
+                        # Initially just set to the effective master IP
+                        # (will be changed to localhost by setup_kubectl_ssh_tunnel later)
+                        f_out.write(
+                            f'    server: https://{effective_master_ip}:6443\n')
+                        f_out.write('    insecure-skip-tls-verify: true\n')
+                        continue
+
+                    # Replace default context names with user-provided context name
+                    line = line.replace('name: default',
+                                        f'name: {context_name}')
+                    line = line.replace('cluster: default',
+                                        f'cluster: {context_name}')
+                    line = line.replace('user: default',
+                                        f'user: {context_name}')
+                    line = line.replace('current-context: default',
+                                        f'current-context: {context_name}')
+
+                    f_out.write(line)
+
+                # Save certificate data if available
+
+                if client_cert_data:
+                    # Decode base64 data and save as PEM
+                    try:
+                        # Clean up the certificate data by removing whitespace
+                        clean_cert_data = ''.join(client_cert_data.split())
+                        cert_pem = base64.b64decode(clean_cert_data).decode(
+                            'utf-8')
+
+                        # Check if the data already looks like a PEM file
+                        has_begin = '-----BEGIN CERTIFICATE-----' in cert_pem
+                        has_end = '-----END CERTIFICATE-----' in cert_pem
+
+                        if not has_begin or not has_end:
+                            logger.debug(
+                                'Warning: Certificate data missing PEM markers, attempting to fix...'
+                            )
+                            # Add PEM markers if missing
+                            if not has_begin:
+                                cert_pem = f'-----BEGIN CERTIFICATE-----\n{cert_pem}'
+                            if not has_end:
+                                cert_pem = f'{cert_pem}\n-----END CERTIFICATE-----'
+
+                        # Write the certificate
+                        with open(cert_file_path, 'w',
+                                  encoding='utf-8') as cert_file:
+                            cert_file.write(cert_pem)
+
+                        # Verify the file was written correctly
+                        if os.path.getsize(cert_file_path) > 0:
+                            logger.debug(
+                                f'Successfully saved certificate data ({len(cert_pem)} bytes)'
+                            )
+
+                            # Quick validation of PEM format
+                            with open(cert_file_path, 'r',
+                                      encoding='utf-8') as f:
+                                content = f.readlines()
+                                first_line = content[0].strip(
+                                ) if content else ''
+                                last_line = content[-1].strip(
+                                ) if content else ''
+
+                                if not first_line.startswith(
+                                        '-----BEGIN') or not last_line.startswith(
+                                            '-----END'):
+                                    logger.debug(
+                                        'Warning: Certificate may not be in proper PEM format'
+                                    )
+                        else:
+                            logger.error(
+                                f'{colorama.Fore.RED}Error: '
+                                f'Certificate file is empty{RESET_ALL}')
+                    except Exception as e:  # pylint: disable=broad-except
+                        logger.error(f'{colorama.Fore.RED}'
+                                     f'Error processing certificate data: {e}'
+                                     f'{RESET_ALL}')
+
+                if client_key_data:
+                    # Decode base64 data and save as PEM
+                    try:
+                        # Clean up the key data by removing whitespace
+                        clean_key_data = ''.join(client_key_data.split())
+                        key_pem = base64.b64decode(clean_key_data).decode(
+                            'utf-8')
+
+                        # Check if the data already looks like a PEM file
+
+                        # Check for EC key format
+                        if 'EC PRIVATE KEY' in key_pem:
+                            # Handle EC KEY format directly
+                            match_ec = re.search(
+                                r'-----BEGIN EC PRIVATE KEY-----(.*?)-----END EC PRIVATE KEY-----',
+                                key_pem, re.DOTALL)
+                            if match_ec:
+                                # Extract and properly format EC key
+                                key_content = match_ec.group(1).strip()
+                                key_pem = f'-----BEGIN EC PRIVATE KEY-----\n{key_content}\n-----END EC PRIVATE KEY-----'
+                            else:
+                                # Extract content and assume EC format
+                                key_content = re.sub(r'-----BEGIN.*?-----', '',
+                                                     key_pem)
+                                key_content = re.sub(r'-----END.*?-----.*', '',
+                                                     key_content).strip()
+                                key_pem = f'-----BEGIN EC PRIVATE KEY-----\n{key_content}\n-----END EC PRIVATE KEY-----'
+                        else:
+                            # Handle regular private key format
+                            has_begin = any(marker in key_pem for marker in [
+                                '-----BEGIN PRIVATE KEY-----',
+                                '-----BEGIN RSA PRIVATE KEY-----'
+                            ])
+                            has_end = any(marker in key_pem for marker in [
+                                '-----END PRIVATE KEY-----',
+                                '-----END RSA PRIVATE KEY-----'
+                            ])
+
+                            if not has_begin or not has_end:
+                                logger.debug(
+                                    'Warning: Key data missing PEM markers, attempting to fix...'
+                                )
+                                # Add PEM markers if missing
+                                if not has_begin:
+                                    key_pem = f'-----BEGIN PRIVATE KEY-----\n{key_pem}'
+                                if not has_end:
+                                    key_pem = f'{key_pem}\n-----END PRIVATE KEY-----'
+                                    # Remove any trailing characters after END marker
+                                    key_pem = re.sub(
+                                        r'(-----END PRIVATE KEY-----).*', r'\1',
+                                        key_pem)
+
+                        # Write the key
+                        with open(key_file_path, 'w',
+                                  encoding='utf-8') as key_file:
+                            key_file.write(key_pem)
+
+                        # Verify the file was written correctly
+                        if os.path.getsize(key_file_path) > 0:
+                            logger.debug(
+                                f'Successfully saved key data ({len(key_pem)} bytes)'
+                            )
+
+                            # Quick validation of PEM format
+                            with open(key_file_path, 'r',
+                                      encoding='utf-8') as f:
+                                content = f.readlines()
+                                first_line = content[0].strip(
+                                ) if content else ''
+                                last_line = content[-1].strip(
+                                ) if content else ''
+
+                                if not first_line.startswith(
+                                        '-----BEGIN') or not last_line.startswith(
+                                            '-----END'):
+                                    logger.debug(
+                                        'Warning: Key may not be in proper PEM format'
+                                    )
+                        else:
+                            logger.error(f'{colorama.Fore.RED}Error: '
+                                         f'Key file is empty{RESET_ALL}')
+                    except Exception as e:  # pylint: disable=broad-except
+                        logger.error(f'{colorama.Fore.RED}'
+                                     f'Error processing key data: {e}'
+                                     f'{RESET_ALL}')
+
+        # First check if context name exists and delete it if it does
+        # TODO(romilb): Should we throw an error here instead?
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'delete-context', context_name],
+            shell=False,
+            silent=True)
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'delete-cluster', context_name],
+            shell=False,
+            silent=True)
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'delete-user', context_name],
+            shell=False,
+            silent=True)
+
+        # Merge the configurations using kubectl
+        merged_config = os.path.join(temp_dir, 'merged_config')
+        os.environ['KUBECONFIG'] = f'{kubeconfig_path}:{modified_config}'
+        with open(merged_config, 'w', encoding='utf-8') as merged_file:
+            kubectl_cmd = ['kubectl', 'config', 'view', '--flatten']
+            result = deploy_utils.run_command(kubectl_cmd, shell=False)
+            if result:
+                merged_file.write(result)
+
+        # Replace the kubeconfig with the merged config
+        shutil.move(merged_config, kubeconfig_path)
+
+        # Set the new context as the current context
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'use-context', context_name],
+            shell=False,
+            silent=True)
+
+    # Always set up SSH tunnel since we assume only port 22 is accessible
+    tunnel_utils.setup_kubectl_ssh_tunnel(head_node,
+                                          ssh_user,
+                                          ssh_key,
+                                          context_name,
+                                          use_ssh_config=head_use_ssh_config)
+
+    logger.debug(f'kubectl configured with new context \'{context_name}\'.')
+    success_message(f'SkyPilot runtime is up [{cluster_name}].')
+
+    # Install GPU operator if a GPU was detected on any node
+    if install_gpu:
+        force_update_status(f'Configuring NVIDIA GPUs [{cluster_name}]')
+        cmd = f"""
+            {askpass_block}
+            curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
+            chmod 700 get_helm.sh &&
+            ./get_helm.sh &&
+            helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update &&
+            kubectl create namespace gpu-operator --kubeconfig ~/.kube/config || true &&
+            sudo -A ln -s /sbin/ldconfig /sbin/ldconfig.real || true &&
+            helm install gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator \\
+                --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \\
+                --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \\
+                --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \\
+                --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \\
+                --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \\
+                --set 'toolkit.env[2].value=nvidia' &&
+            echo 'Waiting for GPU operator installation...' &&
+            while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:' || ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu.product'; do
+                echo 'Waiting for GPU operator...'
+                sleep 5
+            done
+            echo 'GPU operator installed successfully.'
+            """
+        result = deploy_utils.run_remote(head_node,
+                                         cmd,
+                                         ssh_user,
+                                         ssh_key,
+                                         use_ssh_config=head_use_ssh_config)
+        if result is None:
+            logger.error(f'{colorama.Fore.RED}Failed to install GPU Operator.'
+                         f'{RESET_ALL}')
+        else:
+            success_message('GPU Operator installed.')
+    else:
+        logger.debug('No GPUs detected. Skipping GPU Operator installation.')
+
+    # The env var KUBECONFIG ensures sky check uses the right kubeconfig
+    os.environ['KUBECONFIG'] = kubeconfig_path
+    deploy_utils.run_command(['sky', 'check', 'ssh'], shell=False)
+
+    success_message('SkyPilot configured successfully.')
+
+    if unsuccessful_workers:
+        quoted_unsuccessful_workers = [
+            f'"{worker}"' for worker in unsuccessful_workers
+        ]
+
+        logger.info(f'{colorama.Fore.YELLOW}'
+                    'Failed to deploy Kubernetes on the following nodes: '
+                    f'{", ".join(quoted_unsuccessful_workers)}. Please check '
+                    f'the logs for more details.{RESET_ALL}')
+    else:
+        success_message(f'Node Pool `{cluster_name}` deployed successfully.')
+
+    return unsuccessful_workers
+
+
+def create_askpass_script(password):
+    """Create an askpass script block for sudo with password."""
+    if not password:
+        return ''
+
+    return f"""
+# Create temporary askpass script
+ASKPASS_SCRIPT=$(mktemp)
+trap 'rm -f $ASKPASS_SCRIPT' EXIT INT TERM ERR QUIT
+cat > $ASKPASS_SCRIPT << EOF
+#!/bin/bash
+echo {password}
+EOF
+chmod 700 $ASKPASS_SCRIPT
+# Use askpass
+export SUDO_ASKPASS=$ASKPASS_SCRIPT
+"""
+
+
+def cleanup_node(node,
+                 user,
+                 ssh_key,
+                 askpass_block,
+                 use_ssh_config=False,
+                 is_worker=True):
+    """Uninstall k3s and clean up the state on a node."""
+    ntype = 'worker' if is_worker else 'head'
+    force_update_status(f'Cleaning up {ntype} node ({node})...')
+    script = f'k3s{"-agent" if is_worker else ""}-uninstall.sh'
+    cmd = f"""
+        {askpass_block}
+        echo 'Uninstalling k3s...' &&
+        sudo -A /usr/local/bin/{script} || true &&
+        sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
+        """
+    result = deploy_utils.run_remote(node,
+                                     cmd,
+                                     user,
+                                     ssh_key,
+                                     use_ssh_config=use_ssh_config)
+    if result is None:
+        logger.error(f'{colorama.Fore.RED}Failed to clean up {ntype} '
+                     f'node ({node}).{RESET_ALL}')
+    else:
+        success_message(f'Node {node} cleaned up successfully.')
+
+
+def start_agent_node(node,
+                     master_addr,
+                     k3s_token,
+                     user,
+                     ssh_key,
+                     askpass_block,
+                     use_ssh_config=False):
+    """Start a k3s agent node.
+    Returns: if the start is successful, and whether the node has a GPU."""
+    logger.info(f'Deploying worker node ({node}).')
+    cmd = f"""
+        {askpass_block}
+        curl -sfL https://get.k3s.io | K3S_NODE_NAME={node} INSTALL_K3S_EXEC='agent --node-label skypilot-ip={node}' \
+        K3S_URL=https://{master_addr}:6443 K3S_TOKEN={k3s_token} sudo -E -A sh -
+        """
+    result = deploy_utils.run_remote(node,
+                                     cmd,
+                                     user,
+                                     ssh_key,
+                                     use_ssh_config=use_ssh_config)
+    if result is None:
+        logger.error(f'{colorama.Fore.RED}✗ Failed to deploy K3s on worker '
+                     f'node ({node}).{RESET_ALL}')
+        return node, False, False
+    success_message(
+        f'SkyPilot runtime successfully deployed on worker node ({node}).')
+    # Check if worker node has a GPU
+    if deploy_utils.check_gpu(node,
+                              user,
+                              ssh_key,
+                              use_ssh_config=use_ssh_config):
+        logger.info(f'{colorama.Fore.YELLOW}GPU detected on worker node '
+                    f'({node}).{RESET_ALL}')
+        return node, True, True
+    return node, True, False