skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported public registry. It is provided for informational purposes only.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/utils/kubernetes/deploy_remote_cluster.py → sky/utils/kubernetes/deploy_ssh_node_pools.py

@@ -1,6 +1,5 @@
 """SSH-based Kubernetes Cluster Deployment Script"""
-#
-import argparse
+# pylint: disable=line-too-long
 import base64
 import concurrent.futures as cf
 import os
@@ -11,10 +10,13 @@ import shutil
 import subprocess
 import sys
 import tempfile
-from typing import List, Set
+from typing import List, Optional, Set
 
+import colorama
 import yaml
 
+from sky import sky_logging
+from sky.utils import rich_utils
 from sky.utils import ux_utils
 from sky.utils.kubernetes import ssh_utils
 
@@ -24,6 +26,9 @@ GREEN = '\033[0;32m'
 YELLOW = '\033[1;33m'
 WARNING_YELLOW = '\x1b[33m'
 NC = '\033[0m'  # No color
+DIM = colorama.Style.DIM
+CYAN = colorama.Fore.CYAN
+RESET_ALL = colorama.Style.RESET_ALL
 
 DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
 SSH_CONFIG_PATH = os.path.expanduser('~/.ssh/config')
@@ -32,69 +37,10 @@ NODE_POOLS_INFO_DIR = os.path.expanduser('~/.sky/ssh_node_pools_info')
 # Get the directory of this script
 SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
 
+logger = sky_logging.init_logger(__name__)
 
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description='Deploy a Kubernetes cluster on remote machines.')
-    parser.add_argument(
-        '--infra', help='Name of the cluster in ssh_node_pools.yaml to use')
-    parser.add_argument(
-        '--ssh-node-pools-file',
-        dest='ssh_node_pools_file',
-        default=ssh_utils.DEFAULT_SSH_NODE_POOLS_PATH,
-        help=
-        f'Path to SSH node pools YAML file (default: {ssh_utils.DEFAULT_SSH_NODE_POOLS_PATH})'
-    )
-    parser.add_argument(
-        '--kubeconfig-path',
-        dest='kubeconfig_path',
-        default=DEFAULT_KUBECONFIG_PATH,
-        help=
-        f'Path to save the kubeconfig file (default: {DEFAULT_KUBECONFIG_PATH})'
-    )
-    parser.add_argument(
-        '--use-ssh-config',
-        dest='use_ssh_config',
-        action='store_true',
-        help='Use SSH config for host settings instead of explicit parameters')
-    #TODO(romilb): The `sky local up --ips` command is deprecated and these args are now captured in the ssh_node_pools.yaml file.
-    # Remove these args after 0.11.0 release.
-    parser.add_argument(
-        '--ips-file',
-        dest='ips_file',
-        help=
-        '[Deprecated, use --ssh-node-pools-file instead] File containing IP addresses or SSH host entries (one per line)'
-    )
-    parser.add_argument(
-        '--user',
-        help=
-        '[Deprecated, use --ssh-node-pools-file instead] Username to use for SSH (overridden by SSH config if host exists there)'
-    )
-    parser.add_argument(
-        '--ssh-key',
-        dest='ssh_key',
-        help=
-        '[Deprecated, use --ssh-node-pools-file instead] Path to SSH private key (overridden by SSH config if host exists there)'
-    )
-    parser.add_argument(
-        '--context-name',
-        dest='context_name',
-        default='default',
-        help=
-        '[Deprecated, use --ssh-node-pools-file instead] Kubernetes context name'
-    )
-    parser.add_argument('--cleanup',
-                        action='store_true',
-                        help='Clean up the cluster')
-    parser.add_argument(
-        '--password',
-        help='[Deprecated, use --ssh-node-pools-file instead] Password for sudo'
-    )
-
-    return parser.parse_args()
 
-
-def run_command(cmd, shell=False):
+def run_command(cmd, shell=False, silent=False):
     """Run a local command and return the output."""
     process = subprocess.run(cmd,
                              shell=shell,
@@ -102,9 +48,10 @@ def run_command(cmd, shell=False):
                              text=True,
                              check=False)
     if process.returncode != 0:
-
-
-
+        if not silent:
+            logger.error(f'{RED}Error executing command: {cmd}{NC}\n'
+                         f'STDOUT: {process.stdout}\n'
+                         f'STDERR: {process.stderr}')
         return None
     return process.stdout.strip()
 
@@ -132,8 +79,12 @@ def run_remote(node,
                connect_timeout=30,
                use_ssh_config=False,
                print_output=False,
-               use_shell=False
-
+               use_shell=False,
+               silent=False):
+    """Run a command on a remote machine via SSH.
+
+    silent is used for gpu checking (will show error logs when no gpus are found)"""
+    ssh_cmd: List[str]
     if use_ssh_config:
         # Use SSH config for connection parameters
         ssh_cmd = ['ssh', node, cmd]
@@ -153,20 +104,19 @@ def run_remote(node,
         ssh_cmd.append(f'{user}@{node}' if user else node)
     ssh_cmd.append(cmd)
 
-    if use_shell
-
-
-    process = subprocess.run(ssh_cmd,
+    subprocess_cmd = ' '.join(ssh_cmd) if use_shell else ssh_cmd
+    process = subprocess.run(subprocess_cmd,
                              capture_output=True,
                              text=True,
                              check=False,
                              shell=use_shell)
     if process.returncode != 0:
-
-
+        if not silent:
+            logger.error(f'{RED}Error executing command {cmd} on {node}:{NC} '
+                         f'{process.stderr}')
         return None
     if print_output:
-
+        logger.info(process.stdout)
     return process.stdout.strip()
 
 
@@ -191,12 +141,17 @@ export SUDO_ASKPASS=$ASKPASS_SCRIPT
 
 def progress_message(message):
     """Show a progress message."""
-
+    logger.info(f'{YELLOW}➜ {message}{NC}')
 
 
 def success_message(message):
     """Show a success message."""
-
+    logger.info(f'{GREEN}✔ {message}{NC}')
+
+
+def force_update_status(message):
+    """Force update rich spinner status."""
+    rich_utils.force_update_status(ux_utils.spinner_message(message))
 
 
 def cleanup_server_node(node,
@@ -205,7 +160,7 @@ def cleanup_server_node(node,
                         askpass_block,
                         use_ssh_config=False):
     """Uninstall k3s and clean up the state on a server node."""
-
+    force_update_status(f'Cleaning up head node ({node})...')
     cmd = f"""
         {askpass_block}
         echo 'Uninstalling k3s...' &&
@@ -214,7 +169,7 @@ def cleanup_server_node(node,
     """
     result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
     if result is None:
-
+        logger.error(f'{RED}Failed to clean up head node ({node}).{NC}')
     else:
         success_message(f'Node {node} cleaned up successfully.')
 
@@ -225,7 +180,7 @@ def cleanup_agent_node(node,
                        askpass_block,
                        use_ssh_config=False):
     """Uninstall k3s and clean up the state on an agent node."""
-
+    force_update_status(f'Cleaning up worker node ({node})...')
     cmd = f"""
         {askpass_block}
         echo 'Uninstalling k3s...' &&
@@ -234,7 +189,7 @@ def cleanup_agent_node(node,
     """
     result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
     if result is None:
-
+        logger.error(f'{RED}Failed to clean up worker node ({node}).{NC}')
     else:
         success_message(f'Node {node} cleaned up successfully.')
 
@@ -248,6 +203,7 @@ def start_agent_node(node,
                      use_ssh_config=False):
     """Start a k3s agent node.
     Returns: if the start is successful, and if the node has a GPU."""
+    logger.info(f'Deploying worker node ({node}).')
     cmd = f"""
         {askpass_block}
         curl -sfL https://get.k3s.io | K3S_NODE_NAME={node} INSTALL_K3S_EXEC='agent --node-label skypilot-ip={node}' \
@@ -255,12 +211,14 @@ def start_agent_node(node,
     """
     result = run_remote(node, cmd, user, ssh_key, use_ssh_config=use_ssh_config)
     if result is None:
-
+        logger.error(
+            f'{RED}✗ Failed to deploy K3s on worker node ({node}).{NC}')
         return node, False, False
-    success_message(
+    success_message(
+        f'SkyPilot runtime successfully deployed on worker node ({node}).')
     # Check if worker node has a GPU
     if check_gpu(node, user, ssh_key, use_ssh_config=use_ssh_config):
-
+        logger.info(f'{YELLOW}GPU detected on worker node ({node}).{NC}')
         return node, True, True
     return node, True, False
 
@@ -268,7 +226,12 @@ def start_agent_node(node,
 def check_gpu(node, user, ssh_key, use_ssh_config=False):
     """Check if a node has a GPU."""
     cmd = 'command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null'
-    result = run_remote(node,
+    result = run_remote(node,
+                        cmd,
+                        user,
+                        ssh_key,
+                        use_ssh_config=use_ssh_config,
+                        silent=True)
     return result is not None
 
 
@@ -399,7 +362,7 @@ def setup_kubectl_ssh_tunnel(head_node,
     has_cert_files = os.path.isfile(client_cert_file) and os.path.isfile(
         client_key_file)
     if has_cert_files:
-
+        logger.info(
             f'{GREEN}Client certificate data extracted and will be used for authentication{NC}'
         )
 
@@ -426,22 +389,22 @@ def setup_kubectl_ssh_tunnel(head_node,
     success_message(
         f'SSH tunnel configured through kubectl credential plugin on port {port}'
     )
-
+    logger.info(
         f'{GREEN}Your kubectl connection is now tunneled through SSH (port {port}).{NC}'
     )
-
+    logger.info(
         f'{GREEN}This tunnel will be automatically established when needed.{NC}'
    )
-
+    logger.info(
         f'{GREEN}Credential TTL set to {ttl_seconds}s to ensure tunnel health is checked frequently.{NC}'
     )
 
     return port
 
 
-def cleanup_kubectl_ssh_tunnel(context_name):
+def cleanup_kubectl_ssh_tunnel(cluster_name, context_name):
     """Clean up the SSH tunnel for a specific context"""
-    progress_message(f'Cleaning up SSH tunnel for
+    progress_message(f'Cleaning up SSH tunnel for `{cluster_name}`...')
 
     # Path to cleanup script
     cleanup_script = os.path.join(SCRIPT_DIR, 'cleanup-tunnel.sh')
@@ -456,201 +419,148 @@ def cleanup_kubectl_ssh_tunnel(context_name):
                        stderr=subprocess.DEVNULL,
                        check=False)
 
-        success_message(f'SSH tunnel for
+        success_message(f'SSH tunnel for `{cluster_name}` cleaned up.')
     else:
-
+        logger.error(f'{YELLOW}Cleanup script not found: {cleanup_script}{NC}')
 
 
-def
-
+def deploy_clusters(
+        infra: Optional[str],
+        ssh_node_pools_file: str = ssh_utils.DEFAULT_SSH_NODE_POOLS_PATH,
+        kubeconfig_path: Optional[str] = None,
+        cleanup: bool = True):
 
-    kubeconfig_path =
-
+    kubeconfig_path = kubeconfig_path or DEFAULT_KUBECONFIG_PATH
+    kubeconfig_path = os.path.expanduser(kubeconfig_path)
 
     failed_clusters = []
     successful_clusters = []
 
-    #
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        head_node = hosts[0]
-        worker_nodes = hosts[1:]
-        ssh_user = args.user if not global_use_ssh_config else ''
-        ssh_key = args.ssh_key if not global_use_ssh_config else ''
-        context_name = args.context_name
-        password = args.password
-
-        # Check if hosts are in SSH config
-        head_use_ssh_config = global_use_ssh_config or ssh_utils.check_host_in_ssh_config(
-            head_node)
-        worker_use_ssh_config = [
-            global_use_ssh_config or ssh_utils.check_host_in_ssh_config(node)
-            for node in worker_nodes
-        ]
+    # Using YAML configuration
+    targets = ssh_utils.load_ssh_targets(ssh_node_pools_file)
+    clusters_config = ssh_utils.get_cluster_config(
+        targets, infra, file_path=ssh_node_pools_file)
+
+    # Print information about clusters being processed
+    num_clusters = len(clusters_config)
+    cluster_names = list(clusters_config.keys())
+    cluster_info = f'Found {num_clusters} Node Pool{"s" if num_clusters > 1 else ""}: {", ".join(cluster_names)}'
+    logger.info(f'{colorama.Fore.CYAN}{cluster_info}{colorama.Style.RESET_ALL}')
+
+    # Process each cluster
+    for cluster_name, cluster_config in clusters_config.items():
+        try:
+            action = 'Cleaning up' if cleanup else 'Deploying'
+            force_update_status(f'{action} Node Pool: {cluster_name}')
+            hosts_info = ssh_utils.prepare_hosts_info(cluster_name,
+                                                      cluster_config)
+
+            if not hosts_info:
+                logger.warning(
+                    f'{RED}Error: No valid hosts found for cluster {cluster_name!r}. Skipping.{NC}'
+                )
+                continue
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                cluster_name, cluster_config)
-
-            if not hosts_info:
-                print(
-                    f'{RED}Error: No valid hosts found for cluster {cluster_name!r}. Skipping.{NC}'
-                )
-                continue
-
-            # Generate a unique context name for each cluster
-            context_name = args.context_name
-            if context_name == 'default':
-                context_name = 'ssh-' + cluster_name
-
-            # Check cluster history
-            os.makedirs(NODE_POOLS_INFO_DIR, exist_ok=True)
-            history_yaml_file = os.path.join(
-                NODE_POOLS_INFO_DIR, f'{context_name}-history.yaml')
-
-            history = None
-            if os.path.exists(history_yaml_file):
-                print(
-                    f'{YELLOW}Loading history from {history_yaml_file}{NC}')
-                with open(history_yaml_file, 'r', encoding='utf-8') as f:
-                    history = yaml.safe_load(f)
-            else:
-                print(f'{YELLOW}No history found for {context_name}.{NC}')
-
-            history_workers_info = None
-            history_worker_nodes = None
-            history_use_ssh_config = None
-            # Do not support changing anything besides hosts for now
-            if history is not None:
-                for key in ['user', 'identity_file', 'password']:
-                    if not args.cleanup and history.get(
-                            key) != cluster_config.get(key):
-                        raise ValueError(
-                            f'Cluster configuration has changed for field {key!r}. '
-                            f'Previous value: {history.get(key)}, '
-                            f'Current value: {cluster_config.get(key)}')
-                history_hosts_info = ssh_utils.prepare_hosts_info(
-                    cluster_name, history)
-                if not args.cleanup and history_hosts_info[0] != hosts_info[
-                        0]:
+            context_name = f'ssh-{cluster_name}'
+
+            # Check cluster history
+            os.makedirs(NODE_POOLS_INFO_DIR, exist_ok=True)
+            history_yaml_file = os.path.join(NODE_POOLS_INFO_DIR,
+                                             f'{context_name}-history.yaml')
+
+            history = None
+            if os.path.exists(history_yaml_file):
+                logger.debug(f'Loading history from {history_yaml_file}')
+                with open(history_yaml_file, 'r', encoding='utf-8') as f:
+                    history = yaml.safe_load(f)
+            else:
+                logger.debug(f'No history found for {context_name}.')
+
+            history_workers_info = None
+            history_worker_nodes = None
+            history_use_ssh_config = None
+            # Do not support changing anything besides hosts for now
+            if history is not None:
+                for key in ['user', 'identity_file', 'password']:
+                    if not cleanup and history.get(key) != cluster_config.get(
+                            key):
                         raise ValueError(
-                            f'Cluster configuration has changed for
-                            f'Previous value: {
-                            f'Current value: {
-
-
-
-
-
-
-
-
-                ]
-
-
-
-            worker_hosts = hosts_info[1:] if len(hosts_info) > 1 else []
-
-            head_node = head_host['ip']
-            worker_nodes = [h['ip'] for h in worker_hosts]
-            ssh_user = head_host['user']
-            ssh_key = head_host['identity_file']
-            head_use_ssh_config = global_use_ssh_config or head_host.get(
-                'use_ssh_config', False)
-            worker_use_ssh_config = [
-                global_use_ssh_config or h.get('use_ssh_config', False)
-                for h in worker_hosts
+                            f'Cluster configuration has changed for field {key!r}. '
+                            f'Previous value: {history.get(key)}, '
+                            f'Current value: {cluster_config.get(key)}')
+                history_hosts_info = ssh_utils.prepare_hosts_info(
+                    cluster_name, history)
+                if not cleanup and history_hosts_info[0] != hosts_info[0]:
+                    raise ValueError(
+                        f'Cluster configuration has changed for master node. '
+                        f'Previous value: {history_hosts_info[0]}, '
+                        f'Current value: {hosts_info[0]}')
+                history_workers_info = history_hosts_info[1:] if len(
+                    history_hosts_info) > 1 else []
+                history_worker_nodes = [h['ip'] for h in history_workers_info]
+                history_use_ssh_config = [
+                    h.get('use_ssh_config', False) for h in history_workers_info
                 ]
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+            # Use the first host as the head node and the rest as worker nodes
+            head_host = hosts_info[0]
+            worker_hosts = hosts_info[1:] if len(hosts_info) > 1 else []
+
+            head_node = head_host['ip']
+            worker_nodes = [h['ip'] for h in worker_hosts]
+            ssh_user = head_host['user']
+            ssh_key = head_host['identity_file']
+            head_use_ssh_config = head_host.get('use_ssh_config', False)
+            worker_use_ssh_config = [
+                h.get('use_ssh_config', False) for h in worker_hosts
+            ]
+            password = head_host['password']
+
+            # Deploy this cluster
+            unsuccessful_workers = deploy_cluster(
+                cluster_name,
+                head_node,
+                worker_nodes,
+                ssh_user,
+                ssh_key,
+                context_name,
+                password,
+                head_use_ssh_config,
+                worker_use_ssh_config,
+                kubeconfig_path,
+                cleanup,
+                worker_hosts=worker_hosts,
+                history_worker_nodes=history_worker_nodes,
+                history_workers_info=history_workers_info,
+                history_use_ssh_config=history_use_ssh_config)
+
+            if not cleanup:
+                successful_hosts = []
+                for host in cluster_config['hosts']:
+                    if isinstance(host, str):
+                        host_node = host
+                    else:
+                        host_node = host['ip']
+                    if host_node not in unsuccessful_workers:
+                        successful_hosts.append(host)
+                cluster_config['hosts'] = successful_hosts
+                with open(history_yaml_file, 'w', encoding='utf-8') as f:
+                    logger.debug(f'Writing history to {history_yaml_file}')
+                    yaml.dump(cluster_config, f)
+
+            action = 'cleanup' if cleanup else 'deployment'
+            logger.info(
+                f'{colorama.Fore.CYAN}Completed {action} for cluster: {cluster_name}{colorama.Style.RESET_ALL}'
+            )
+            successful_clusters.append(cluster_name)
+        except Exception as e:  # pylint: disable=broad-except
+            reason = str(e)
+            failed_clusters.append((cluster_name, reason))
+            logger.debug(
+                f'Error deploying SSH Node Pool `{cluster_name}`: {reason}')
 
     if failed_clusters:
-        action = 'clean' if
+        action = 'clean' if cleanup else 'deploy'
         msg = f'{GREEN}Successfully {action}ed {len(successful_clusters)} cluster(s) ({", ".join(successful_clusters)}). {NC}'
         msg += f'{RED}Failed to {action} {len(failed_clusters)} cluster(s): {NC}'
         for cluster_name, reason in failed_clusters:
@@ -658,7 +568,8 @@ def main():
         raise RuntimeError(msg)
 
 
-def deploy_cluster(
+def deploy_cluster(cluster_name,
+                   head_node,
                    worker_nodes,
                    ssh_user,
                    ssh_key,
@@ -691,15 +602,15 @@ def deploy_cluster(head_node,
     k3s_token = 'mytoken'  # Any string can be used as the token
 
     # Pre-flight checks
-
-    result = run_remote(
-
-
-
-
-
-
-
+    logger.info(f'Checking SSH connection to head node ({head_node})...')
+    result = run_remote(head_node,
+                        f'echo \'SSH connection successful ({head_node})\'',
+                        ssh_user,
+                        ssh_key,
+                        use_ssh_config=head_use_ssh_config)
+    if result.startswith('SSH connection successful'):
+        success_message(f'SSH connection established to head node {head_node}.')
+
     if not cleanup and result is None:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(
@@ -720,9 +631,9 @@ def deploy_cluster(head_node,
             history_worker_nodes, history_workers_info,
             history_use_ssh_config):
         if worker_hosts is not None and history_info not in worker_hosts:
-
-            f'
-
+            logger.debug(
+                f'Worker node {history_node} not found in YAML config. '
+                'Removing from history...')
             worker_nodes_to_cleanup.append(
                 dict(
                     node=history_node,
@@ -758,8 +669,6 @@ def deploy_cluster(head_node,
                     use_ssh_config=use_ssh_config,
                 ))
 
-    print(f'{YELLOW}Starting cleanup...{NC}')
-
     # Clean up head node
     cleanup_server_node(head_node,
                         ssh_user,
@@ -767,23 +676,20 @@ def deploy_cluster(head_node,
                        askpass_block,
                        use_ssh_config=head_use_ssh_config)
     # Clean up worker nodes
+    force_update_status(f'Cleaning up worker nodes [{cluster_name}]')
    with cf.ThreadPoolExecutor() as executor:
         executor.map(lambda kwargs: cleanup_agent_node(**kwargs),
                      worker_nodes_to_cleanup)
 
     with cf.ThreadPoolExecutor() as executor:
-
-
-            print('Cleaning up worker nodes:', cmd)
-            run_command(cmd, shell=True)
-
-        executor.map(run_cleanup_cmd, remove_worker_cmds)
+        executor.map(lambda cmd: run_command(cmd, shell=True),
+                     remove_worker_cmds)
 
     if cleanup:
 
         # Remove the context from local kubeconfig if it exists
         if os.path.isfile(kubeconfig_path):
-
+            logger.debug(
                 f'Removing context {context_name!r} from local kubeconfig...')
             run_command(['kubectl', 'config', 'delete-context', context_name],
                         shell=False)
@@ -806,7 +712,7 @@ def deploy_cluster(head_node,
            run_command(['kubectl', 'config', 'unset', 'current-context'],
                        shell=False)
 
-
+            logger.debug(
                 f'Context {context_name!r} removed from local kubeconfig.')
 
         for file in [history_yaml_file, cert_file_path, key_file_path]:
@@ -815,16 +721,12 @@ def deploy_cluster(head_node,
 
         # Clean up SSH tunnel after clean up kubeconfig, because the kubectl
         # will restart the ssh tunnel if it's not running.
-        cleanup_kubectl_ssh_tunnel(context_name)
-
-        print(f'{GREEN}Cleanup completed successfully.{NC}')
-
-        # Print completion marker for current cluster
-        print(f'{GREEN}SKYPILOT_CLUSTER_COMPLETED: {NC}')
+        cleanup_kubectl_ssh_tunnel(cluster_name, context_name)
 
+        success_message(f'Node Pool `{cluster_name}` cleaned up successfully.')
         return []
 
-
+    logger.debug('Checking TCP Forwarding Options...')
     cmd = (
         'if [ "$(sudo sshd -T | grep allowtcpforwarding)" = "allowtcpforwarding yes" ]; then '
         f'echo "TCP Forwarding already enabled on head node ({head_node})."; '
@@ -833,15 +735,12 @@ def deploy_cluster(head_node,
         '/etc/ssh/sshd_config && sudo systemctl restart sshd && '
         f'echo "Successfully enabled TCP Forwarding on head node ({head_node})."; '
         'fi')
-    result = run_remote(
-
-
-
-
-
-                        # For SkySSHUpLineProcessor
-                        print_output=True,
-                        use_shell=True)
+    result = run_remote(head_node,
+                        shlex.quote(cmd),
+                        ssh_user,
+                        ssh_key,
+                        use_ssh_config=head_use_ssh_config,
+                        use_shell=True)
     if result is None:
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(
@@ -851,7 +750,7 @@ def deploy_cluster(head_node,
     # Get effective IP for master node if using SSH config - needed for workers to connect
     if head_use_ssh_config:
         effective_master_ip = get_effective_host_ip(head_node)
-
+        logger.info(
             f'{GREEN}Resolved head node {head_node} to {effective_master_ip} from SSH config{NC}'
         )
     else:
@@ -860,7 +759,8 @@ def deploy_cluster(head_node,
     # Step 1: Install k3s on the head node
     # Check if head node has a GPU
     install_gpu = False
-
+    force_update_status(
+        f'Deploying SkyPilot runtime on head node ({head_node}).')
     cmd = f"""
         {askpass_block}
         curl -sfL https://get.k3s.io | K3S_TOKEN={k3s_token} K3S_NODE_NAME={head_node} sudo -E -A sh - &&
@@ -889,7 +789,8 @@ def deploy_cluster(head_node,
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(
                 f'Failed to deploy K3s on head node ({head_node}).')
-    success_message(
+    success_message(
+        f'SkyPilot runtime successfully deployed on head node ({head_node}).')
 
     # Check if head node has a GPU
     install_gpu = False
@@ -897,7 +798,7 @@ def deploy_cluster(head_node,
                  ssh_user,
                  ssh_key,
                  use_ssh_config=head_use_ssh_config):
-
+        logger.info(f'{YELLOW}GPU detected on head node ({head_node}).{NC}')
         install_gpu = True
 
     # Fetch the head node's internal IP (this will be passed to worker nodes)
@@ -910,21 +811,20 @@ def deploy_cluster(head_node,
         with ux_utils.print_exception_no_traceback():
             raise RuntimeError(f'Failed to SSH to head node ({head_node}). '
                                f'Please check the SSH configuration.')
-
+    logger.debug(f'Master node internal IP: {master_addr}')
 
     # Step 2: Install k3s on worker nodes and join them to the master node
     def deploy_worker(args):
         (i, node, worker_hosts, history_workers_info, ssh_user, ssh_key,
         askpass_block, worker_use_ssh_config, master_addr, k3s_token) = args
-        progress_message(f'Deploying Kubernetes on worker node ({node})...')
 
         # If using YAML config with specific worker info
         if worker_hosts and i < len(worker_hosts):
             if history_workers_info is not None and worker_hosts[
                     i] in history_workers_info:
-
-                    f'{
-                    f'Skipping...{
+                logger.info(
+                    f'{colorama.Style.DIM}✔ SkyPilot runtime already deployed on worker node {node}. '
+                    f'Skipping...{colorama.Style.RESET_ALL}')
                 return node, True, False
             worker_user = worker_hosts[i]['user']
             worker_key = worker_hosts[i]['identity_file']
@@ -948,6 +848,8 @@ def deploy_cluster(head_node,
     unsuccessful_workers = []
 
     # Deploy workers in parallel using thread pool
+    force_update_status(
+        f'Deploying SkyPilot runtime on worker nodes [{cluster_name}]')
     with cf.ThreadPoolExecutor() as executor:
         futures = []
         for i, node in enumerate(worker_nodes):
@@ -964,7 +866,7 @@ def deploy_cluster(head_node,
                unsuccessful_workers.append(node)
 
     # Step 3: Configure local kubectl to connect to the cluster
-
+    force_update_status(f'Setting up SkyPilot configuration [{cluster_name}]')
 
     # Create temporary directory for kubeconfig operations
     with tempfile.TemporaryDirectory() as temp_dir:
@@ -1054,8 +956,8 @@ def deploy_cluster(head_node,
            has_end = '-----END CERTIFICATE-----' in cert_pem
 
             if not has_begin or not has_end:
-
-
+                logger.debug(
+                    'Warning: Certificate data missing PEM markers, attempting to fix...'
                )
                # Add PEM markers if missing
                if not has_begin:
@@ -1070,8 +972,8 @@ def deploy_cluster(head_node,
 
            # Verify the file was written correctly
            if os.path.getsize(cert_file_path) > 0:
-
-                    f'
+                logger.debug(
+                    f'Successfully saved certificate data ({len(cert_pem)} bytes)'
                )
 
            # Quick validation of PEM format
@@ -1086,13 +988,14 @@ def deploy_cluster(head_node,
                if not first_line.startswith(
                        '-----BEGIN') or not last_line.startswith(
                            '-----END'):
-
-
+                    logger.debug(
+                        'Warning: Certificate may not be in proper PEM format'
                    )
            else:
-
+                logger.error(
+                    f'{RED}Error: Certificate file is empty{NC}')
        except Exception as e:  # pylint: disable=broad-except
-
+            logger.error(
                f'{RED}Error processing certificate data: {e}{NC}')
 
        if client_key_data:
@@ -1134,8 +1037,8 @@ def deploy_cluster(head_node,
            ])
 
            if not has_begin or not has_end:
-
-
+                logger.debug(
+                    'Warning: Key data missing PEM markers, attempting to fix...'
                )
                # Add PEM markers if missing
                if not has_begin:
@@ -1154,8 +1057,8 @@ def deploy_cluster(head_node,
 
            # Verify the file was written correctly
            if os.path.getsize(key_file_path) > 0:
-
-                    f'
+                logger.debug(
+                    f'Successfully saved key data ({len(key_pem)} bytes)'
                )
 
            # Quick validation of PEM format
@@ -1170,22 +1073,25 @@ def deploy_cluster(head_node,
                if not first_line.startswith(
                        '-----BEGIN') or not last_line.startswith(
                            '-----END'):
-
-
+                    logger.debug(
+                        'Warning: Key may not be in proper PEM format'
                    )
            else:
-
+                logger.error(f'{RED}Error: Key file is empty{NC}')
        except Exception as e:  # pylint: disable=broad-except
-
+            logger.error(f'{RED}Error processing key data: {e}{NC}')
 
        # First check if context name exists and delete it if it does
        # TODO(romilb): Should we throw an error here instead?
        run_command(['kubectl', 'config', 'delete-context', context_name],
-                    shell=False
+                    shell=False,
+                    silent=True)
        run_command(['kubectl', 'config', 'delete-cluster', context_name],
-                    shell=False
+                    shell=False,
+                    silent=True)
        run_command(['kubectl', 'config', 'delete-user', context_name],
-                    shell=False
+                    shell=False,
+                    silent=True)
 
        # Merge the configurations using kubectl
        merged_config = os.path.join(temp_dir, 'merged_config')
@@ -1210,17 +1116,12 @@ def deploy_cluster(head_node,
                            context_name,
                            use_ssh_config=head_use_ssh_config)
 
-
-
-    print(
-        f'Cluster deployment completed. Kubeconfig saved to {kubeconfig_path}')
-    print('You can now run \'kubectl get nodes\' to verify the setup.')
+    logger.debug(f'kubectl configured with new context \'{context_name}\'.')
+    success_message(f'SkyPilot runtime is up [{cluster_name}].')
 
     # Install GPU operator if a GPU was detected on any node
     if install_gpu:
-
-            f'{YELLOW}GPU detected in the cluster. Installing Nvidia GPU Operator...{NC}'
-        )
+        force_update_status(f'Configuring NVIDIA GPUs [{cluster_name}]')
        cmd = f"""
            {askpass_block}
            curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
@@ -1240,7 +1141,7 @@ def deploy_cluster(head_node,
            while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:' || ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu.product'; do
                echo 'Waiting for GPU operator...'
                sleep 5
-            done
+            done
            echo 'GPU operator installed successfully.'
        """
        result = run_remote(head_node,
@@ -1249,51 +1150,28 @@ def deploy_cluster(head_node,
                            ssh_key,
                            use_ssh_config=head_use_ssh_config)
        if result is None:
-
+            logger.error(f'{RED}Failed to install GPU Operator.{NC}')
        else:
            success_message('GPU Operator installed.')
    else:
-
-            f'{YELLOW}No GPUs detected. Skipping GPU Operator installation.{NC}'
-        )
-
-    # Configure SkyPilot
-    progress_message('Configuring SkyPilot...')
+        logger.debug('No GPUs detected. Skipping GPU Operator installation.')
 
    # The env var KUBECONFIG ensures sky check uses the right kubeconfig
    os.environ['KUBECONFIG'] = kubeconfig_path
-    run_command(['sky', 'check', '
+    run_command(['sky', 'check', 'ssh'], shell=False)
 
    success_message('SkyPilot configured successfully.')
 
-    # Display final success message
-    print(
-        f'{GREEN}==== 🎉 Kubernetes cluster deployment completed successfully 🎉 ====${NC}'
-    )
-    print(
-        'You can now interact with your Kubernetes cluster through SkyPilot: ')
-    print(' • List available GPUs: sky show-gpus --cloud kubernetes')
-    print(
-        ' • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes'
-    )
-    print(
-        ' • Connect to pod with VSCode: code --remote ssh-remote+devbox "/home"'
-    )
-    # Print completion marker for current cluster
-    print(f'{GREEN}SKYPILOT_CLUSTER_COMPLETED: {NC}')
-
    if unsuccessful_workers:
        quoted_unsuccessful_workers = [
            f'"{worker}"' for worker in unsuccessful_workers
        ]
 
-
+        logger.info(
            f'{WARNING_YELLOW}Failed to deploy Kubernetes on the following nodes: '
            f'{", ".join(quoted_unsuccessful_workers)}. Please check '
            f'the logs for more details.{NC}')
+    else:
+        success_message(f'Node Pool `{cluster_name}` deployed successfully.')
 
    return unsuccessful_workers
-
-
-if __name__ == '__main__':
-    main()