skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/ssh_node_pools/deploy/tunnel_utils.py
ADDED
@@ -0,0 +1,199 @@
+"""Utilities to setup SSH Tunnel"""
+import os
+import random
+import re
+import subprocess
+import sys
+from typing import Set
+
+import colorama
+
+from sky import sky_logging
+from sky.ssh_node_pools import constants
+from sky.ssh_node_pools.deploy import utils as deploy_utils
+
+logger = sky_logging.init_logger(__name__)
+
+# Get the directory of this script
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+
+
+def _get_used_localhost_ports() -> Set[int]:
+    """Get SSH port forwardings already in use on localhost"""
+    used_ports = set()
+
+    # Get ports from netstat (works on macOS and Linux)
+    try:
+        if sys.platform == 'darwin':
+            # macOS
+            result = subprocess.run(['netstat', '-an', '-p', 'tcp'],
+                                    capture_output=True,
+                                    text=True,
+                                    check=False)
+        else:
+            # Linux and other Unix-like systems
+            result = subprocess.run(['netstat', '-tln'],
+                                    capture_output=True,
+                                    text=True,
+                                    check=False)
+
+        if result.returncode == 0:
+            # Look for lines with 'localhost:<port>' or '127.0.0.1:<port>'
+            for line in result.stdout.splitlines():
+                if '127.0.0.1:' in line or 'localhost:' in line:
+                    match = re.search(r':(64\d\d)\s', line)
+                    if match:
+                        port = int(match.group(1))
+                        if 6400 <= port <= 6500:  # Only consider our range
+                            used_ports.add(port)
+    except (subprocess.SubprocessError, FileNotFoundError):
+        # If netstat fails, try another approach
+        pass
+
+    # Also check ports from existing kubeconfig entries
+    try:
+        result = subprocess.run([
+            'kubectl', 'config', 'view', '-o',
+            'jsonpath=\'{.clusters[*].cluster.server}\''
+        ],
+                                capture_output=True,
+                                text=True,
+                                check=False)
+
+        if result.returncode == 0:
+            # Look for localhost URLs with ports
+            for url in result.stdout.split():
+                if 'localhost:' in url or '127.0.0.1:' in url:
+                    match = re.search(r':(\d+)', url)
+                    if match:
+                        port = int(match.group(1))
+                        if 6400 <= port <= 6500:  # Only consider our range
+                            used_ports.add(port)
+    except subprocess.SubprocessError:
+        pass
+
+    return used_ports
+
+
+def get_available_port(start: int = 6443, end: int = 6499) -> int:
+    """Get an available port in the given range not used by other tunnels"""
+    used_ports = _get_used_localhost_ports()
+
+    # Try to use port 6443 first if available for the first cluster
+    if start == 6443 and start not in used_ports:
+        return start
+
+    # Otherwise find any available port in the range
+    available_ports = list(set(range(start, end + 1)) - used_ports)
+
+    if not available_ports:
+        # If all ports are used, pick a random one from our range
+        # (we'll terminate any existing connection in the setup)
+        return random.randint(start, end)
+
+    # Sort to get deterministic allocation
+    available_ports.sort()
+    return available_ports[0]
+
+
+def setup_kubectl_ssh_tunnel(head_node,
+                             ssh_user,
+                             ssh_key,
+                             context_name,
+                             use_ssh_config=False):
+    """Set up kubeconfig exec credential plugin for SSH tunnel"""
+    logger.info(f'{colorama.Fore.YELLOW}➜ Setting up SSH tunnel for '
+                f'Kubernetes API access...{colorama.Style.RESET_ALL}')
+
+    # Get an available port for this cluster
+    port = get_available_port()
+
+    # Paths to scripts
+    tunnel_script = os.path.join(SCRIPT_DIR, 'tunnel', 'ssh-tunnel.sh')
+
+    # Make sure scripts are executable
+    os.chmod(tunnel_script, 0o755)
+
+    # Certificate files
+    client_cert_file = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                    f'{context_name}-cert.pem')
+    client_key_file = os.path.join(constants.NODE_POOLS_INFO_DIR,
+                                   f'{context_name}-key.pem')
+
+    # Update kubeconfig to use localhost with the selected port
+    deploy_utils.run_command([
+        'kubectl', 'config', 'set-cluster', context_name,
+        f'--server=https://127.0.0.1:{port}', '--insecure-skip-tls-verify=true'
+    ])
+
+    # Build the exec args list based on auth method
+    exec_args = [
+        '--exec-command', tunnel_script, '--exec-api-version',
+        'client.authentication.k8s.io/v1beta1'
+    ]
+
+    # Set credential TTL to force frequent tunnel checks
+    ttl_seconds = 30
+
+    # Verify if we have extracted certificate data files
+    has_cert_files = os.path.isfile(client_cert_file) and os.path.isfile(
+        client_key_file)
+    if has_cert_files:
+        logger.info(f'{colorama.Fore.GREEN}Client certificate data extracted '
+                    'and will be used for authentication'
+                    f'{colorama.Style.RESET_ALL}')
+
+    if use_ssh_config:
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'set-credentials', context_name] + exec_args +
+            [
+                '--exec-arg=--context', f'--exec-arg={context_name}',
+                '--exec-arg=--port', f'--exec-arg={port}', '--exec-arg=--ttl',
+                f'--exec-arg={ttl_seconds}', '--exec-arg=--use-ssh-config',
+                '--exec-arg=--host', f'--exec-arg={head_node}'
+            ])
+    else:
+        deploy_utils.run_command(
+            ['kubectl', 'config', 'set-credentials', context_name] + exec_args +
+            [
+                '--exec-arg=--context', f'--exec-arg={context_name}',
+                '--exec-arg=--port', f'--exec-arg={port}', '--exec-arg=--ttl',
+                f'--exec-arg={ttl_seconds}', '--exec-arg=--host',
+                f'--exec-arg={head_node}', '--exec-arg=--user',
+                f'--exec-arg={ssh_user}', '--exec-arg=--ssh-key',
+                f'--exec-arg={ssh_key}'
+            ])
+
+    logger.info(f'{colorama.Fore.GREEN}✔ SSH tunnel configured through '
+                'kubectl credential plugin on port '
+                f'{port}{colorama.Style.RESET_ALL}')
+    logger.info('Your kubectl connection is now tunneled through SSH '
+                f'(port {port}).')
+    logger.info('This tunnel will be automatically established when needed.')
+    logger.info(f'Credential TTL set to {ttl_seconds}s to ensure tunnel '
+                'health is checked frequently.')
+    return port
+
+
+def cleanup_kubectl_ssh_tunnel(cluster_name, context_name):
+    """Clean up the SSH tunnel for a specific context"""
+    logger.info(f'{colorama.Fore.YELLOW}➜ Cleaning up SSH tunnel for '
+                f'`{cluster_name}`...{colorama.Style.RESET_ALL}')
+
+    # Path to cleanup script
+    cleanup_script = os.path.join(SCRIPT_DIR, 'tunnel', 'cleanup-tunnel.sh')
+
+    # Make sure script is executable
+    if os.path.exists(cleanup_script):
+        os.chmod(cleanup_script, 0o755)
+
+        # Run the cleanup script
+        subprocess.run([cleanup_script, context_name],
+                       stdout=subprocess.DEVNULL,
+                       stderr=subprocess.DEVNULL,
+                       check=False)
+        logger.info(f'{colorama.Fore.GREEN}✔ SSH tunnel for `{cluster_name}` '
+                    f'cleaned up.{colorama.Style.RESET_ALL}')
+    else:
+        logger.error(f'{colorama.Fore.YELLOW}Cleanup script not found: '
+                     f'{cleanup_script}{colorama.Style.RESET_ALL}')
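The new tunnel_utils module wires kubectl through an exec credential plugin: the kubeconfig cluster entry is pointed at https://127.0.0.1:<port>, and ssh-tunnel.sh is registered as the credential command, so kubectl itself re-establishes the tunnel when needed (the 30s TTL forces frequent health checks). A minimal usage sketch; the pool name, host, and key path below are hypothetical, not taken from this diff:

    from sky.ssh_node_pools.deploy import tunnel_utils

    # Hypothetical values for illustration.
    port = tunnel_utils.setup_kubectl_ssh_tunnel(
        head_node='10.0.0.1',        # head node of the SSH node pool
        ssh_user='ubuntu',
        ssh_key='/home/me/.ssh/id_rsa',
        context_name='ssh-my-pool')  # kubeconfig context to rewrite
    # kubectl now reaches the API server via the local tunnel port.
    tunnel_utils.cleanup_kubectl_ssh_tunnel('my-pool', 'ssh-my-pool')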
sky/ssh_node_pools/deploy/utils.py
ADDED
@@ -0,0 +1,173 @@
+"""Utilities for SSH Node Pools Deployment"""
+import os
+import subprocess
+from typing import List, Optional
+
+import colorama
+
+from sky import sky_logging
+from sky.utils import ux_utils
+
+logger = sky_logging.init_logger(__name__)
+
+
+def check_ssh_cluster_dependencies(
+        raise_error: bool = True) -> Optional[List[str]]:
+    """Checks if the dependencies for ssh cluster are installed.
+
+    Args:
+        raise_error: set to true when the dependency needs to be present.
+        set to false for `sky check`, where reason strings are compiled
+        at the end.
+
+    Returns: the reasons list if there are missing dependencies.
+    """
+    # error message
+    jq_message = ('`jq` is required to setup ssh cluster.')
+
+    # save
+    reasons = []
+    required_binaries = []
+
+    # Ensure jq is installed
+    try:
+        subprocess.run(['jq', '--version'],
+                       stdout=subprocess.DEVNULL,
+                       stderr=subprocess.DEVNULL,
+                       check=True)
+    except (FileNotFoundError, subprocess.CalledProcessError):
+        required_binaries.append('jq')
+        reasons.append(jq_message)
+
+    if required_binaries:
+        reasons.extend([
+            'On Debian/Ubuntu, install the missing dependenc(ies) with:',
+            f'  $ sudo apt install {" ".join(required_binaries)}',
+            'On MacOS, install with: ',
+            f'  $ brew install {" ".join(required_binaries)}',
+        ])
+        if raise_error:
+            with ux_utils.print_exception_no_traceback():
+                raise RuntimeError('\n'.join(reasons))
+        return reasons
+    return None
+
+
+def run_command(cmd, shell=False, silent=False):
+    """Run a local command and return the output."""
+    process = subprocess.run(cmd,
+                             shell=shell,
+                             capture_output=True,
+                             text=True,
+                             check=False)
+    if process.returncode != 0:
+        if not silent:
+            logger.error(f'{colorama.Fore.RED}Error executing command: {cmd}\n'
+                         f'{colorama.Style.RESET_ALL}STDOUT: {process.stdout}\n'
+                         f'STDERR: {process.stderr}')
+        return None
+    return process.stdout.strip()
+
+
+def get_effective_host_ip(hostname: str) -> str:
+    """Get the effective IP for a hostname from SSH config."""
+    try:
+        result = subprocess.run(['ssh', '-G', hostname],
+                                capture_output=True,
+                                text=True,
+                                check=False)
+        if result.returncode == 0:
+            for line in result.stdout.splitlines():
+                if line.startswith('hostname '):
+                    return line.split(' ', 1)[1].strip()
+    except Exception:  # pylint: disable=broad-except
+        pass
+    return hostname  # Return the original hostname if lookup fails
+
+
+def run_remote(node,
+               cmd,
+               user='',
+               ssh_key='',
+               connect_timeout=30,
+               use_ssh_config=False,
+               print_output=False,
+               use_shell=False,
+               silent=False):
+    """Run a command on a remote machine via SSH."""
+    ssh_cmd: List[str]
+    if use_ssh_config:
+        # Use SSH config for connection parameters
+        ssh_cmd = ['ssh', node, cmd]
+    else:
+        # Use explicit parameters
+        ssh_cmd = [
+            'ssh', '-o', 'StrictHostKeyChecking=no', '-o', 'IdentitiesOnly=yes',
+            '-o', f'ConnectTimeout={connect_timeout}', '-o',
+            'ServerAliveInterval=10', '-o', 'ServerAliveCountMax=3'
+        ]
+
+        if ssh_key:
+            if not os.path.isfile(ssh_key):
+                raise ValueError(f'SSH key not found: {ssh_key}')
+            ssh_cmd.extend(['-i', ssh_key])
+
+        ssh_cmd.append(f'{user}@{node}' if user else node)
+        ssh_cmd.append(cmd)
+
+    subprocess_cmd = ' '.join(ssh_cmd) if use_shell else ssh_cmd
+    process = subprocess.run(subprocess_cmd,
+                             capture_output=True,
+                             text=True,
+                             check=False,
+                             shell=use_shell)
+    if process.returncode != 0:
+        if not silent:
+            logger.error(f'{colorama.Fore.RED}Error executing command {cmd} on '
+                         f'{node}:{colorama.Style.RESET_ALL} {process.stderr}')
+        return None
+    if print_output:
+        logger.info(process.stdout)
+    return process.stdout.strip()
+
+
+def ensure_directory_exists(path):
+    """Ensure the directory for the specified file path exists."""
+    directory = os.path.dirname(path)
+    if directory and not os.path.exists(directory):
+        os.makedirs(directory, exist_ok=True)
+
+
+def check_gpu(node, user, ssh_key, use_ssh_config=False, is_head=False):
+    """Check if a node has a GPU."""
+    cmd = ('command -v nvidia-smi &> /dev/null && '
+           'nvidia-smi --query-gpu=gpu_name --format=csv,noheader')
+    result = run_remote(node,
+                        cmd,
+                        user,
+                        ssh_key,
+                        use_ssh_config=use_ssh_config,
+                        silent=True)
+    if result is not None:
+        # Check that all GPUs have the same type.
+        # Currently, SkyPilot does not support heterogeneous GPU node
+        # (i.e. more than one GPU type on the same node).
+        gpu_names = {
+            line.strip() for line in result.splitlines() if line.strip()
+        }
+        if not gpu_names:
+            # This can happen if nvidia-smi returns only whitespace.
+            # Set result to None to ensure this function returns False.
+            result = None
+        elif len(gpu_names) > 1:
+            # Sort for a deterministic error message.
+            sorted_gpu_names = sorted(list(gpu_names))
+            raise RuntimeError(
+                f'Node {node} has more than one GPU types '
+                f'({", ".join(sorted_gpu_names)}). '
+                'SkyPilot does not support a node with multiple GPU types.')
+        else:
+            logger.info(f'{colorama.Fore.YELLOW}➜ GPU {list(gpu_names)[0]} '
+                        f'detected on {"head" if is_head else "worker"} '
+                        f'node ({node}).{colorama.Style.RESET_ALL}')
+    return result is not None
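In deploy/utils.py, run_command and run_remote return the command's stripped stdout on success and None on failure; check_gpu builds on that convention, treating "nvidia-smi missing" and "no GPU detected" uniformly and raising only when a node mixes GPU types. A sketch of how the helpers compose; the host alias, user, and key path are hypothetical:

    import os
    from sky.ssh_node_pools.deploy import utils as deploy_utils

    # Resolve an SSH-config alias to its effective address (falls back
    # to the alias itself on failure).
    ip = deploy_utils.get_effective_host_ip('my-node')

    # run_remote validates that ssh_key is an existing file, so expand '~'.
    key = os.path.expanduser('~/.ssh/id_rsa')
    uptime = deploy_utils.run_remote(ip, 'uptime', user='ubuntu', ssh_key=key)
    if uptime is None:  # None means the SSH command failed.
        raise SystemExit('node unreachable')

    # Probe for GPUs; raises if the node has more than one GPU type.
    has_gpu = deploy_utils.check_gpu(ip, 'ubuntu', key, is_head=True)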
sky/ssh_node_pools/server.py
CHANGED
@@ -4,11 +4,11 @@ from typing import Any, Dict, List
 
 import fastapi
 
-from sky import core as sky_core
 from sky.server.requests import executor
 from sky.server.requests import payloads
+from sky.server.requests import request_names
 from sky.server.requests import requests as requests_lib
-from sky.ssh_node_pools import core
+from sky.ssh_node_pools import core
 from sky.utils import common_utils
 
 router = fastapi.APIRouter()
@@ -18,7 +18,7 @@ router = fastapi.APIRouter()
 def get_ssh_node_pools() -> Dict[str, Any]:
     """Get all SSH Node Pool configurations."""
     try:
-        return
+        return core.get_all_pools()
     except Exception as e:
         raise fastapi.HTTPException(
             status_code=500,
@@ -30,7 +30,7 @@ def get_ssh_node_pools() -> Dict[str, Any]:
 def update_ssh_node_pools(pools_config: Dict[str, Any]) -> Dict[str, str]:
     """Update SSH Node Pool configurations."""
     try:
-
+        core.update_pools(pools_config)
         return {'status': 'success'}
     except Exception as e:
         raise fastapi.HTTPException(status_code=400,
@@ -42,7 +42,7 @@ def update_ssh_node_pools(pools_config: Dict[str, Any]) -> Dict[str, str]:
 def delete_ssh_node_pool(pool_name: str) -> Dict[str, str]:
     """Delete a SSH Node Pool configuration."""
     try:
-        if
+        if core.delete_pool(pool_name):
             return {'status': 'success'}
         else:
             raise fastapi.HTTPException(
@@ -69,8 +69,7 @@ async def upload_ssh_key(request: fastapi.Request) -> Dict[str, str]:
                 detail='Missing key_name or key_file')
 
         key_content = await key_file.read()
-        key_path =
-            key_content.decode())
+        key_path = core.upload_ssh_key(key_name, key_content.decode())
 
         return {'status': 'success', 'key_path': key_path}
     except fastapi.HTTPException:
@@ -86,7 +85,7 @@ async def upload_ssh_key(request: fastapi.Request) -> Dict[str, str]:
 def list_ssh_keys() -> List[str]:
     """List available SSH keys."""
     try:
-        return
+        return core.list_ssh_keys()
     except Exception as e:
         exception_msg = common_utils.format_exception(e)
         raise fastapi.HTTPException(
@@ -99,11 +98,11 @@ async def deploy_ssh_node_pool(request: fastapi.Request,
     """Deploy SSH Node Pool using existing ssh_up functionality."""
     try:
         ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=False)
-        executor.
+        await executor.schedule_request_async(
             request_id=request.state.request_id,
-            request_name=
+            request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
             request_body=ssh_up_body,
-            func=
+            func=core.ssh_up,
             schedule_type=requests_lib.ScheduleType.LONG,
         )
 
@@ -124,11 +123,11 @@ async def deploy_ssh_node_pool_general(
         ssh_up_body: payloads.SSHUpBody) -> Dict[str, str]:
     """Deploys all SSH Node Pools."""
     try:
-        executor.
+        await executor.schedule_request_async(
             request_id=request.state.request_id,
-            request_name=
+            request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
             request_body=ssh_up_body,
-            func=
+            func=core.ssh_up,
             schedule_type=requests_lib.ScheduleType.LONG,
         )
 
@@ -150,11 +149,11 @@ async def down_ssh_node_pool(request: fastapi.Request,
     """Cleans up a SSH Node Pools."""
     try:
         ssh_up_body = payloads.SSHUpBody(infra=pool_name, cleanup=True)
-        executor.
+        await executor.schedule_request_async(
             request_id=request.state.request_id,
-            request_name=
+            request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
             request_body=ssh_up_body,
-            func=
+            func=core.ssh_up,  # Reuse ssh_up function with cleanup=True
             schedule_type=requests_lib.ScheduleType.LONG,
         )
 
@@ -178,11 +177,11 @@ async def down_ssh_node_pool_general(
     try:
         # Set cleanup=True for down operation
         ssh_up_body.cleanup = True
-        executor.
+        await executor.schedule_request_async(
             request_id=request.state.request_id,
-            request_name=
+            request_name=request_names.RequestName.SSH_NODE_POOLS_DOWN,
             request_body=ssh_up_body,
-            func=
+            func=core.ssh_up,  # Reuse ssh_up function with cleanup=True
             schedule_type=requests_lib.ScheduleType.LONG,
         )
 
@@ -205,7 +204,7 @@ def get_ssh_node_pool_status(pool_name: str) -> Dict[str, str]:
     try:
         # Call ssh_status to check the context
        context_name = f'ssh-{pool_name}'
-        is_ready, reason =
+        is_ready, reason = core.ssh_status(context_name)
 
         # Strip ANSI escape codes from the reason text
         def strip_ansi_codes(text):
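The server.py changes move every SSH Node Pool endpoint onto the async executor: each handler enqueues a named request and returns immediately, with core.ssh_up doing the work out of band (cleanup=True turns the same function into teardown). A sketch of the pattern as it appears in the hunks above; the handler name and return value are illustrative, not from this diff:

    import fastapi

    from sky.server.requests import executor
    from sky.server.requests import payloads
    from sky.server.requests import request_names
    from sky.server.requests import requests as requests_lib
    from sky.ssh_node_pools import core

    async def deploy(request: fastapi.Request, pool_name: str):
        body = payloads.SSHUpBody(infra=pool_name, cleanup=False)
        # Schedule a long-running request; clients poll the request ID.
        await executor.schedule_request_async(
            request_id=request.state.request_id,
            request_name=request_names.RequestName.SSH_NODE_POOLS_UP,
            request_body=body,
            func=core.ssh_up,
            schedule_type=requests_lib.ScheduleType.LONG,
        )
        return {'request_id': request.state.request_id}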
sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py}
RENAMED
@@ -5,13 +5,14 @@ import subprocess
 from typing import Any, Callable, Dict, List, Optional
 import uuid
 
+import colorama
 import yaml
 
+from sky import sky_logging
+from sky.ssh_node_pools import constants
 from sky.utils import ux_utils
 
-
-RED = '\033[0;31m'
-NC = '\033[0m'  # No color
+logger = sky_logging.init_logger(__name__)
 
 
 def check_host_in_ssh_config(hostname: str) -> bool:
@@ -92,7 +93,8 @@ def load_ssh_targets(file_path: str) -> Dict[str, Any]:
 def get_cluster_config(
         targets: Dict[str, Any],
         cluster_name: Optional[str] = None,
-        file_path: str = DEFAULT_SSH_NODE_POOLS_PATH
+        file_path: str = constants.DEFAULT_SSH_NODE_POOLS_PATH
+) -> Dict[str, Any]:
     """Get configuration for specific clusters or all clusters."""
     if not targets:
         with ux_utils.print_exception_no_traceback():
@@ -186,8 +188,9 @@ def prepare_hosts_info(
     else:
         # It's a dict with potential overrides
         if 'ip' not in host:
-
-
+            logger.warning(f'{colorama.Fore.RED}Warning: Host missing'
+                           f'\'ip\' field, skipping: {host}'
+                           f'{colorama.Style.RESET_ALL}')
             continue
 
         # Check if this is an SSH config hostname
|