skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,12 +1,11 @@
|
|
|
1
|
-
"""Utility functions for deploying Kubernetes clusters."""
|
|
1
|
+
"""Utility functions for deploying local Kubernetes kind clusters."""
|
|
2
2
|
import os
|
|
3
|
+
import random
|
|
3
4
|
import shlex
|
|
4
5
|
import subprocess
|
|
5
|
-
import sys
|
|
6
6
|
import tempfile
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
import colorama
|
|
7
|
+
import textwrap
|
|
8
|
+
from typing import Optional, Tuple
|
|
10
9
|
|
|
11
10
|
from sky import check as sky_check
|
|
12
11
|
from sky import sky_logging
|
|
@@ -24,279 +23,154 @@ logger = sky_logging.init_logger(__name__)
|
|
|
24
23
|
|
|
25
24
|
# Default path for Kubernetes configuration file
|
|
26
25
|
DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
|
|
26
|
+
DEFAULT_LOCAL_CLUSTER_NAME = 'skypilot'
|
|
27
|
+
LOCAL_CLUSTER_PORT_RANGE = 100
|
|
28
|
+
LOCAL_CLUSTER_INTERNAL_PORT_START = 30000
|
|
29
|
+
LOCAL_CLUSTER_INTERNAL_PORT_END = 30099
|
|
27
30
|
|
|
28
31
|
|
|
29
|
-
def
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
Args:
|
|
34
|
-
raise_error: set to true when the dependency needs to be present.
|
|
35
|
-
set to false for `sky check`, where reason strings are compiled
|
|
36
|
-
at the end.
|
|
37
|
-
|
|
38
|
-
Returns: the reasons list if there are missing dependencies.
|
|
39
|
-
"""
|
|
40
|
-
# error message
|
|
41
|
-
jq_message = ('`jq` is required to setup ssh cluster.')
|
|
42
|
-
|
|
43
|
-
# save
|
|
44
|
-
reasons = []
|
|
45
|
-
required_binaries = []
|
|
46
|
-
|
|
47
|
-
# Ensure jq is installed
|
|
48
|
-
try:
|
|
49
|
-
subprocess.run(['jq', '--version'],
|
|
50
|
-
stdout=subprocess.DEVNULL,
|
|
51
|
-
stderr=subprocess.DEVNULL,
|
|
52
|
-
check=True)
|
|
53
|
-
except (FileNotFoundError, subprocess.CalledProcessError):
|
|
54
|
-
required_binaries.append('jq')
|
|
55
|
-
reasons.append(jq_message)
|
|
56
|
-
|
|
57
|
-
if required_binaries:
|
|
58
|
-
reasons.extend([
|
|
59
|
-
'On Debian/Ubuntu, install the missing dependenc(ies) with:',
|
|
60
|
-
f' $ sudo apt install {" ".join(required_binaries)}',
|
|
61
|
-
'On MacOS, install with: ',
|
|
62
|
-
f' $ brew install {" ".join(required_binaries)}',
|
|
63
|
-
])
|
|
64
|
-
if raise_error:
|
|
65
|
-
with ux_utils.print_exception_no_traceback():
|
|
66
|
-
raise RuntimeError('\n'.join(reasons))
|
|
67
|
-
return reasons
|
|
68
|
-
return None
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def deploy_ssh_cluster(cleanup: bool = False,
|
|
72
|
-
infra: Optional[str] = None,
|
|
73
|
-
kubeconfig_path: Optional[str] = None):
|
|
74
|
-
"""Deploy a Kubernetes cluster on SSH targets.
|
|
32
|
+
def generate_kind_config(port_start: int,
|
|
33
|
+
num_nodes: int = 1,
|
|
34
|
+
gpus: bool = False) -> str:
|
|
35
|
+
"""Generate a kind cluster config with ports mapped from host to container
|
|
75
36
|
|
|
76
|
-
|
|
77
|
-
|
|
37
|
+
Port range will be [port_start, port_start + LOCAL_CLUSTER_PORT_RANGE)
|
|
38
|
+
Internally, this will map to ports 30000 - 30099
|
|
78
39
|
|
|
79
40
|
Args:
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
If None, the default ~/.kube/config will be used.
|
|
85
|
-
"""
|
|
86
|
-
check_ssh_cluster_dependencies()
|
|
87
|
-
|
|
88
|
-
# Prepare command to call deploy_remote_cluster.py script
|
|
89
|
-
# TODO(romilb): We should move this to a native python method/class call
|
|
90
|
-
# instead of invoking a script with subprocess.
|
|
91
|
-
path_to_package = os.path.dirname(__file__)
|
|
92
|
-
up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.py')
|
|
93
|
-
cwd = os.path.dirname(os.path.abspath(up_script_path))
|
|
94
|
-
|
|
95
|
-
deploy_command = [sys.executable, up_script_path]
|
|
41
|
+
path: Path to generate the config file at
|
|
42
|
+
port_start: Port range start for mappings
|
|
43
|
+
num_nodes: Number of nodes in the cluster
|
|
44
|
+
gpus: If true, initialize kind cluster with GPU support
|
|
96
45
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
46
|
+
Returns:
|
|
47
|
+
The kind cluster config
|
|
48
|
+
"""
|
|
49
|
+
internal_start = LOCAL_CLUSTER_INTERNAL_PORT_START
|
|
50
|
+
internal_end = LOCAL_CLUSTER_INTERNAL_PORT_END
|
|
51
|
+
|
|
52
|
+
config = textwrap.dedent(f"""
|
|
53
|
+
apiVersion: kind.x-k8s.io/v1alpha4
|
|
54
|
+
kind: Cluster
|
|
55
|
+
kubeadmConfigPatches:
|
|
56
|
+
- |
|
|
57
|
+
kind: ClusterConfiguration
|
|
58
|
+
apiServer:
|
|
59
|
+
extraArgs:
|
|
60
|
+
"service-node-port-range": {internal_start}-{internal_end}
|
|
61
|
+
nodes:
|
|
62
|
+
- role: control-plane
|
|
63
|
+
kubeadmConfigPatches:
|
|
64
|
+
- |
|
|
65
|
+
kind: InitConfiguration
|
|
66
|
+
nodeRegistration:
|
|
67
|
+
kubeletExtraArgs:
|
|
68
|
+
node-labels: "ingress-ready=true"
|
|
69
|
+
""")
|
|
70
|
+
if gpus:
|
|
71
|
+
config += textwrap.indent(
|
|
72
|
+
textwrap.dedent("""
|
|
73
|
+
extraMounts:
|
|
74
|
+
- hostPath: /dev/null
|
|
75
|
+
containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
|
|
76
|
+
config += textwrap.indent(textwrap.dedent("""
|
|
77
|
+
extraPortMappings:"""), ' ' * 2)
|
|
78
|
+
for offset in range(LOCAL_CLUSTER_PORT_RANGE):
|
|
79
|
+
config += textwrap.indent(
|
|
80
|
+
textwrap.dedent(f"""
|
|
81
|
+
- containerPort: {internal_start + offset}
|
|
82
|
+
hostPort: {port_start + offset}
|
|
83
|
+
listenAddress: "0.0.0.0"
|
|
84
|
+
protocol: tcp
|
|
85
|
+
"""), ' ' * 2)
|
|
86
|
+
if num_nodes > 1:
|
|
87
|
+
config += '- role: worker\n' * (num_nodes - 1)
|
|
88
|
+
return config
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _get_port_range(name: str, port_start: Optional[int]) -> Tuple[int, int]:
|
|
92
|
+
is_default = name == DEFAULT_LOCAL_CLUSTER_NAME
|
|
93
|
+
if port_start is None:
|
|
94
|
+
if is_default:
|
|
95
|
+
port_start = LOCAL_CLUSTER_INTERNAL_PORT_START
|
|
96
|
+
else:
|
|
97
|
+
port_start = random.randint(301, 399) * 100
|
|
98
|
+
port_end = port_start + LOCAL_CLUSTER_PORT_RANGE - 1
|
|
99
|
+
|
|
100
|
+
port_range = f'Current port range: {port_start}-{port_end}'
|
|
101
|
+
if is_default and port_start != LOCAL_CLUSTER_INTERNAL_PORT_START:
|
|
102
|
+
raise ValueError('Default local cluster `skypilot` should have '
|
|
103
|
+
f'port range from 30000 to 30099. {port_range}.')
|
|
104
|
+
if not is_default and port_start == LOCAL_CLUSTER_INTERNAL_PORT_START:
|
|
105
|
+
raise ValueError('Port range 30000 to 30099 is reserved for '
|
|
106
|
+
f'default local cluster `skypilot`. {port_range}.')
|
|
107
|
+
if port_start % 100 != 0:
|
|
108
|
+
raise ValueError('Local cluster port start must be a multiple of 100. '
|
|
109
|
+
f'{port_range}.')
|
|
110
|
+
|
|
111
|
+
return port_start, port_end
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def deploy_local_cluster(name: Optional[str], port_start: Optional[int],
|
|
115
|
+
gpus: bool):
|
|
116
|
+
name = name or DEFAULT_LOCAL_CLUSTER_NAME
|
|
117
|
+
port_start, port_end = _get_port_range(name, port_start)
|
|
118
|
+
context_name = f'kind-{name}'
|
|
119
|
+
cluster_created = False
|
|
133
120
|
|
|
134
|
-
if
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
with ux_utils.print_exception_no_traceback():
|
|
138
|
-
log_hint = ux_utils.log_path_hint(log_path, is_local=False)
|
|
139
|
-
raise RuntimeError('Failed to deploy SkyPilot on some Node Pools. '
|
|
140
|
-
f'{log_hint}'
|
|
141
|
-
f'\nError: {stderr}')
|
|
121
|
+
# Check if GPUs are available on the host
|
|
122
|
+
local_gpus_available = backend_utils.check_local_gpus()
|
|
123
|
+
gpus = gpus and local_gpus_available
|
|
142
124
|
|
|
143
|
-
if
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
if cleanup:
|
|
148
|
-
logger.info(
|
|
149
|
-
ux_utils.finishing_message(
|
|
150
|
-
'🎉 SSH Node Pools cleaned up successfully.',
|
|
151
|
-
log_path=log_path,
|
|
152
|
-
is_local=True))
|
|
153
|
-
else:
|
|
125
|
+
# Check if ~/.kube/config exists:
|
|
126
|
+
if os.path.exists(os.path.expanduser('~/.kube/config')):
|
|
127
|
+
curr_context = kubernetes_utils.get_current_kube_config_context_name()
|
|
128
|
+
if curr_context is not None and curr_context != context_name:
|
|
154
129
|
logger.info(
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
with tempfile.NamedTemporaryFile(mode='w') as ip_file, \
|
|
181
|
-
tempfile.NamedTemporaryFile(mode='w') as key_file:
|
|
182
|
-
|
|
183
|
-
# Write IPs and SSH key to temporary files
|
|
184
|
-
ip_file.write('\n'.join(ip_list))
|
|
185
|
-
ip_file.flush()
|
|
186
|
-
|
|
187
|
-
key_file.write(ssh_key)
|
|
188
|
-
key_file.flush()
|
|
189
|
-
os.chmod(key_file.name, 0o600)
|
|
190
|
-
|
|
191
|
-
# Use the legacy mode command line arguments for backward compatibility
|
|
192
|
-
deploy_command = [
|
|
193
|
-
sys.executable, up_script_path, '--ips-file', ip_file.name,
|
|
194
|
-
'--user', ssh_user, '--ssh-key', key_file.name
|
|
195
|
-
]
|
|
196
|
-
|
|
197
|
-
if context_name is not None:
|
|
198
|
-
deploy_command.extend(['--context-name', context_name])
|
|
199
|
-
if password is not None:
|
|
200
|
-
deploy_command.extend(['--password', password])
|
|
201
|
-
if cleanup:
|
|
202
|
-
deploy_command.append('--cleanup')
|
|
130
|
+
f'Current context in kube config: {curr_context}'
|
|
131
|
+
f'\nWill automatically switch to {context_name} after the '
|
|
132
|
+
'local cluster is created.')
|
|
133
|
+
message_str = 'Creating local cluster {}{}...'
|
|
134
|
+
message_str = message_str.format(
|
|
135
|
+
name,
|
|
136
|
+
' with GPU support (this may take up to 15 minutes)' if gpus else '')
|
|
137
|
+
|
|
138
|
+
with tempfile.NamedTemporaryFile(mode='w+', suffix='.yaml',
|
|
139
|
+
delete=True) as f:
|
|
140
|
+
# Choose random port range to use on the host machine.
|
|
141
|
+
# Port range is port_start - port_start + 99 (exactly 100 ports).
|
|
142
|
+
logger.debug(f'Using host port range {port_start}-{port_end}')
|
|
143
|
+
f.write(generate_kind_config(port_start, gpus=gpus))
|
|
144
|
+
f.flush()
|
|
145
|
+
|
|
146
|
+
path_to_package = os.path.dirname(__file__)
|
|
147
|
+
up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
|
|
148
|
+
|
|
149
|
+
# Get directory of script and run it from there
|
|
150
|
+
cwd = os.path.dirname(os.path.abspath(up_script_path))
|
|
151
|
+
run_command = f'{up_script_path} {name} {f.name}'
|
|
152
|
+
if gpus:
|
|
153
|
+
run_command += ' --gpus'
|
|
154
|
+
run_command = shlex.split(run_command)
|
|
203
155
|
|
|
204
156
|
# Setup logging paths
|
|
205
157
|
run_timestamp = sky_logging.get_run_timestamp()
|
|
206
158
|
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
|
207
159
|
'local_up.log')
|
|
208
|
-
|
|
209
|
-
if cleanup:
|
|
210
|
-
msg_str = 'Cleaning up remote cluster...'
|
|
211
|
-
else:
|
|
212
|
-
msg_str = 'Deploying remote cluster...'
|
|
213
|
-
|
|
214
|
-
# Create environment with PYTHONUNBUFFERED=1 to ensure unbuffered output
|
|
215
|
-
env = os.environ.copy()
|
|
216
|
-
env['PYTHONUNBUFFERED'] = '1'
|
|
160
|
+
logger.info(message_str)
|
|
217
161
|
|
|
218
162
|
with rich_utils.safe_status(
|
|
219
|
-
ux_utils.spinner_message(
|
|
163
|
+
ux_utils.spinner_message(message_str,
|
|
220
164
|
log_path=log_path,
|
|
221
165
|
is_local=True)):
|
|
222
166
|
returncode, _, stderr = log_lib.run_with_log(
|
|
223
|
-
cmd=
|
|
167
|
+
cmd=run_command,
|
|
224
168
|
log_path=log_path,
|
|
225
169
|
require_outputs=True,
|
|
226
170
|
stream_logs=False,
|
|
227
|
-
line_processor=log_utils.
|
|
171
|
+
line_processor=log_utils.SkyLocalUpLineProcessor(
|
|
228
172
|
log_path=log_path, is_local=True),
|
|
229
|
-
cwd=cwd
|
|
230
|
-
env=env)
|
|
231
|
-
if returncode == 0:
|
|
232
|
-
success = True
|
|
233
|
-
else:
|
|
234
|
-
with ux_utils.print_exception_no_traceback():
|
|
235
|
-
log_hint = ux_utils.log_path_hint(log_path, is_local=True)
|
|
236
|
-
raise RuntimeError('Failed to deploy remote cluster. '
|
|
237
|
-
f'Full log: {log_hint}'
|
|
238
|
-
f'\nError: {stderr}')
|
|
239
|
-
|
|
240
|
-
if success:
|
|
241
|
-
if cleanup:
|
|
242
|
-
logger.info(
|
|
243
|
-
ux_utils.finishing_message(
|
|
244
|
-
'🎉 Remote cluster cleaned up successfully.',
|
|
245
|
-
log_path=log_path,
|
|
246
|
-
is_local=True))
|
|
247
|
-
else:
|
|
248
|
-
logger.info(
|
|
249
|
-
ux_utils.finishing_message(
|
|
250
|
-
'🎉 Remote cluster deployed successfully.',
|
|
251
|
-
log_path=log_path,
|
|
252
|
-
is_local=True))
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
def deploy_local_cluster(gpus: bool):
|
|
256
|
-
cluster_created = False
|
|
257
|
-
|
|
258
|
-
# Check if GPUs are available on the host
|
|
259
|
-
local_gpus_available = backend_utils.check_local_gpus()
|
|
260
|
-
gpus = gpus and local_gpus_available
|
|
261
|
-
|
|
262
|
-
# Check if ~/.kube/config exists:
|
|
263
|
-
if os.path.exists(os.path.expanduser('~/.kube/config')):
|
|
264
|
-
curr_context = kubernetes_utils.get_current_kube_config_context_name()
|
|
265
|
-
skypilot_context = 'kind-skypilot'
|
|
266
|
-
if curr_context is not None and curr_context != skypilot_context:
|
|
267
|
-
logger.info(
|
|
268
|
-
f'Current context in kube config: {curr_context}'
|
|
269
|
-
'\nWill automatically switch to kind-skypilot after the local '
|
|
270
|
-
'cluster is created.')
|
|
271
|
-
message_str = 'Creating local cluster{}...'
|
|
272
|
-
message_str = message_str.format((' with GPU support (this may take up '
|
|
273
|
-
'to 15 minutes)') if gpus else '')
|
|
274
|
-
path_to_package = os.path.dirname(__file__)
|
|
275
|
-
up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
|
|
276
|
-
|
|
277
|
-
# Get directory of script and run it from there
|
|
278
|
-
cwd = os.path.dirname(os.path.abspath(up_script_path))
|
|
279
|
-
run_command = up_script_path + ' --gpus' if gpus else up_script_path
|
|
280
|
-
run_command = shlex.split(run_command)
|
|
281
|
-
|
|
282
|
-
# Setup logging paths
|
|
283
|
-
run_timestamp = sky_logging.get_run_timestamp()
|
|
284
|
-
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
|
285
|
-
'local_up.log')
|
|
286
|
-
logger.info(message_str)
|
|
287
|
-
|
|
288
|
-
with rich_utils.safe_status(
|
|
289
|
-
ux_utils.spinner_message(message_str,
|
|
290
|
-
log_path=log_path,
|
|
291
|
-
is_local=True)):
|
|
292
|
-
returncode, _, stderr = log_lib.run_with_log(
|
|
293
|
-
cmd=run_command,
|
|
294
|
-
log_path=log_path,
|
|
295
|
-
require_outputs=True,
|
|
296
|
-
stream_logs=False,
|
|
297
|
-
line_processor=log_utils.SkyLocalUpLineProcessor(log_path=log_path,
|
|
298
|
-
is_local=True),
|
|
299
|
-
cwd=cwd)
|
|
173
|
+
cwd=cwd)
|
|
300
174
|
|
|
301
175
|
# Kind always writes to stderr even if it succeeds.
|
|
302
176
|
# If the failure happens after the cluster is created, we need
|
|
@@ -309,11 +183,11 @@ def deploy_local_cluster(gpus: bool):
|
|
|
309
183
|
elif returncode == 100:
|
|
310
184
|
logger.info(
|
|
311
185
|
ux_utils.finishing_message(
|
|
312
|
-
'Local cluster already exists.\n',
|
|
186
|
+
f'Local cluster {name} already exists.\n',
|
|
313
187
|
log_path=log_path,
|
|
314
188
|
is_local=True,
|
|
315
189
|
follow_up_message=
|
|
316
|
-
'If you want to delete it instead, run: sky local down'))
|
|
190
|
+
'If you want to delete it instead, run: `sky local down --name {name}`')) # pylint: disable=line-too-long
|
|
317
191
|
else:
|
|
318
192
|
with ux_utils.print_exception_no_traceback():
|
|
319
193
|
log_hint = ux_utils.log_path_hint(log_path, is_local=True)
|
|
@@ -339,7 +213,7 @@ def deploy_local_cluster(gpus: bool):
|
|
|
339
213
|
if gpus:
|
|
340
214
|
# Get GPU model by querying the node labels
|
|
341
215
|
label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
|
|
342
|
-
gpu_type_cmd = f'kubectl get node
|
|
216
|
+
gpu_type_cmd = f'kubectl get node {name}-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
|
|
343
217
|
try:
|
|
344
218
|
# Run the command and capture the output
|
|
345
219
|
gpu_count_output = subprocess.check_output(gpu_type_cmd,
|
|
@@ -375,8 +249,10 @@ def deploy_local_cluster(gpus: bool):
|
|
|
375
249
|
'This may cause issues with running tasks.')
|
|
376
250
|
logger.info(
|
|
377
251
|
ux_utils.finishing_message(
|
|
378
|
-
message=(
|
|
379
|
-
|
|
252
|
+
message=(
|
|
253
|
+
f'Local Kubernetes cluster {name} created successfully '
|
|
254
|
+
f'with {num_cpus} CPUs{gpu_message} on host port range '
|
|
255
|
+
f'{port_start}-{port_end}.'),
|
|
380
256
|
log_path=log_path,
|
|
381
257
|
is_local=True,
|
|
382
258
|
follow_up_message=(
|
|
@@ -384,3 +260,54 @@ def deploy_local_cluster(gpus: bool):
|
|
|
384
260
|
'Hint: To change the number of CPUs, change your docker '
|
|
385
261
|
'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.' # pylint: disable=line-too-long
|
|
386
262
|
f'{gpu_hint}')))
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def teardown_local_cluster(name: Optional[str] = None):
    """Remove a local cluster by running the bundled ``delete_cluster.sh``.

    The script's output is logged to
    ``<SKY_LOGS_DIRECTORY>/<run_timestamp>/local_down.log``. When the script
    succeeds, a quiet ``sky check`` of the Kubernetes compute capability is
    run afterwards and a completion message is logged.

    Args:
        name: Name of the local cluster to remove. Falls back to
            ``DEFAULT_LOCAL_CLUSTER_NAME`` when ``None`` or empty.

    Raises:
        RuntimeError: If the script exits with a return code other than 0
            (removed) or 100 (logged as "does not exist").
    """
    name = name or DEFAULT_LOCAL_CLUSTER_NAME
    cluster_removed = False

    path_to_package = os.path.dirname(__file__)
    down_script_path = os.path.join(path_to_package, 'delete_cluster.sh')

    # Run the script from the directory it lives in.
    cwd = os.path.dirname(os.path.abspath(down_script_path))
    # Pass an argv list instead of formatting a string and re-splitting it:
    # a script path or cluster name containing whitespace or quotes would
    # otherwise be split into the wrong arguments (or fail to parse).
    run_command = [down_script_path, name]

    # Setup logging paths
    run_timestamp = sky_logging.get_run_timestamp()
    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
                            'local_down.log')

    with rich_utils.safe_status(
            ux_utils.spinner_message(f'Removing local cluster {name}',
                                     log_path=log_path,
                                     is_local=True)):

        returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
                                                          log_path=log_path,
                                                          require_outputs=True,
                                                          stream_logs=False,
                                                          cwd=cwd)
        # Drop this informational line so it does not show up in the error
        # text reported below.
        stderr = stderr.replace('No kind clusters found.\n', '')

        if returncode == 0:
            cluster_removed = True
        elif returncode == 100:
            # Return code 100 is reported as "cluster does not exist"; this
            # is logged rather than raised.
            logger.info(
                ux_utils.error_message(f'Local cluster {name} does not exist.'))
        else:
            with ux_utils.print_exception_no_traceback():
                raise RuntimeError(f'Failed to down local cluster {name}. '
                                   f'Stdout: {stdout}'
                                   f'\nError: {stderr}')
    if cluster_removed:
        # Run sky check
        with rich_utils.safe_status(
                ux_utils.spinner_message('Running sky check...')):
            sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
                                       clouds=['kubernetes'],
                                       quiet=True)
        logger.info(
            ux_utils.finishing_message(f'Local cluster {name} removed.',
                                       log_path=log_path,
                                       is_local=True))
|
|
@@ -48,8 +48,16 @@ fi
|
|
|
48
48
|
|
|
49
49
|
if [ -z "$context" ] || [ "$context_lower" = "none" ]; then
|
|
50
50
|
# If context is none, it means we are using incluster auth. In this case,
|
|
51
|
-
#
|
|
52
|
-
kubectl exec
|
|
51
|
+
# we pass --kubeconfig=/dev/null so that kubectl does not use a kubeconfig file.
|
|
52
|
+
kubectl_cmd_base="kubectl exec \"$resource_type/$resource_name\" -n \"$namespace\" --kubeconfig=/dev/null --"
|
|
53
53
|
else
|
|
54
|
-
kubectl exec
|
|
54
|
+
kubectl_cmd_base="kubectl exec \"$resource_type/$resource_name\" -n \"$namespace\" --context=\"$context\" --"
|
|
55
55
|
fi
|
|
56
|
+
|
|
57
|
+
# Execute command on remote pod, waiting for rsync to be available first.
|
|
58
|
+
# The waiting happens on the remote pod, not locally, which is more efficient
|
|
59
|
+
# and reliable than polling from the local machine.
|
|
60
|
+
# We wrap the command in a bash script that waits for rsync, then execs the original command.
|
|
61
|
+
# Timeout after MAX_WAIT_TIME_SECONDS seconds.
|
|
62
|
+
MAX_WAIT_TIME_SECONDS=300
|
|
63
|
+
# The loop polls every 0.5s, so the poll limit is twice the timeout in seconds.
# It must be computed with arithmetic expansion on the pod: a plain
# `max_count=300*2` stores the literal string "300*2", which `[ ... -ge ... ]`
# rejects as a non-integer, so the timeout would never trigger.
eval "${kubectl_cmd_base% --} -i -- bash -c 'count=0; max_count=\$(($MAX_WAIT_TIME_SECONDS*2)); until which rsync >/dev/null 2>&1; do if [ \$count -ge \$max_count ]; then echo \"Error when trying to rsync files to kubernetes cluster. Package installation may have failed.\" >&2; exit 1; fi; sleep 0.5; count=\$((count+1)); done; exec \"\$@\"' -- \"\$@\""
|