skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,10 +1,11 @@
|
|
|
1
1
|
"""Utility functions for deploying Kubernetes clusters."""
|
|
2
2
|
import os
|
|
3
|
+
import random
|
|
3
4
|
import shlex
|
|
4
5
|
import subprocess
|
|
5
|
-
import sys
|
|
6
6
|
import tempfile
|
|
7
|
-
|
|
7
|
+
import textwrap
|
|
8
|
+
from typing import List, Optional, Tuple
|
|
8
9
|
|
|
9
10
|
import colorama
|
|
10
11
|
|
|
@@ -19,11 +20,16 @@ from sky.utils import log_utils
|
|
|
19
20
|
from sky.utils import rich_utils
|
|
20
21
|
from sky.utils import subprocess_utils
|
|
21
22
|
from sky.utils import ux_utils
|
|
23
|
+
from sky.utils.kubernetes import deploy_ssh_node_pools
|
|
22
24
|
|
|
23
25
|
logger = sky_logging.init_logger(__name__)
|
|
24
26
|
|
|
25
27
|
# Default path for Kubernetes configuration file
|
|
26
28
|
DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
|
|
29
|
+
DEFAULT_LOCAL_CLUSTER_NAME = 'skypilot'
|
|
30
|
+
LOCAL_CLUSTER_PORT_RANGE = 100
|
|
31
|
+
LOCAL_CLUSTER_INTERNAL_PORT_START = 30000
|
|
32
|
+
LOCAL_CLUSTER_INTERNAL_PORT_END = 30099
|
|
27
33
|
|
|
28
34
|
|
|
29
35
|
def check_ssh_cluster_dependencies(
|
|
@@ -85,218 +91,178 @@ def deploy_ssh_cluster(cleanup: bool = False,
|
|
|
85
91
|
"""
|
|
86
92
|
check_ssh_cluster_dependencies()
|
|
87
93
|
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
# instead of invoking a script with subprocess.
|
|
91
|
-
path_to_package = os.path.dirname(__file__)
|
|
92
|
-
up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.py')
|
|
93
|
-
cwd = os.path.dirname(os.path.abspath(up_script_path))
|
|
94
|
+
action = 'Cleanup' if cleanup else 'Deployment'
|
|
95
|
+
msg_str = f'Initializing SSH Node Pools {action}...'
|
|
94
96
|
|
|
95
|
-
|
|
97
|
+
with rich_utils.safe_status(ux_utils.spinner_message(msg_str)):
|
|
98
|
+
try:
|
|
99
|
+
deploy_ssh_node_pools.deploy_clusters(
|
|
100
|
+
infra=infra, cleanup=cleanup, kubeconfig_path=kubeconfig_path)
|
|
101
|
+
except Exception as e: # pylint: disable=broad-except
|
|
102
|
+
logger.error(str(e))
|
|
103
|
+
with ux_utils.print_exception_no_traceback():
|
|
104
|
+
raise RuntimeError(
|
|
105
|
+
'Failed to deploy SkyPilot on some Node Pools.') from e
|
|
96
106
|
|
|
107
|
+
logger.info('')
|
|
97
108
|
if cleanup:
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
109
|
+
logger.info(
|
|
110
|
+
ux_utils.finishing_message(
|
|
111
|
+
'🎉 SSH Node Pools cleaned up successfully.'))
|
|
112
|
+
else:
|
|
113
|
+
logger.info(
|
|
114
|
+
ux_utils.finishing_message(
|
|
115
|
+
'🎉 SSH Node Pools set up successfully. ',
|
|
116
|
+
follow_up_message=(
|
|
117
|
+
f'Run `{colorama.Style.BRIGHT}'
|
|
118
|
+
f'sky check ssh'
|
|
119
|
+
f'{colorama.Style.RESET_ALL}` to verify access, '
|
|
120
|
+
f'`{colorama.Style.BRIGHT}sky launch --infra ssh'
|
|
121
|
+
f'{colorama.Style.RESET_ALL}` to launch a cluster.')))
|
|
102
122
|
|
|
103
|
-
# Use the default kubeconfig path if none is provided
|
|
104
|
-
kubeconfig_path = kubeconfig_path or DEFAULT_KUBECONFIG_PATH
|
|
105
|
-
deploy_command.extend(['--kubeconfig-path', kubeconfig_path])
|
|
106
123
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
124
|
+
def generate_kind_config(port_start: int,
|
|
125
|
+
num_nodes: int = 1,
|
|
126
|
+
gpus: bool = False) -> str:
|
|
127
|
+
"""Generate a kind cluster config with ports mapped from host to container
|
|
111
128
|
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
else:
|
|
115
|
-
msg_str = 'Initializing deployment to SSH Node Pools...'
|
|
129
|
+
Port range will be [port_start, port_start + LOCAL_CLUSTER_PORT_RANGE)
|
|
130
|
+
Internally, this will map to ports 30000 - 30099
|
|
116
131
|
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
132
|
+
Args:
|
|
133
|
+
path: Path to generate the config file at
|
|
134
|
+
port_start: Port range start for mappings
|
|
135
|
+
num_nodes: Number of nodes in the cluster
|
|
136
|
+
gpus: If true, initialize kind cluster with GPU support
|
|
120
137
|
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
138
|
+
Returns:
|
|
139
|
+
The kind cluster config
|
|
140
|
+
"""
|
|
141
|
+
internal_start = LOCAL_CLUSTER_INTERNAL_PORT_START
|
|
142
|
+
internal_end = LOCAL_CLUSTER_INTERNAL_PORT_END
|
|
143
|
+
|
|
144
|
+
config = textwrap.dedent(f"""
|
|
145
|
+
apiVersion: kind.x-k8s.io/v1alpha4
|
|
146
|
+
kind: Cluster
|
|
147
|
+
kubeadmConfigPatches:
|
|
148
|
+
- |
|
|
149
|
+
kind: ClusterConfiguration
|
|
150
|
+
apiServer:
|
|
151
|
+
extraArgs:
|
|
152
|
+
"service-node-port-range": {internal_start}-{internal_end}
|
|
153
|
+
nodes:
|
|
154
|
+
- role: control-plane
|
|
155
|
+
kubeadmConfigPatches:
|
|
156
|
+
- |
|
|
157
|
+
kind: InitConfiguration
|
|
158
|
+
nodeRegistration:
|
|
159
|
+
kubeletExtraArgs:
|
|
160
|
+
node-labels: "ingress-ready=true"
|
|
161
|
+
""")
|
|
162
|
+
if gpus:
|
|
163
|
+
config += textwrap.indent(
|
|
164
|
+
textwrap.dedent("""
|
|
165
|
+
extraMounts:
|
|
166
|
+
- hostPath: /dev/null
|
|
167
|
+
containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
|
|
168
|
+
config += textwrap.indent(textwrap.dedent("""
|
|
169
|
+
extraPortMappings:"""), ' ' * 2)
|
|
170
|
+
for offset in range(LOCAL_CLUSTER_PORT_RANGE):
|
|
171
|
+
config += textwrap.indent(
|
|
172
|
+
textwrap.dedent(f"""
|
|
173
|
+
- containerPort: {internal_start + offset}
|
|
174
|
+
hostPort: {port_start + offset}
|
|
175
|
+
listenAddress: "0.0.0.0"
|
|
176
|
+
protocol: tcp
|
|
177
|
+
"""), ' ' * 2)
|
|
178
|
+
if num_nodes > 1:
|
|
179
|
+
config += '- role: worker\n' * (num_nodes - 1)
|
|
180
|
+
return config
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _get_port_range(name: str, port_start: Optional[int]) -> Tuple[int, int]:
|
|
184
|
+
is_default = name == DEFAULT_LOCAL_CLUSTER_NAME
|
|
185
|
+
if port_start is None:
|
|
186
|
+
if is_default:
|
|
187
|
+
port_start = LOCAL_CLUSTER_INTERNAL_PORT_START
|
|
188
|
+
else:
|
|
189
|
+
port_start = random.randint(301, 399) * 100
|
|
190
|
+
port_end = port_start + LOCAL_CLUSTER_PORT_RANGE - 1
|
|
191
|
+
|
|
192
|
+
port_range = f'Current port range: {port_start}-{port_end}'
|
|
193
|
+
if is_default and port_start != LOCAL_CLUSTER_INTERNAL_PORT_START:
|
|
194
|
+
raise ValueError('Default local cluster `skypilot` should have '
|
|
195
|
+
f'port range from 30000 to 30099. {port_range}.')
|
|
196
|
+
if not is_default and port_start == LOCAL_CLUSTER_INTERNAL_PORT_START:
|
|
197
|
+
raise ValueError('Port range 30000 to 30099 is reserved for '
|
|
198
|
+
f'default local cluster `skypilot`. {port_range}.')
|
|
199
|
+
if port_start % 100 != 0:
|
|
200
|
+
raise ValueError('Local cluster port start must be a multiple of 100. '
|
|
201
|
+
f'{port_range}.')
|
|
202
|
+
|
|
203
|
+
return port_start, port_end
|
|
204
|
+
|
|
205
|
+
|
|
206
|
+
def deploy_local_cluster(name: Optional[str], port_start: Optional[int],
|
|
207
|
+
gpus: bool):
|
|
208
|
+
name = name or DEFAULT_LOCAL_CLUSTER_NAME
|
|
209
|
+
port_start, port_end = _get_port_range(name, port_start)
|
|
210
|
+
context_name = f'kind-{name}'
|
|
211
|
+
cluster_created = False
|
|
133
212
|
|
|
134
|
-
if
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
with ux_utils.print_exception_no_traceback():
|
|
138
|
-
log_hint = ux_utils.log_path_hint(log_path, is_local=False)
|
|
139
|
-
raise RuntimeError('Failed to deploy SkyPilot on some Node Pools. '
|
|
140
|
-
f'{log_hint}'
|
|
141
|
-
f'\nError: {stderr}')
|
|
213
|
+
# Check if GPUs are available on the host
|
|
214
|
+
local_gpus_available = backend_utils.check_local_gpus()
|
|
215
|
+
gpus = gpus and local_gpus_available
|
|
142
216
|
|
|
143
|
-
if
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
if cleanup:
|
|
148
|
-
logger.info(
|
|
149
|
-
ux_utils.finishing_message(
|
|
150
|
-
'🎉 SSH Node Pools cleaned up successfully.',
|
|
151
|
-
log_path=log_path,
|
|
152
|
-
is_local=True))
|
|
153
|
-
else:
|
|
217
|
+
# Check if ~/.kube/config exists:
|
|
218
|
+
if os.path.exists(os.path.expanduser('~/.kube/config')):
|
|
219
|
+
curr_context = kubernetes_utils.get_current_kube_config_context_name()
|
|
220
|
+
if curr_context is not None and curr_context != context_name:
|
|
154
221
|
logger.info(
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
with tempfile.NamedTemporaryFile(mode='w') as ip_file, \
|
|
181
|
-
tempfile.NamedTemporaryFile(mode='w') as key_file:
|
|
182
|
-
|
|
183
|
-
# Write IPs and SSH key to temporary files
|
|
184
|
-
ip_file.write('\n'.join(ip_list))
|
|
185
|
-
ip_file.flush()
|
|
186
|
-
|
|
187
|
-
key_file.write(ssh_key)
|
|
188
|
-
key_file.flush()
|
|
189
|
-
os.chmod(key_file.name, 0o600)
|
|
190
|
-
|
|
191
|
-
# Use the legacy mode command line arguments for backward compatibility
|
|
192
|
-
deploy_command = [
|
|
193
|
-
sys.executable, up_script_path, '--ips-file', ip_file.name,
|
|
194
|
-
'--user', ssh_user, '--ssh-key', key_file.name
|
|
195
|
-
]
|
|
196
|
-
|
|
197
|
-
if context_name is not None:
|
|
198
|
-
deploy_command.extend(['--context-name', context_name])
|
|
199
|
-
if password is not None:
|
|
200
|
-
deploy_command.extend(['--password', password])
|
|
201
|
-
if cleanup:
|
|
202
|
-
deploy_command.append('--cleanup')
|
|
222
|
+
f'Current context in kube config: {curr_context}'
|
|
223
|
+
f'\nWill automatically switch to {context_name} after the '
|
|
224
|
+
'local cluster is created.')
|
|
225
|
+
message_str = 'Creating local cluster {}{}...'
|
|
226
|
+
message_str = message_str.format(
|
|
227
|
+
name,
|
|
228
|
+
' with GPU support (this may take up to 15 minutes)' if gpus else '')
|
|
229
|
+
|
|
230
|
+
with tempfile.NamedTemporaryFile(mode='w+', suffix='.yaml',
|
|
231
|
+
delete=True) as f:
|
|
232
|
+
# Choose random port range to use on the host machine.
|
|
233
|
+
# Port range is port_start - port_start + 99 (exactly 100 ports).
|
|
234
|
+
logger.debug(f'Using host port range {port_start}-{port_end}')
|
|
235
|
+
f.write(generate_kind_config(port_start, gpus=gpus))
|
|
236
|
+
f.flush()
|
|
237
|
+
|
|
238
|
+
path_to_package = os.path.dirname(__file__)
|
|
239
|
+
up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
|
|
240
|
+
|
|
241
|
+
# Get directory of script and run it from there
|
|
242
|
+
cwd = os.path.dirname(os.path.abspath(up_script_path))
|
|
243
|
+
run_command = f'{up_script_path} {name} {f.name}'
|
|
244
|
+
if gpus:
|
|
245
|
+
run_command += ' --gpus'
|
|
246
|
+
run_command = shlex.split(run_command)
|
|
203
247
|
|
|
204
248
|
# Setup logging paths
|
|
205
249
|
run_timestamp = sky_logging.get_run_timestamp()
|
|
206
250
|
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
|
207
251
|
'local_up.log')
|
|
208
|
-
|
|
209
|
-
if cleanup:
|
|
210
|
-
msg_str = 'Cleaning up remote cluster...'
|
|
211
|
-
else:
|
|
212
|
-
msg_str = 'Deploying remote cluster...'
|
|
213
|
-
|
|
214
|
-
# Create environment with PYTHONUNBUFFERED=1 to ensure unbuffered output
|
|
215
|
-
env = os.environ.copy()
|
|
216
|
-
env['PYTHONUNBUFFERED'] = '1'
|
|
252
|
+
logger.info(message_str)
|
|
217
253
|
|
|
218
254
|
with rich_utils.safe_status(
|
|
219
|
-
ux_utils.spinner_message(
|
|
255
|
+
ux_utils.spinner_message(message_str,
|
|
220
256
|
log_path=log_path,
|
|
221
257
|
is_local=True)):
|
|
222
258
|
returncode, _, stderr = log_lib.run_with_log(
|
|
223
|
-
cmd=
|
|
259
|
+
cmd=run_command,
|
|
224
260
|
log_path=log_path,
|
|
225
261
|
require_outputs=True,
|
|
226
262
|
stream_logs=False,
|
|
227
|
-
line_processor=log_utils.
|
|
263
|
+
line_processor=log_utils.SkyLocalUpLineProcessor(
|
|
228
264
|
log_path=log_path, is_local=True),
|
|
229
|
-
cwd=cwd
|
|
230
|
-
env=env)
|
|
231
|
-
if returncode == 0:
|
|
232
|
-
success = True
|
|
233
|
-
else:
|
|
234
|
-
with ux_utils.print_exception_no_traceback():
|
|
235
|
-
log_hint = ux_utils.log_path_hint(log_path, is_local=True)
|
|
236
|
-
raise RuntimeError('Failed to deploy remote cluster. '
|
|
237
|
-
f'Full log: {log_hint}'
|
|
238
|
-
f'\nError: {stderr}')
|
|
239
|
-
|
|
240
|
-
if success:
|
|
241
|
-
if cleanup:
|
|
242
|
-
logger.info(
|
|
243
|
-
ux_utils.finishing_message(
|
|
244
|
-
'🎉 Remote cluster cleaned up successfully.',
|
|
245
|
-
log_path=log_path,
|
|
246
|
-
is_local=True))
|
|
247
|
-
else:
|
|
248
|
-
logger.info(
|
|
249
|
-
ux_utils.finishing_message(
|
|
250
|
-
'🎉 Remote cluster deployed successfully.',
|
|
251
|
-
log_path=log_path,
|
|
252
|
-
is_local=True))
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
def deploy_local_cluster(gpus: bool):
|
|
256
|
-
cluster_created = False
|
|
257
|
-
|
|
258
|
-
# Check if GPUs are available on the host
|
|
259
|
-
local_gpus_available = backend_utils.check_local_gpus()
|
|
260
|
-
gpus = gpus and local_gpus_available
|
|
261
|
-
|
|
262
|
-
# Check if ~/.kube/config exists:
|
|
263
|
-
if os.path.exists(os.path.expanduser('~/.kube/config')):
|
|
264
|
-
curr_context = kubernetes_utils.get_current_kube_config_context_name()
|
|
265
|
-
skypilot_context = 'kind-skypilot'
|
|
266
|
-
if curr_context is not None and curr_context != skypilot_context:
|
|
267
|
-
logger.info(
|
|
268
|
-
f'Current context in kube config: {curr_context}'
|
|
269
|
-
'\nWill automatically switch to kind-skypilot after the local '
|
|
270
|
-
'cluster is created.')
|
|
271
|
-
message_str = 'Creating local cluster{}...'
|
|
272
|
-
message_str = message_str.format((' with GPU support (this may take up '
|
|
273
|
-
'to 15 minutes)') if gpus else '')
|
|
274
|
-
path_to_package = os.path.dirname(__file__)
|
|
275
|
-
up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
|
|
276
|
-
|
|
277
|
-
# Get directory of script and run it from there
|
|
278
|
-
cwd = os.path.dirname(os.path.abspath(up_script_path))
|
|
279
|
-
run_command = up_script_path + ' --gpus' if gpus else up_script_path
|
|
280
|
-
run_command = shlex.split(run_command)
|
|
281
|
-
|
|
282
|
-
# Setup logging paths
|
|
283
|
-
run_timestamp = sky_logging.get_run_timestamp()
|
|
284
|
-
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
|
285
|
-
'local_up.log')
|
|
286
|
-
logger.info(message_str)
|
|
287
|
-
|
|
288
|
-
with rich_utils.safe_status(
|
|
289
|
-
ux_utils.spinner_message(message_str,
|
|
290
|
-
log_path=log_path,
|
|
291
|
-
is_local=True)):
|
|
292
|
-
returncode, _, stderr = log_lib.run_with_log(
|
|
293
|
-
cmd=run_command,
|
|
294
|
-
log_path=log_path,
|
|
295
|
-
require_outputs=True,
|
|
296
|
-
stream_logs=False,
|
|
297
|
-
line_processor=log_utils.SkyLocalUpLineProcessor(log_path=log_path,
|
|
298
|
-
is_local=True),
|
|
299
|
-
cwd=cwd)
|
|
265
|
+
cwd=cwd)
|
|
300
266
|
|
|
301
267
|
# Kind always writes to stderr even if it succeeds.
|
|
302
268
|
# If the failure happens after the cluster is created, we need
|
|
@@ -309,11 +275,11 @@ def deploy_local_cluster(gpus: bool):
|
|
|
309
275
|
elif returncode == 100:
|
|
310
276
|
logger.info(
|
|
311
277
|
ux_utils.finishing_message(
|
|
312
|
-
'Local cluster already exists.\n',
|
|
278
|
+
f'Local cluster {name} already exists.\n',
|
|
313
279
|
log_path=log_path,
|
|
314
280
|
is_local=True,
|
|
315
281
|
follow_up_message=
|
|
316
|
-
'If you want to delete it instead, run: sky local down'))
|
|
282
|
+
'If you want to delete it instead, run: `sky local down --name {name}`')) # pylint: disable=line-too-long
|
|
317
283
|
else:
|
|
318
284
|
with ux_utils.print_exception_no_traceback():
|
|
319
285
|
log_hint = ux_utils.log_path_hint(log_path, is_local=True)
|
|
@@ -339,7 +305,7 @@ def deploy_local_cluster(gpus: bool):
|
|
|
339
305
|
if gpus:
|
|
340
306
|
# Get GPU model by querying the node labels
|
|
341
307
|
label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
|
|
342
|
-
gpu_type_cmd = f'kubectl get node
|
|
308
|
+
gpu_type_cmd = f'kubectl get node {name}-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
|
|
343
309
|
try:
|
|
344
310
|
# Run the command and capture the output
|
|
345
311
|
gpu_count_output = subprocess.check_output(gpu_type_cmd,
|
|
@@ -375,8 +341,10 @@ def deploy_local_cluster(gpus: bool):
|
|
|
375
341
|
'This may cause issues with running tasks.')
|
|
376
342
|
logger.info(
|
|
377
343
|
ux_utils.finishing_message(
|
|
378
|
-
message=(
|
|
379
|
-
|
|
344
|
+
message=(
|
|
345
|
+
f'Local Kubernetes cluster {name} created successfully '
|
|
346
|
+
f'with {num_cpus} CPUs{gpu_message} on host port range '
|
|
347
|
+
f'{port_start}-{port_end}.'),
|
|
380
348
|
log_path=log_path,
|
|
381
349
|
is_local=True,
|
|
382
350
|
follow_up_message=(
|
|
@@ -384,3 +352,54 @@ def deploy_local_cluster(gpus: bool):
|
|
|
384
352
|
'Hint: To change the number of CPUs, change your docker '
|
|
385
353
|
'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.' # pylint: disable=line-too-long
|
|
386
354
|
f'{gpu_hint}')))
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def teardown_local_cluster(name: Optional[str] = None):
|
|
358
|
+
name = name or DEFAULT_LOCAL_CLUSTER_NAME
|
|
359
|
+
cluster_removed = False
|
|
360
|
+
|
|
361
|
+
path_to_package = os.path.dirname(__file__)
|
|
362
|
+
down_script_path = os.path.join(path_to_package, 'delete_cluster.sh')
|
|
363
|
+
|
|
364
|
+
cwd = os.path.dirname(os.path.abspath(down_script_path))
|
|
365
|
+
run_command = f'{down_script_path} {name}'
|
|
366
|
+
run_command = shlex.split(run_command)
|
|
367
|
+
|
|
368
|
+
# Setup logging paths
|
|
369
|
+
run_timestamp = sky_logging.get_run_timestamp()
|
|
370
|
+
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
|
371
|
+
'local_down.log')
|
|
372
|
+
|
|
373
|
+
with rich_utils.safe_status(
|
|
374
|
+
ux_utils.spinner_message(f'Removing local cluster {name}',
|
|
375
|
+
log_path=log_path,
|
|
376
|
+
is_local=True)):
|
|
377
|
+
|
|
378
|
+
returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
|
|
379
|
+
log_path=log_path,
|
|
380
|
+
require_outputs=True,
|
|
381
|
+
stream_logs=False,
|
|
382
|
+
cwd=cwd)
|
|
383
|
+
stderr = stderr.replace('No kind clusters found.\n', '')
|
|
384
|
+
|
|
385
|
+
if returncode == 0:
|
|
386
|
+
cluster_removed = True
|
|
387
|
+
elif returncode == 100:
|
|
388
|
+
logger.info(
|
|
389
|
+
ux_utils.error_message(f'Local cluster {name} does not exist.'))
|
|
390
|
+
else:
|
|
391
|
+
with ux_utils.print_exception_no_traceback():
|
|
392
|
+
raise RuntimeError(f'Failed to down local cluster {name}. '
|
|
393
|
+
f'Stdout: {stdout}'
|
|
394
|
+
f'\nError: {stderr}')
|
|
395
|
+
if cluster_removed:
|
|
396
|
+
# Run sky check
|
|
397
|
+
with rich_utils.safe_status(
|
|
398
|
+
ux_utils.spinner_message('Running sky check...')):
|
|
399
|
+
sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
|
|
400
|
+
clouds=['kubernetes'],
|
|
401
|
+
quiet=True)
|
|
402
|
+
logger.info(
|
|
403
|
+
ux_utils.finishing_message(f'Local cluster {name} removed.',
|
|
404
|
+
log_path=log_path,
|
|
405
|
+
is_local=True))
|
|
@@ -48,8 +48,16 @@ fi
|
|
|
48
48
|
|
|
49
49
|
if [ -z "$context" ] || [ "$context_lower" = "none" ]; then
|
|
50
50
|
# If context is none, it means we are using incluster auth. In this case,
|
|
51
|
-
#
|
|
52
|
-
kubectl exec
|
|
51
|
+
# we need to set KUBECONFIG to /dev/null to avoid using kubeconfig file.
|
|
52
|
+
kubectl_cmd_base="kubectl exec \"$resource_type/$resource_name\" -n \"$namespace\" --kubeconfig=/dev/null --"
|
|
53
53
|
else
|
|
54
|
-
kubectl exec
|
|
54
|
+
kubectl_cmd_base="kubectl exec \"$resource_type/$resource_name\" -n \"$namespace\" --context=\"$context\" --"
|
|
55
55
|
fi
|
|
56
|
+
|
|
57
|
+
# Execute command on remote pod, waiting for rsync to be available first.
|
|
58
|
+
# The waiting happens on the remote pod, not locally, which is more efficient
|
|
59
|
+
# and reliable than polling from the local machine.
|
|
60
|
+
# We wrap the command in a bash script that waits for rsync, then execs the original command.
|
|
61
|
+
# Timeout after MAX_WAIT_TIME_SECONDS seconds.
|
|
62
|
+
MAX_WAIT_TIME_SECONDS=300
|
|
63
|
+
eval "${kubectl_cmd_base% --} -i -- bash -c 'count=0; max_count=$MAX_WAIT_TIME_SECONDS*2; until which rsync >/dev/null 2>&1; do if [ \$count -ge \$max_count ]; then echo \"Error when trying to rsync files to kubernetes cluster. Package installation may have failed.\" >&2; exit 1; fi; sleep 0.5; count=\$((count+1)); done; exec \"\$@\"' -- \"\$@\""
|
sky/utils/kubernetes_enums.py
CHANGED
|
@@ -2,26 +2,13 @@
|
|
|
2
2
|
import enum
|
|
3
3
|
|
|
4
4
|
|
|
5
|
+
# TODO(kevin): Remove this enum in v0.13.0.
|
|
5
6
|
class KubernetesNetworkingMode(enum.Enum):
|
|
6
|
-
"""Enum for the different types of networking modes for accessing
|
|
7
|
-
jump pods.
|
|
7
|
+
"""Enum for the different types of networking modes for accessing pods.
|
|
8
8
|
"""
|
|
9
9
|
NODEPORT = 'nodeport'
|
|
10
10
|
PORTFORWARD = 'portforward'
|
|
11
11
|
|
|
12
|
-
@classmethod
|
|
13
|
-
def from_str(cls, mode: str) -> 'KubernetesNetworkingMode':
|
|
14
|
-
"""Returns the enum value for the given string."""
|
|
15
|
-
if mode.lower() == cls.NODEPORT.value:
|
|
16
|
-
return cls.NODEPORT
|
|
17
|
-
elif mode.lower() == cls.PORTFORWARD.value:
|
|
18
|
-
return cls.PORTFORWARD
|
|
19
|
-
else:
|
|
20
|
-
raise ValueError(f'Unsupported kubernetes networking mode: '
|
|
21
|
-
f'{mode}. The mode must be either '
|
|
22
|
-
f'\'{cls.PORTFORWARD.value}\' or '
|
|
23
|
-
f'\'{cls.NODEPORT.value}\'. ')
|
|
24
|
-
|
|
25
12
|
|
|
26
13
|
class KubernetesServiceType(enum.Enum):
|
|
27
14
|
"""Enum for the different types of services."""
|
|
@@ -44,3 +31,8 @@ class KubernetesAutoscalerType(enum.Enum):
|
|
|
44
31
|
KARPENTER = 'karpenter'
|
|
45
32
|
COREWEAVE = 'coreweave'
|
|
46
33
|
GENERIC = 'generic'
|
|
34
|
+
|
|
35
|
+
def emits_autoscale_event(self) -> bool:
|
|
36
|
+
"""Returns whether specific autoscaler emits the event reason
|
|
37
|
+
TriggeredScaleUp."""
|
|
38
|
+
return self not in {self.KARPENTER}
|
sky/utils/lock_events.py
CHANGED
|
@@ -20,17 +20,17 @@ class DistributedLockEvent:
|
|
|
20
20
|
f'[DistributedLock.hold]:{lock_id}')
|
|
21
21
|
|
|
22
22
|
def acquire(self):
|
|
23
|
-
was_locked = self._lock.is_locked
|
|
23
|
+
was_locked = self._lock.is_locked # type: ignore[truthy-function]
|
|
24
24
|
with timeline.Event(f'[DistributedLock.acquire]:{self._lock_id}'):
|
|
25
25
|
self._lock.acquire()
|
|
26
|
-
if not was_locked and self._lock.is_locked:
|
|
26
|
+
if not was_locked and self._lock.is_locked: # type: ignore[truthy-function] # pylint: disable=line-too-long
|
|
27
27
|
# start holding the lock after initial acquiring
|
|
28
28
|
self._hold_lock_event.begin()
|
|
29
29
|
|
|
30
30
|
def release(self):
|
|
31
|
-
was_locked = self._lock.is_locked
|
|
31
|
+
was_locked = self._lock.is_locked # type: ignore[truthy-function]
|
|
32
32
|
self._lock.release()
|
|
33
|
-
if was_locked and not self._lock.is_locked:
|
|
33
|
+
if was_locked and not self._lock.is_locked: # type: ignore[truthy-function] # pylint: disable=line-too-long
|
|
34
34
|
# stop holding the lock after initial releasing
|
|
35
35
|
self._hold_lock_event.end()
|
|
36
36
|
|