skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,9 +1,13 @@
|
|
|
1
1
|
"""Utility functions for deploying Kubernetes clusters."""
|
|
2
2
|
import os
|
|
3
|
+
import random
|
|
3
4
|
import shlex
|
|
4
5
|
import subprocess
|
|
5
6
|
import tempfile
|
|
6
|
-
|
|
7
|
+
import textwrap
|
|
8
|
+
from typing import List, Optional, Tuple
|
|
9
|
+
|
|
10
|
+
import colorama
|
|
7
11
|
|
|
8
12
|
from sky import check as sky_check
|
|
9
13
|
from sky import sky_logging
|
|
@@ -16,92 +20,194 @@ from sky.utils import log_utils
|
|
|
16
20
|
from sky.utils import rich_utils
|
|
17
21
|
from sky.utils import subprocess_utils
|
|
18
22
|
from sky.utils import ux_utils
|
|
23
|
+
from sky.utils.kubernetes import deploy_ssh_node_pools
|
|
19
24
|
|
|
20
25
|
logger = sky_logging.init_logger(__name__)
|
|
21
26
|
|
|
27
|
+
# Default path for Kubernetes configuration file
|
|
28
|
+
DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
|
|
29
|
+
DEFAULT_LOCAL_CLUSTER_NAME = 'skypilot'
|
|
30
|
+
LOCAL_CLUSTER_PORT_RANGE = 100
|
|
31
|
+
LOCAL_CLUSTER_INTERNAL_PORT_START = 30000
|
|
32
|
+
LOCAL_CLUSTER_INTERNAL_PORT_END = 30099
|
|
22
33
|
|
|
23
|
-
def deploy_remote_cluster(ip_list: List[str],
|
|
24
|
-
ssh_user: str,
|
|
25
|
-
ssh_key: str,
|
|
26
|
-
cleanup: bool,
|
|
27
|
-
context_name: Optional[str] = None,
|
|
28
|
-
password: Optional[str] = None):
|
|
29
|
-
success = False
|
|
30
|
-
path_to_package = os.path.dirname(__file__)
|
|
31
|
-
up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.sh')
|
|
32
|
-
# Get directory of script and run it from there
|
|
33
|
-
cwd = os.path.dirname(os.path.abspath(up_script_path))
|
|
34
|
-
|
|
35
|
-
# Create temporary files for the IPs and SSH key
|
|
36
|
-
with tempfile.NamedTemporaryFile(mode='w') as ip_file, \
|
|
37
|
-
tempfile.NamedTemporaryFile(mode='w') as key_file:
|
|
38
|
-
|
|
39
|
-
# Write IPs and SSH key to temporary files
|
|
40
|
-
ip_file.write('\n'.join(ip_list))
|
|
41
|
-
ip_file.flush()
|
|
42
|
-
|
|
43
|
-
key_file.write(ssh_key)
|
|
44
|
-
key_file.flush()
|
|
45
|
-
os.chmod(key_file.name, 0o600)
|
|
46
|
-
|
|
47
|
-
deploy_command = (f'{up_script_path} {ip_file.name} '
|
|
48
|
-
f'{ssh_user} {key_file.name}')
|
|
49
|
-
if context_name is not None:
|
|
50
|
-
deploy_command += f' {context_name}'
|
|
51
|
-
if password is not None:
|
|
52
|
-
deploy_command += f' --password {password}'
|
|
53
|
-
if cleanup:
|
|
54
|
-
deploy_command += ' --cleanup'
|
|
55
|
-
|
|
56
|
-
# Convert the command to a format suitable for subprocess
|
|
57
|
-
deploy_command = shlex.split(deploy_command)
|
|
58
34
|
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
|
|
62
|
-
'local_up.log')
|
|
35
|
+
def check_ssh_cluster_dependencies(
|
|
36
|
+
raise_error: bool = True) -> Optional[List[str]]:
|
|
37
|
+
"""Checks if the dependencies for ssh cluster are installed.
|
|
63
38
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
39
|
+
Args:
|
|
40
|
+
raise_error: set to true when the dependency needs to be present.
|
|
41
|
+
set to false for `sky check`, where reason strings are compiled
|
|
42
|
+
at the end.
|
|
43
|
+
|
|
44
|
+
Returns: the reasons list if there are missing dependencies.
|
|
45
|
+
"""
|
|
46
|
+
# error message
|
|
47
|
+
jq_message = ('`jq` is required to setup ssh cluster.')
|
|
48
|
+
|
|
49
|
+
# save
|
|
50
|
+
reasons = []
|
|
51
|
+
required_binaries = []
|
|
52
|
+
|
|
53
|
+
# Ensure jq is installed
|
|
54
|
+
try:
|
|
55
|
+
subprocess.run(['jq', '--version'],
|
|
56
|
+
stdout=subprocess.DEVNULL,
|
|
57
|
+
stderr=subprocess.DEVNULL,
|
|
58
|
+
check=True)
|
|
59
|
+
except (FileNotFoundError, subprocess.CalledProcessError):
|
|
60
|
+
required_binaries.append('jq')
|
|
61
|
+
reasons.append(jq_message)
|
|
62
|
+
|
|
63
|
+
if required_binaries:
|
|
64
|
+
reasons.extend([
|
|
65
|
+
'On Debian/Ubuntu, install the missing dependenc(ies) with:',
|
|
66
|
+
f' $ sudo apt install {" ".join(required_binaries)}',
|
|
67
|
+
'On MacOS, install with: ',
|
|
68
|
+
f' $ brew install {" ".join(required_binaries)}',
|
|
69
|
+
])
|
|
70
|
+
if raise_error:
|
|
83
71
|
with ux_utils.print_exception_no_traceback():
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
72
|
+
raise RuntimeError('\n'.join(reasons))
|
|
73
|
+
return reasons
|
|
74
|
+
return None
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def deploy_ssh_cluster(cleanup: bool = False,
                       infra: Optional[str] = None,
                       kubeconfig_path: Optional[str] = None):
    """Deploy (or clean up) a Kubernetes cluster on SSH targets.

    Verifies the local dependencies first, then delegates the actual work to
    `deploy_ssh_node_pools.deploy_clusters`, which uses
    ~/.sky/ssh_node_pools.yaml to find the machines to operate on.

    Args:
        cleanup: Whether to clean up the cluster instead of deploying.
        infra: Name of the cluster in ssh_node_pools.yaml to use.
            If None, the first cluster in the file will be used.
        kubeconfig_path: Path to save the Kubernetes configuration file.
            If None, the default ~/.kube/config will be used.

    Raises:
        RuntimeError: if a dependency is missing, or if the underlying
            deployment/cleanup raises any exception (chained as the cause).
    """
    check_ssh_cluster_dependencies()

    operation = 'Cleanup' if cleanup else 'Deployment'
    status_text = f'Initializing SSH Node Pools {operation}...'

    with rich_utils.safe_status(ux_utils.spinner_message(status_text)):
        try:
            deploy_ssh_node_pools.deploy_clusters(
                infra=infra, cleanup=cleanup, kubeconfig_path=kubeconfig_path)
        except Exception as e:  # pylint: disable=broad-except
            # Log the underlying error, then surface a short summary to the
            # user without a traceback.
            logger.error(str(e))
            with ux_utils.print_exception_no_traceback():
                raise RuntimeError(
                    'Failed to deploy SkyPilot on some Node Pools.') from e

    # Blank line to separate the spinner output from the final message.
    logger.info('')
    if cleanup:
        logger.info(
            ux_utils.finishing_message(
                '🎉 SSH Node Pools cleaned up successfully.'))
        return

    bright = colorama.Style.BRIGHT
    reset = colorama.Style.RESET_ALL
    next_steps = (f'Run `{bright}'
                  f'sky check ssh'
                  f'{reset}` to verify access, '
                  f'`{bright}sky launch --infra ssh'
                  f'{reset}` to launch a cluster.')
    logger.info(
        ux_utils.finishing_message('🎉 SSH Node Pools set up successfully. ',
                                   follow_up_message=next_steps))
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def generate_kind_config(port_start: int,
                         num_nodes: int = 1,
                         gpus: bool = False) -> str:
    """Generate a kind cluster config with ports mapped from host to container

    Host ports [port_start, port_start + LOCAL_CLUSTER_PORT_RANGE) are mapped
    one-to-one onto container ports starting at
    LOCAL_CLUSTER_INTERNAL_PORT_START, and the API server's
    service-node-port-range is set to
    LOCAL_CLUSTER_INTERNAL_PORT_START-LOCAL_CLUSTER_INTERNAL_PORT_END.

    Args:
        port_start: Port range start for mappings
        num_nodes: Number of nodes in the cluster; every node beyond the
            first is emitted as a `worker` role.
        gpus: If true, initialize kind cluster with GPU support (adds the
            nvidia-container-devices extra mount to the control plane).

    Returns:
        The kind cluster config
    """
    first_node_port = LOCAL_CLUSTER_INTERNAL_PORT_START
    last_node_port = LOCAL_CLUSTER_INTERNAL_PORT_END
    # Everything attached to the control-plane node entry is nested one
    # level (two spaces) under it.
    nested = ' ' * 2

    # Cluster-wide settings plus the control-plane node header.
    fragments = [
        textwrap.dedent(f"""
    apiVersion: kind.x-k8s.io/v1alpha4
    kind: Cluster
    kubeadmConfigPatches:
    - |
      kind: ClusterConfiguration
      apiServer:
        extraArgs:
          "service-node-port-range": {first_node_port}-{last_node_port}
    nodes:
    - role: control-plane
      kubeadmConfigPatches:
      - |
        kind: InitConfiguration
        nodeRegistration:
          kubeletExtraArgs:
            node-labels: "ingress-ready=true"
    """)
    ]

    if gpus:
        fragments.append(
            textwrap.indent(
                textwrap.dedent("""
        extraMounts:
          - hostPath: /dev/null
            containerPath: /var/run/nvidia-container-devices/all"""),
                nested))

    fragments.append(
        textwrap.indent(textwrap.dedent("""
        extraPortMappings:"""), nested))

    # One host->container mapping per port in the range.
    fragments.extend(
        textwrap.indent(
            textwrap.dedent(f"""
        - containerPort: {first_node_port + offset}
          hostPort: {port_start + offset}
          listenAddress: "0.0.0.0"
          protocol: tcp
        """), nested) for offset in range(LOCAL_CLUSTER_PORT_RANGE))

    if num_nodes > 1:
        fragments.append('- role: worker\n' * (num_nodes - 1))
    return ''.join(fragments)
|
|
181
|
+
|
|
182
|
+
|
|
183
|
+
def _get_port_range(name: str, port_start: Optional[int]) -> Tuple[int, int]:
    """Resolve and validate the host port range for a local cluster.

    Args:
        name: Name of the local cluster.
        port_start: First host port of the range. If None, the default
            cluster gets LOCAL_CLUSTER_INTERNAL_PORT_START and any other
            cluster gets a random multiple of 100 in [30100, 39900].

    Returns:
        (port_start, port_end), both inclusive, spanning
        LOCAL_CLUSTER_PORT_RANGE ports.

    Raises:
        ValueError: if the default cluster is given a non-default range, a
            non-default cluster is given the reserved default range, the
            start is not a multiple of 100, or the range falls outside the
            valid port numbers 0-65535.
    """
    is_default = name == DEFAULT_LOCAL_CLUSTER_NAME
    if port_start is None:
        port_start = (LOCAL_CLUSTER_INTERNAL_PORT_START
                      if is_default else random.randint(301, 399) * 100)
    port_end = port_start + LOCAL_CLUSTER_PORT_RANGE - 1

    port_range = f'Current port range: {port_start}-{port_end}'
    if is_default and port_start != LOCAL_CLUSTER_INTERNAL_PORT_START:
        raise ValueError('Default local cluster `skypilot` should have '
                         f'port range from 30000 to 30099. {port_range}.')
    if not is_default and port_start == LOCAL_CLUSTER_INTERNAL_PORT_START:
        raise ValueError('Port range 30000 to 30099 is reserved for '
                         f'default local cluster `skypilot`. {port_range}.')
    if port_start % 100 != 0:
        raise ValueError('Local cluster port start must be a multiple of 100. '
                         f'{port_range}.')
    # Reject ranges that cannot exist as port numbers here, so the user gets
    # a clear error instead of a failure deep inside cluster creation.
    if port_start < 0 or port_end > 65535:
        raise ValueError('Local cluster port range must be within 0-65535. '
                         f'{port_range}.')

    return port_start, port_end
|
|
88
204
|
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
is_local=True))
|
|
96
|
-
else:
|
|
97
|
-
logger.info(
|
|
98
|
-
ux_utils.finishing_message(
|
|
99
|
-
'🎉 Remote cluster deployed successfully.',
|
|
100
|
-
log_path=log_path,
|
|
101
|
-
is_local=True))
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
def deploy_local_cluster(gpus: bool):
|
|
205
|
+
|
|
206
|
+
def deploy_local_cluster(name: Optional[str], port_start: Optional[int],
|
|
207
|
+
gpus: bool):
|
|
208
|
+
name = name or DEFAULT_LOCAL_CLUSTER_NAME
|
|
209
|
+
port_start, port_end = _get_port_range(name, port_start)
|
|
210
|
+
context_name = f'kind-{name}'
|
|
105
211
|
cluster_created = False
|
|
106
212
|
|
|
107
213
|
# Check if GPUs are available on the host
|
|
@@ -111,41 +217,52 @@ def deploy_local_cluster(gpus: bool):
|
|
|
111
217
|
# Check if ~/.kube/config exists:
|
|
112
218
|
if os.path.exists(os.path.expanduser('~/.kube/config')):
|
|
113
219
|
curr_context = kubernetes_utils.get_current_kube_config_context_name()
|
|
114
|
-
|
|
115
|
-
if curr_context is not None and curr_context != skypilot_context:
|
|
220
|
+
if curr_context is not None and curr_context != context_name:
|
|
116
221
|
logger.info(
|
|
117
222
|
f'Current context in kube config: {curr_context}'
|
|
118
|
-
'\nWill automatically switch to
|
|
119
|
-
'cluster is created.')
|
|
120
|
-
message_str = 'Creating local cluster{}...'
|
|
121
|
-
message_str = message_str.format(
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
|
|
223
|
+
f'\nWill automatically switch to {context_name} after the '
|
|
224
|
+
'local cluster is created.')
|
|
225
|
+
message_str = 'Creating local cluster {}{}...'
|
|
226
|
+
message_str = message_str.format(
|
|
227
|
+
name,
|
|
228
|
+
' with GPU support (this may take up to 15 minutes)' if gpus else '')
|
|
125
229
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
230
|
+
with tempfile.NamedTemporaryFile(mode='w+', suffix='.yaml',
|
|
231
|
+
delete=True) as f:
|
|
232
|
+
# Choose random port range to use on the host machine.
|
|
233
|
+
# Port range is port_start - port_start + 99 (exactly 100 ports).
|
|
234
|
+
logger.debug(f'Using host port range {port_start}-{port_end}')
|
|
235
|
+
f.write(generate_kind_config(port_start, gpus=gpus))
|
|
236
|
+
f.flush()
|
|
130
237
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
|
134
|
-
'local_up.log')
|
|
135
|
-
logger.info(message_str)
|
|
238
|
+
path_to_package = os.path.dirname(__file__)
|
|
239
|
+
up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
|
|
136
240
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
241
|
+
# Get directory of script and run it from there
|
|
242
|
+
cwd = os.path.dirname(os.path.abspath(up_script_path))
|
|
243
|
+
run_command = f'{up_script_path} {name} {f.name}'
|
|
244
|
+
if gpus:
|
|
245
|
+
run_command += ' --gpus'
|
|
246
|
+
run_command = shlex.split(run_command)
|
|
247
|
+
|
|
248
|
+
# Setup logging paths
|
|
249
|
+
run_timestamp = sky_logging.get_run_timestamp()
|
|
250
|
+
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
|
251
|
+
'local_up.log')
|
|
252
|
+
logger.info(message_str)
|
|
253
|
+
|
|
254
|
+
with rich_utils.safe_status(
|
|
255
|
+
ux_utils.spinner_message(message_str,
|
|
256
|
+
log_path=log_path,
|
|
257
|
+
is_local=True)):
|
|
258
|
+
returncode, _, stderr = log_lib.run_with_log(
|
|
259
|
+
cmd=run_command,
|
|
260
|
+
log_path=log_path,
|
|
261
|
+
require_outputs=True,
|
|
262
|
+
stream_logs=False,
|
|
263
|
+
line_processor=log_utils.SkyLocalUpLineProcessor(
|
|
264
|
+
log_path=log_path, is_local=True),
|
|
265
|
+
cwd=cwd)
|
|
149
266
|
|
|
150
267
|
# Kind always writes to stderr even if it succeeds.
|
|
151
268
|
# If the failure happens after the cluster is created, we need
|
|
@@ -158,11 +275,11 @@ def deploy_local_cluster(gpus: bool):
|
|
|
158
275
|
elif returncode == 100:
|
|
159
276
|
logger.info(
|
|
160
277
|
ux_utils.finishing_message(
|
|
161
|
-
'Local cluster already exists.\n',
|
|
278
|
+
f'Local cluster {name} already exists.\n',
|
|
162
279
|
log_path=log_path,
|
|
163
280
|
is_local=True,
|
|
164
281
|
follow_up_message=
|
|
165
|
-
'If you want to delete it instead, run: sky local down'))
|
|
282
|
+
'If you want to delete it instead, run: `sky local down --name {name}`')) # pylint: disable=line-too-long
|
|
166
283
|
else:
|
|
167
284
|
with ux_utils.print_exception_no_traceback():
|
|
168
285
|
log_hint = ux_utils.log_path_hint(log_path, is_local=True)
|
|
@@ -188,7 +305,7 @@ def deploy_local_cluster(gpus: bool):
|
|
|
188
305
|
if gpus:
|
|
189
306
|
# Get GPU model by querying the node labels
|
|
190
307
|
label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
|
|
191
|
-
gpu_type_cmd = f'kubectl get node
|
|
308
|
+
gpu_type_cmd = f'kubectl get node {name}-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
|
|
192
309
|
try:
|
|
193
310
|
# Run the command and capture the output
|
|
194
311
|
gpu_count_output = subprocess.check_output(gpu_type_cmd,
|
|
@@ -224,8 +341,10 @@ def deploy_local_cluster(gpus: bool):
|
|
|
224
341
|
'This may cause issues with running tasks.')
|
|
225
342
|
logger.info(
|
|
226
343
|
ux_utils.finishing_message(
|
|
227
|
-
message=(
|
|
228
|
-
|
|
344
|
+
message=(
|
|
345
|
+
f'Local Kubernetes cluster {name} created successfully '
|
|
346
|
+
f'with {num_cpus} CPUs{gpu_message} on host port range '
|
|
347
|
+
f'{port_start}-{port_end}.'),
|
|
229
348
|
log_path=log_path,
|
|
230
349
|
is_local=True,
|
|
231
350
|
follow_up_message=(
|
|
@@ -233,3 +352,54 @@ def deploy_local_cluster(gpus: bool):
|
|
|
233
352
|
'Hint: To change the number of CPUs, change your docker '
|
|
234
353
|
'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.' # pylint: disable=line-too-long
|
|
235
354
|
f'{gpu_hint}')))
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def teardown_local_cluster(name: Optional[str] = None):
    """Remove a local cluster by running the bundled `delete_cluster.sh`.

    Args:
        name: Name of the local cluster to remove. Falls back to
            DEFAULT_LOCAL_CLUSTER_NAME when None or empty.

    Raises:
        RuntimeError: if the script exits with a code other than 0 (removed)
            or 100 (cluster does not exist).
    """
    name = name or DEFAULT_LOCAL_CLUSTER_NAME
    cluster_removed = False

    path_to_package = os.path.dirname(__file__)
    down_script_path = os.path.join(path_to_package, 'delete_cluster.sh')

    # Run the script from its own directory.
    cwd = os.path.dirname(os.path.abspath(down_script_path))
    # Build argv directly instead of formatting a string and shlex-splitting
    # it: a script path or cluster name containing whitespace or quotes
    # would otherwise be split into the wrong arguments.
    run_command = [down_script_path, name]

    # Setup logging paths
    run_timestamp = sky_logging.get_run_timestamp()
    log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
                            'local_down.log')

    with rich_utils.safe_status(
            ux_utils.spinner_message(f'Removing local cluster {name}',
                                     log_path=log_path,
                                     is_local=True)):

        returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
                                                          log_path=log_path,
                                                          require_outputs=True,
                                                          stream_logs=False,
                                                          cwd=cwd)
        # Drop this informational line so it does not show up in the error
        # message raised below.
        stderr = stderr.replace('No kind clusters found.\n', '')

        if returncode == 0:
            cluster_removed = True
        elif returncode == 100:
            # Exit code 100 is treated as "no such cluster": report it
            # without raising.
            logger.info(
                ux_utils.error_message(f'Local cluster {name} does not exist.'))
        else:
            with ux_utils.print_exception_no_traceback():
                raise RuntimeError(f'Failed to down local cluster {name}. '
                                   f'Stdout: {stdout}'
                                   f'\nError: {stderr}')
    if cluster_removed:
        # Run sky check
        with rich_utils.safe_status(
                ux_utils.spinner_message('Running sky check...')):
            sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
                                       clouds=['kubernetes'],
                                       quiet=True)
        logger.info(
            ux_utils.finishing_message(f'Local cluster {name} removed.',
                                       log_path=log_path,
                                       is_local=True))
|
|
@@ -48,8 +48,16 @@ fi
|
|
|
48
48
|
|
|
49
49
|
if [ -z "$context" ] || [ "$context_lower" = "none" ]; then
|
|
50
50
|
# If context is none, it means we are using incluster auth. In this case,
|
|
51
|
-
#
|
|
52
|
-
kubectl exec
|
|
51
|
+
# we need to set KUBECONFIG to /dev/null to avoid using kubeconfig file.
|
|
52
|
+
kubectl_cmd_base="kubectl exec \"$resource_type/$resource_name\" -n \"$namespace\" --kubeconfig=/dev/null --"
|
|
53
53
|
else
|
|
54
|
-
kubectl exec
|
|
54
|
+
kubectl_cmd_base="kubectl exec \"$resource_type/$resource_name\" -n \"$namespace\" --context=\"$context\" --"
|
|
55
55
|
fi
|
|
56
|
+
|
|
57
|
+
# Execute command on remote pod, waiting for rsync to be available first.
|
|
58
|
+
# The waiting happens on the remote pod, not locally, which is more efficient
|
|
59
|
+
# and reliable than polling from the local machine.
|
|
60
|
+
# We wrap the command in a bash script that waits for rsync, then execs the original command.
|
|
61
|
+
# Timeout after MAX_WAIT_TIME_SECONDS seconds.
|
|
62
|
+
MAX_WAIT_TIME_SECONDS=300
|
|
63
|
+
eval "${kubectl_cmd_base% --} -i -- bash -c 'count=0; max_count=$MAX_WAIT_TIME_SECONDS*2; until which rsync >/dev/null 2>&1; do if [ \$count -ge \$max_count ]; then echo \"Error when trying to rsync files to kubernetes cluster. Package installation may have failed.\" >&2; exit 1; fi; sleep 0.5; count=\$((count+1)); done; exec \"\$@\"' -- \"\$@\""
|