skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
|
@@ -1,9 +1,14 @@
|
|
|
1
1
|
"""Utility functions for deploying Kubernetes clusters."""
|
|
2
2
|
import os
|
|
3
|
+
import random
|
|
3
4
|
import shlex
|
|
4
5
|
import subprocess
|
|
6
|
+
import sys
|
|
5
7
|
import tempfile
|
|
6
|
-
|
|
8
|
+
import textwrap
|
|
9
|
+
from typing import List, Optional, Tuple
|
|
10
|
+
|
|
11
|
+
import colorama
|
|
7
12
|
|
|
8
13
|
from sky import check as sky_check
|
|
9
14
|
from sky import sky_logging
|
|
@@ -19,6 +24,151 @@ from sky.utils import ux_utils
|
|
|
19
24
|
|
|
20
25
|
logger = sky_logging.init_logger(__name__)
|
|
21
26
|
|
|
27
|
+
# Default path for Kubernetes configuration file
|
|
28
|
+
DEFAULT_KUBECONFIG_PATH = os.path.expanduser('~/.kube/config')
|
|
29
|
+
DEFAULT_LOCAL_CLUSTER_NAME = 'skypilot'
|
|
30
|
+
LOCAL_CLUSTER_PORT_RANGE = 100
|
|
31
|
+
LOCAL_CLUSTER_INTERNAL_PORT_START = 30000
|
|
32
|
+
LOCAL_CLUSTER_INTERNAL_PORT_END = 30099
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def check_ssh_cluster_dependencies(
|
|
36
|
+
raise_error: bool = True) -> Optional[List[str]]:
|
|
37
|
+
"""Checks if the dependencies for ssh cluster are installed.
|
|
38
|
+
|
|
39
|
+
Args:
|
|
40
|
+
raise_error: set to true when the dependency needs to be present.
|
|
41
|
+
set to false for `sky check`, where reason strings are compiled
|
|
42
|
+
at the end.
|
|
43
|
+
|
|
44
|
+
Returns: the reasons list if there are missing dependencies.
|
|
45
|
+
"""
|
|
46
|
+
# error message
|
|
47
|
+
jq_message = ('`jq` is required to setup ssh cluster.')
|
|
48
|
+
|
|
49
|
+
# save
|
|
50
|
+
reasons = []
|
|
51
|
+
required_binaries = []
|
|
52
|
+
|
|
53
|
+
# Ensure jq is installed
|
|
54
|
+
try:
|
|
55
|
+
subprocess.run(['jq', '--version'],
|
|
56
|
+
stdout=subprocess.DEVNULL,
|
|
57
|
+
stderr=subprocess.DEVNULL,
|
|
58
|
+
check=True)
|
|
59
|
+
except (FileNotFoundError, subprocess.CalledProcessError):
|
|
60
|
+
required_binaries.append('jq')
|
|
61
|
+
reasons.append(jq_message)
|
|
62
|
+
|
|
63
|
+
if required_binaries:
|
|
64
|
+
reasons.extend([
|
|
65
|
+
'On Debian/Ubuntu, install the missing dependenc(ies) with:',
|
|
66
|
+
f' $ sudo apt install {" ".join(required_binaries)}',
|
|
67
|
+
'On MacOS, install with: ',
|
|
68
|
+
f' $ brew install {" ".join(required_binaries)}',
|
|
69
|
+
])
|
|
70
|
+
if raise_error:
|
|
71
|
+
with ux_utils.print_exception_no_traceback():
|
|
72
|
+
raise RuntimeError('\n'.join(reasons))
|
|
73
|
+
return reasons
|
|
74
|
+
return None
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
def deploy_ssh_cluster(cleanup: bool = False,
|
|
78
|
+
infra: Optional[str] = None,
|
|
79
|
+
kubeconfig_path: Optional[str] = None):
|
|
80
|
+
"""Deploy a Kubernetes cluster on SSH targets.
|
|
81
|
+
|
|
82
|
+
This function reads ~/.sky/ssh_node_pools.yaml and uses it to deploy a
|
|
83
|
+
Kubernetes cluster on the specified machines.
|
|
84
|
+
|
|
85
|
+
Args:
|
|
86
|
+
cleanup: Whether to clean up the cluster instead of deploying.
|
|
87
|
+
infra: Name of the cluster in ssh_node_pools.yaml to use.
|
|
88
|
+
If None, the first cluster in the file will be used.
|
|
89
|
+
kubeconfig_path: Path to save the Kubernetes configuration file.
|
|
90
|
+
If None, the default ~/.kube/config will be used.
|
|
91
|
+
"""
|
|
92
|
+
check_ssh_cluster_dependencies()
|
|
93
|
+
|
|
94
|
+
# Prepare command to call deploy_remote_cluster.py script
|
|
95
|
+
# TODO(romilb): We should move this to a native python method/class call
|
|
96
|
+
# instead of invoking a script with subprocess.
|
|
97
|
+
path_to_package = os.path.dirname(__file__)
|
|
98
|
+
up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.py')
|
|
99
|
+
cwd = os.path.dirname(os.path.abspath(up_script_path))
|
|
100
|
+
|
|
101
|
+
deploy_command = [sys.executable, up_script_path]
|
|
102
|
+
|
|
103
|
+
if cleanup:
|
|
104
|
+
deploy_command.append('--cleanup')
|
|
105
|
+
|
|
106
|
+
if infra:
|
|
107
|
+
deploy_command.extend(['--infra', infra])
|
|
108
|
+
|
|
109
|
+
# Use the default kubeconfig path if none is provided
|
|
110
|
+
kubeconfig_path = kubeconfig_path or DEFAULT_KUBECONFIG_PATH
|
|
111
|
+
deploy_command.extend(['--kubeconfig-path', kubeconfig_path])
|
|
112
|
+
|
|
113
|
+
# Setup logging paths
|
|
114
|
+
run_timestamp = sky_logging.get_run_timestamp()
|
|
115
|
+
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
|
116
|
+
'ssh_up.log')
|
|
117
|
+
|
|
118
|
+
if cleanup:
|
|
119
|
+
msg_str = 'Cleaning up SSH Node Pools...'
|
|
120
|
+
else:
|
|
121
|
+
msg_str = 'Initializing deployment to SSH Node Pools...'
|
|
122
|
+
|
|
123
|
+
# Create environment with PYTHONUNBUFFERED=1 to ensure unbuffered output
|
|
124
|
+
env = os.environ.copy()
|
|
125
|
+
env['PYTHONUNBUFFERED'] = '1'
|
|
126
|
+
|
|
127
|
+
with rich_utils.safe_status(
|
|
128
|
+
ux_utils.spinner_message(msg_str, log_path=log_path,
|
|
129
|
+
is_local=True)):
|
|
130
|
+
returncode, _, stderr = log_lib.run_with_log(
|
|
131
|
+
cmd=deploy_command,
|
|
132
|
+
log_path=log_path,
|
|
133
|
+
require_outputs=True,
|
|
134
|
+
stream_logs=False,
|
|
135
|
+
line_processor=log_utils.SkySSHUpLineProcessor(log_path=log_path,
|
|
136
|
+
is_local=False),
|
|
137
|
+
cwd=cwd,
|
|
138
|
+
env=env)
|
|
139
|
+
|
|
140
|
+
if returncode == 0:
|
|
141
|
+
success = True
|
|
142
|
+
else:
|
|
143
|
+
with ux_utils.print_exception_no_traceback():
|
|
144
|
+
log_hint = ux_utils.log_path_hint(log_path, is_local=False)
|
|
145
|
+
raise RuntimeError('Failed to deploy SkyPilot on some Node Pools. '
|
|
146
|
+
f'{log_hint}'
|
|
147
|
+
f'\nError: {stderr}')
|
|
148
|
+
|
|
149
|
+
if success:
|
|
150
|
+
# Add an empty line to separate the deployment logs from the final
|
|
151
|
+
# message
|
|
152
|
+
logger.info('')
|
|
153
|
+
if cleanup:
|
|
154
|
+
logger.info(
|
|
155
|
+
ux_utils.finishing_message(
|
|
156
|
+
'🎉 SSH Node Pools cleaned up successfully.',
|
|
157
|
+
log_path=log_path,
|
|
158
|
+
is_local=True))
|
|
159
|
+
else:
|
|
160
|
+
logger.info(
|
|
161
|
+
ux_utils.finishing_message(
|
|
162
|
+
'🎉 SSH Node Pools set up successfully. ',
|
|
163
|
+
follow_up_message=(
|
|
164
|
+
f'Run `{colorama.Style.BRIGHT}'
|
|
165
|
+
f'sky check ssh'
|
|
166
|
+
f'{colorama.Style.RESET_ALL}` to verify access, '
|
|
167
|
+
f'`{colorama.Style.BRIGHT}sky launch --infra ssh'
|
|
168
|
+
f'{colorama.Style.RESET_ALL}` to launch a cluster. '),
|
|
169
|
+
log_path=log_path,
|
|
170
|
+
is_local=True))
|
|
171
|
+
|
|
22
172
|
|
|
23
173
|
def deploy_remote_cluster(ip_list: List[str],
|
|
24
174
|
ssh_user: str,
|
|
@@ -28,7 +178,7 @@ def deploy_remote_cluster(ip_list: List[str],
|
|
|
28
178
|
password: Optional[str] = None):
|
|
29
179
|
success = False
|
|
30
180
|
path_to_package = os.path.dirname(__file__)
|
|
31
|
-
up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.sh')
|
|
181
|
+
up_script_path = os.path.join(path_to_package, 'deploy_remote_cluster.py')
|
|
32
182
|
# Get directory of script and run it from there
|
|
33
183
|
cwd = os.path.dirname(os.path.abspath(up_script_path))
|
|
34
184
|
|
|
@@ -44,17 +194,18 @@ def deploy_remote_cluster(ip_list: List[str],
|
|
|
44
194
|
key_file.flush()
|
|
45
195
|
os.chmod(key_file.name, 0o600)
|
|
46
196
|
|
|
47
|
-
|
|
48
|
-
|
|
197
|
+
# Use the legacy mode command line arguments for backward compatibility
|
|
198
|
+
deploy_command = [
|
|
199
|
+
sys.executable, up_script_path, '--ips-file', ip_file.name,
|
|
200
|
+
'--user', ssh_user, '--ssh-key', key_file.name
|
|
201
|
+
]
|
|
202
|
+
|
|
49
203
|
if context_name is not None:
|
|
50
|
-
deploy_command
|
|
204
|
+
deploy_command.extend(['--context-name', context_name])
|
|
51
205
|
if password is not None:
|
|
52
|
-
deploy_command
|
|
206
|
+
deploy_command.extend(['--password', password])
|
|
53
207
|
if cleanup:
|
|
54
|
-
deploy_command
|
|
55
|
-
|
|
56
|
-
# Convert the command to a format suitable for subprocess
|
|
57
|
-
deploy_command = shlex.split(deploy_command)
|
|
208
|
+
deploy_command.append('--cleanup')
|
|
58
209
|
|
|
59
210
|
# Setup logging paths
|
|
60
211
|
run_timestamp = sky_logging.get_run_timestamp()
|
|
@@ -65,6 +216,11 @@ def deploy_remote_cluster(ip_list: List[str],
|
|
|
65
216
|
msg_str = 'Cleaning up remote cluster...'
|
|
66
217
|
else:
|
|
67
218
|
msg_str = 'Deploying remote cluster...'
|
|
219
|
+
|
|
220
|
+
# Create environment with PYTHONUNBUFFERED=1 to ensure unbuffered output
|
|
221
|
+
env = os.environ.copy()
|
|
222
|
+
env['PYTHONUNBUFFERED'] = '1'
|
|
223
|
+
|
|
68
224
|
with rich_utils.safe_status(
|
|
69
225
|
ux_utils.spinner_message(msg_str,
|
|
70
226
|
log_path=log_path,
|
|
@@ -76,7 +232,8 @@ def deploy_remote_cluster(ip_list: List[str],
|
|
|
76
232
|
stream_logs=False,
|
|
77
233
|
line_processor=log_utils.SkyRemoteUpLineProcessor(
|
|
78
234
|
log_path=log_path, is_local=True),
|
|
79
|
-
cwd=cwd
|
|
235
|
+
cwd=cwd,
|
|
236
|
+
env=env)
|
|
80
237
|
if returncode == 0:
|
|
81
238
|
success = True
|
|
82
239
|
else:
|
|
@@ -101,7 +258,93 @@ def deploy_remote_cluster(ip_list: List[str],
|
|
|
101
258
|
is_local=True))
|
|
102
259
|
|
|
103
260
|
|
|
104
|
-
def
|
|
261
|
+
def generate_kind_config(port_start: int,
|
|
262
|
+
num_nodes: int = 1,
|
|
263
|
+
gpus: bool = False) -> str:
|
|
264
|
+
"""Generate a kind cluster config with ports mapped from host to container
|
|
265
|
+
|
|
266
|
+
Port range will be [port_start, port_start + LOCAL_CLUSTER_PORT_RANGE)
|
|
267
|
+
Internally, this will map to ports 30000 - 30099
|
|
268
|
+
|
|
269
|
+
Args:
|
|
270
|
+
path: Path to generate the config file at
|
|
271
|
+
port_start: Port range start for mappings
|
|
272
|
+
num_nodes: Number of nodes in the cluster
|
|
273
|
+
gpus: If true, initialize kind cluster with GPU support
|
|
274
|
+
|
|
275
|
+
Returns:
|
|
276
|
+
The kind cluster config
|
|
277
|
+
"""
|
|
278
|
+
internal_start = LOCAL_CLUSTER_INTERNAL_PORT_START
|
|
279
|
+
internal_end = LOCAL_CLUSTER_INTERNAL_PORT_END
|
|
280
|
+
|
|
281
|
+
config = textwrap.dedent(f"""
|
|
282
|
+
apiVersion: kind.x-k8s.io/v1alpha4
|
|
283
|
+
kind: Cluster
|
|
284
|
+
kubeadmConfigPatches:
|
|
285
|
+
- |
|
|
286
|
+
kind: ClusterConfiguration
|
|
287
|
+
apiServer:
|
|
288
|
+
extraArgs:
|
|
289
|
+
"service-node-port-range": {internal_start}-{internal_end}
|
|
290
|
+
nodes:
|
|
291
|
+
- role: control-plane
|
|
292
|
+
kubeadmConfigPatches:
|
|
293
|
+
- |
|
|
294
|
+
kind: InitConfiguration
|
|
295
|
+
nodeRegistration:
|
|
296
|
+
kubeletExtraArgs:
|
|
297
|
+
node-labels: "ingress-ready=true"
|
|
298
|
+
""")
|
|
299
|
+
if gpus:
|
|
300
|
+
config += textwrap.indent(
|
|
301
|
+
textwrap.dedent("""
|
|
302
|
+
extraMounts:
|
|
303
|
+
- hostPath: /dev/null
|
|
304
|
+
containerPath: /var/run/nvidia-container-devices/all"""), ' ' * 2)
|
|
305
|
+
config += textwrap.indent(textwrap.dedent("""
|
|
306
|
+
extraPortMappings:"""), ' ' * 2)
|
|
307
|
+
for offset in range(LOCAL_CLUSTER_PORT_RANGE):
|
|
308
|
+
config += textwrap.indent(
|
|
309
|
+
textwrap.dedent(f"""
|
|
310
|
+
- containerPort: {internal_start + offset}
|
|
311
|
+
hostPort: {port_start + offset}
|
|
312
|
+
listenAddress: "0.0.0.0"
|
|
313
|
+
protocol: tcp
|
|
314
|
+
"""), ' ' * 2)
|
|
315
|
+
if num_nodes > 1:
|
|
316
|
+
config += '- role: worker\n' * (num_nodes - 1)
|
|
317
|
+
return config
|
|
318
|
+
|
|
319
|
+
|
|
320
|
+
def _get_port_range(name: str, port_start: Optional[int]) -> Tuple[int, int]:
|
|
321
|
+
is_default = name == DEFAULT_LOCAL_CLUSTER_NAME
|
|
322
|
+
if port_start is None:
|
|
323
|
+
if is_default:
|
|
324
|
+
port_start = LOCAL_CLUSTER_INTERNAL_PORT_START
|
|
325
|
+
else:
|
|
326
|
+
port_start = random.randint(301, 399) * 100
|
|
327
|
+
port_end = port_start + LOCAL_CLUSTER_PORT_RANGE - 1
|
|
328
|
+
|
|
329
|
+
port_range = f'Current port range: {port_start}-{port_end}'
|
|
330
|
+
if is_default and port_start != LOCAL_CLUSTER_INTERNAL_PORT_START:
|
|
331
|
+
raise ValueError('Default local cluster `skypilot` should have '
|
|
332
|
+
f'port range from 30000 to 30099. {port_range}.')
|
|
333
|
+
if not is_default and port_start == LOCAL_CLUSTER_INTERNAL_PORT_START:
|
|
334
|
+
raise ValueError('Port range 30000 to 30099 is reserved for '
|
|
335
|
+
f'default local cluster `skypilot`. {port_range}.')
|
|
336
|
+
if port_start % 100 != 0:
|
|
337
|
+
raise ValueError('Local cluster port start must be a multiple of 100. '
|
|
338
|
+
f'{port_range}.')
|
|
339
|
+
|
|
340
|
+
return port_start, port_end
|
|
341
|
+
|
|
342
|
+
|
|
343
|
+
def deploy_local_cluster(name: Optional[str], port_start: Optional[int],
|
|
344
|
+
gpus: bool):
|
|
345
|
+
name = name or DEFAULT_LOCAL_CLUSTER_NAME
|
|
346
|
+
port_start, port_end = _get_port_range(name, port_start)
|
|
347
|
+
context_name = f'kind-{name}'
|
|
105
348
|
cluster_created = False
|
|
106
349
|
|
|
107
350
|
# Check if GPUs are available on the host
|
|
@@ -111,41 +354,52 @@ def deploy_local_cluster(gpus: bool):
|
|
|
111
354
|
# Check if ~/.kube/config exists:
|
|
112
355
|
if os.path.exists(os.path.expanduser('~/.kube/config')):
|
|
113
356
|
curr_context = kubernetes_utils.get_current_kube_config_context_name()
|
|
114
|
-
|
|
115
|
-
if curr_context is not None and curr_context != skypilot_context:
|
|
357
|
+
if curr_context is not None and curr_context != context_name:
|
|
116
358
|
logger.info(
|
|
117
359
|
f'Current context in kube config: {curr_context}'
|
|
118
|
-
'\nWill automatically switch to
|
|
119
|
-
'cluster is created.')
|
|
120
|
-
message_str = 'Creating local cluster{}...'
|
|
121
|
-
message_str = message_str.format(
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
|
|
360
|
+
f'\nWill automatically switch to {context_name} after the '
|
|
361
|
+
'local cluster is created.')
|
|
362
|
+
message_str = 'Creating local cluster {}{}...'
|
|
363
|
+
message_str = message_str.format(
|
|
364
|
+
name,
|
|
365
|
+
' with GPU support (this may take up to 15 minutes)' if gpus else '')
|
|
125
366
|
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
367
|
+
with tempfile.NamedTemporaryFile(mode='w+', suffix='.yaml',
|
|
368
|
+
delete=True) as f:
|
|
369
|
+
# Choose random port range to use on the host machine.
|
|
370
|
+
# Port range is port_start - port_start + 99 (exactly 100 ports).
|
|
371
|
+
logger.debug(f'Using host port range {port_start}-{port_end}')
|
|
372
|
+
f.write(generate_kind_config(port_start, gpus=gpus))
|
|
373
|
+
f.flush()
|
|
130
374
|
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
|
134
|
-
'local_up.log')
|
|
135
|
-
logger.info(message_str)
|
|
375
|
+
path_to_package = os.path.dirname(__file__)
|
|
376
|
+
up_script_path = os.path.join(path_to_package, 'create_cluster.sh')
|
|
136
377
|
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
378
|
+
# Get directory of script and run it from there
|
|
379
|
+
cwd = os.path.dirname(os.path.abspath(up_script_path))
|
|
380
|
+
run_command = f'{up_script_path} {name} {f.name}'
|
|
381
|
+
if gpus:
|
|
382
|
+
run_command += ' --gpus'
|
|
383
|
+
run_command = shlex.split(run_command)
|
|
384
|
+
|
|
385
|
+
# Setup logging paths
|
|
386
|
+
run_timestamp = sky_logging.get_run_timestamp()
|
|
387
|
+
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
|
388
|
+
'local_up.log')
|
|
389
|
+
logger.info(message_str)
|
|
390
|
+
|
|
391
|
+
with rich_utils.safe_status(
|
|
392
|
+
ux_utils.spinner_message(message_str,
|
|
393
|
+
log_path=log_path,
|
|
394
|
+
is_local=True)):
|
|
395
|
+
returncode, _, stderr = log_lib.run_with_log(
|
|
396
|
+
cmd=run_command,
|
|
397
|
+
log_path=log_path,
|
|
398
|
+
require_outputs=True,
|
|
399
|
+
stream_logs=False,
|
|
400
|
+
line_processor=log_utils.SkyLocalUpLineProcessor(
|
|
401
|
+
log_path=log_path, is_local=True),
|
|
402
|
+
cwd=cwd)
|
|
149
403
|
|
|
150
404
|
# Kind always writes to stderr even if it succeeds.
|
|
151
405
|
# If the failure happens after the cluster is created, we need
|
|
@@ -158,11 +412,11 @@ def deploy_local_cluster(gpus: bool):
|
|
|
158
412
|
elif returncode == 100:
|
|
159
413
|
logger.info(
|
|
160
414
|
ux_utils.finishing_message(
|
|
161
|
-
'Local cluster already exists.\n',
|
|
415
|
+
f'Local cluster {name} already exists.\n',
|
|
162
416
|
log_path=log_path,
|
|
163
417
|
is_local=True,
|
|
164
418
|
follow_up_message=
|
|
165
|
-
'If you want to delete it instead, run: sky local down'))
|
|
419
|
+
'If you want to delete it instead, run: `sky local down --name {name}`')) # pylint: disable=line-too-long
|
|
166
420
|
else:
|
|
167
421
|
with ux_utils.print_exception_no_traceback():
|
|
168
422
|
log_hint = ux_utils.log_path_hint(log_path, is_local=True)
|
|
@@ -188,7 +442,7 @@ def deploy_local_cluster(gpus: bool):
|
|
|
188
442
|
if gpus:
|
|
189
443
|
# Get GPU model by querying the node labels
|
|
190
444
|
label_name_escaped = 'skypilot.co/accelerator'.replace('.', '\\.')
|
|
191
|
-
gpu_type_cmd = f'kubectl get node
|
|
445
|
+
gpu_type_cmd = f'kubectl get node {name}-control-plane -o jsonpath=\"{{.metadata.labels[\'{label_name_escaped}\']}}\"' # pylint: disable=line-too-long
|
|
192
446
|
try:
|
|
193
447
|
# Run the command and capture the output
|
|
194
448
|
gpu_count_output = subprocess.check_output(gpu_type_cmd,
|
|
@@ -224,8 +478,10 @@ def deploy_local_cluster(gpus: bool):
|
|
|
224
478
|
'This may cause issues with running tasks.')
|
|
225
479
|
logger.info(
|
|
226
480
|
ux_utils.finishing_message(
|
|
227
|
-
message=(
|
|
228
|
-
|
|
481
|
+
message=(
|
|
482
|
+
f'Local Kubernetes cluster {name} created successfully '
|
|
483
|
+
f'with {num_cpus} CPUs{gpu_message} on host port range '
|
|
484
|
+
f'{port_start}-{port_end}.'),
|
|
229
485
|
log_path=log_path,
|
|
230
486
|
is_local=True,
|
|
231
487
|
follow_up_message=(
|
|
@@ -233,3 +489,54 @@ def deploy_local_cluster(gpus: bool):
|
|
|
233
489
|
'Hint: To change the number of CPUs, change your docker '
|
|
234
490
|
'runtime settings. See https://kind.sigs.k8s.io/docs/user/quick-start/#settings-for-docker-desktop for more info.' # pylint: disable=line-too-long
|
|
235
491
|
f'{gpu_hint}')))
|
|
492
|
+
|
|
493
|
+
|
|
494
|
+
def teardown_local_cluster(name: Optional[str] = None):
|
|
495
|
+
name = name or DEFAULT_LOCAL_CLUSTER_NAME
|
|
496
|
+
cluster_removed = False
|
|
497
|
+
|
|
498
|
+
path_to_package = os.path.dirname(__file__)
|
|
499
|
+
down_script_path = os.path.join(path_to_package, 'delete_cluster.sh')
|
|
500
|
+
|
|
501
|
+
cwd = os.path.dirname(os.path.abspath(down_script_path))
|
|
502
|
+
run_command = f'{down_script_path} {name}'
|
|
503
|
+
run_command = shlex.split(run_command)
|
|
504
|
+
|
|
505
|
+
# Setup logging paths
|
|
506
|
+
run_timestamp = sky_logging.get_run_timestamp()
|
|
507
|
+
log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
|
|
508
|
+
'local_down.log')
|
|
509
|
+
|
|
510
|
+
with rich_utils.safe_status(
|
|
511
|
+
ux_utils.spinner_message(f'Removing local cluster {name}',
|
|
512
|
+
log_path=log_path,
|
|
513
|
+
is_local=True)):
|
|
514
|
+
|
|
515
|
+
returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
|
|
516
|
+
log_path=log_path,
|
|
517
|
+
require_outputs=True,
|
|
518
|
+
stream_logs=False,
|
|
519
|
+
cwd=cwd)
|
|
520
|
+
stderr = stderr.replace('No kind clusters found.\n', '')
|
|
521
|
+
|
|
522
|
+
if returncode == 0:
|
|
523
|
+
cluster_removed = True
|
|
524
|
+
elif returncode == 100:
|
|
525
|
+
logger.info(
|
|
526
|
+
ux_utils.error_message(f'Local cluster {name} does not exist.'))
|
|
527
|
+
else:
|
|
528
|
+
with ux_utils.print_exception_no_traceback():
|
|
529
|
+
raise RuntimeError(f'Failed to down local cluster {name}. '
|
|
530
|
+
f'Stdout: {stdout}'
|
|
531
|
+
f'\nError: {stderr}')
|
|
532
|
+
if cluster_removed:
|
|
533
|
+
# Run sky check
|
|
534
|
+
with rich_utils.safe_status(
|
|
535
|
+
ux_utils.spinner_message('Running sky check...')):
|
|
536
|
+
sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
|
|
537
|
+
clouds=['kubernetes'],
|
|
538
|
+
quiet=True)
|
|
539
|
+
logger.info(
|
|
540
|
+
ux_utils.finishing_message(f'Local cluster {name} removed.',
|
|
541
|
+
log_path=log_path,
|
|
542
|
+
is_local=True))
|