skypilot_nightly-1.0.0.dev20250502-py3-none-any.whl → skypilot_nightly-1.0.0.dev20251203-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/utils/db_utils.py
DELETED
@@ -1,100 +0,0 @@
-"""Utils for sky databases."""
-import contextlib
-import sqlite3
-import threading
-from typing import Any, Callable, Optional
-
-# This parameter (passed to sqlite3.connect) controls how long we will wait to
-# obtains a database lock (not necessarily during connection, but whenever it is
-# needed). It is not a connection timeout.
-# Even in WAL mode, only a single writer is allowed at a time. Other writers
-# will block until the write lock can be obtained. This behavior is described in
-# the SQLite documentation for WAL: https://www.sqlite.org/wal.html
-# Python's default timeout is 5s. In normal usage, lock contention is very low,
-# and this is more than sufficient. However, in some highly concurrent cases,
-# such as a jobs controller suddenly recovering thousands of jobs at once, we
-# can see a small number of processes that take much longer to obtain the lock.
-# In contrived highly contentious cases, around 0.1% of transactions will take
-# >30s to take the lock. We have not seen cases that take >60s. For cases up to
-# 1000x parallelism, this is thus thought to be a conservative setting.
-# For more info, see the PR description for #4552.
-_DB_TIMEOUT_S = 60
-
-
-@contextlib.contextmanager
-def safe_cursor(db_path: str):
-    """A newly created, auto-committing, auto-closing cursor."""
-    conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
-    cursor = conn.cursor()
-    try:
-        yield cursor
-    finally:
-        cursor.close()
-        conn.commit()
-        conn.close()
-
-
-def add_column_to_table(
-    cursor: 'sqlite3.Cursor',
-    conn: 'sqlite3.Connection',
-    table_name: str,
-    column_name: str,
-    column_type: str,
-    copy_from: Optional[str] = None,
-    value_to_replace_existing_entries: Optional[Any] = None,
-):
-    """Add a column to a table."""
-    for row in cursor.execute(f'PRAGMA table_info({table_name})'):
-        if row[1] == column_name:
-            break
-    else:
-        try:
-            add_column_cmd = (f'ALTER TABLE {table_name} '
-                              f'ADD COLUMN {column_name} {column_type}')
-            cursor.execute(add_column_cmd)
-            if copy_from is not None:
-                cursor.execute(f'UPDATE {table_name} '
-                               f'SET {column_name} = {copy_from}')
-            if value_to_replace_existing_entries is not None:
-                cursor.execute(
-                    f'UPDATE {table_name} '
-                    f'SET {column_name} = (?) '
-                    f'WHERE {column_name} IS NULL',
-                    (value_to_replace_existing_entries,))
-        except sqlite3.OperationalError as e:
-            if 'duplicate column name' in str(e):
-                # We may be trying to add the same column twice, when
-                # running multiple threads. This is fine.
-                pass
-            else:
-                raise
-    conn.commit()
-
-
-def rename_column(
-    cursor: 'sqlite3.Cursor',
-    conn: 'sqlite3.Connection',
-    table_name: str,
-    old_name: str,
-    new_name: str,
-):
-    """Rename a column in a table."""
-    # NOTE: This only works for sqlite3 >= 3.25.0. Be careful to use this.
-
-    for row in cursor.execute(f'PRAGMA table_info({table_name})'):
-        if row[1] == old_name:
-            cursor.execute(f'ALTER TABLE {table_name} '
-                           f'RENAME COLUMN {old_name} to {new_name}')
-            break
-    conn.commit()
-
-
-class SQLiteConn(threading.local):
-    """Thread-local connection to the sqlite3 database."""
-
-    def __init__(self, db_path: str, create_table: Callable):
-        super().__init__()
-        self.db_path = db_path
-        self.conn = sqlite3.connect(db_path, timeout=_DB_TIMEOUT_S)
-        self.cursor = self.conn.cursor()
-        create_table(self.cursor, self.conn)
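For context, the deleted module was a thin convenience wrapper around the standard sqlite3 module (a successor appears at sky/utils/db/db_utils.py in the file list above, alongside the new Alembic migrations under sky/schemas/db/). Below is a minimal sketch of how these helpers would typically have been used; the database path, table, and column names are hypothetical and for illustration only:

from sky.utils import db_utils  # the module shown above, as it existed before removal

DB_PATH = '/tmp/example_state.db'  # hypothetical path

def create_table(cursor, conn):
    cursor.execute('CREATE TABLE IF NOT EXISTS clusters (name TEXT PRIMARY KEY)')
    conn.commit()

# Thread-local connection; create_table() runs when a thread first creates it.
db = db_utils.SQLiteConn(DB_PATH, create_table)

# Idempotent schema tweak: silently skipped if the column already exists.
db_utils.add_column_to_table(db.cursor, db.conn, 'clusters', 'status', 'TEXT',
                             value_to_replace_existing_entries='UNKNOWN')

# Short-lived cursor that commits and closes its connection on exit.
with db_utils.safe_cursor(DB_PATH) as cursor:
    cursor.execute('SELECT name, status FROM clusters')
    print(cursor.fetchall())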
sky/utils/kubernetes/deploy_remote_cluster.sh
DELETED
@@ -1,308 +0,0 @@
-#!/bin/bash
-# Refer to https://docs.skypilot.co/en/latest/reservations/existing-machines.html for details on how to use this script.
-set -e
-
-# Colors for nicer UX
-RED='\033[0;31m'
-GREEN='\033[0;32m'
-YELLOW='\033[1;33m'
-NC='\033[0m' # No color
-
-# Variables
-CLEANUP=false
-INSTALL_GPU=false
-POSITIONAL_ARGS=()
-PASSWORD=""
-
-# Process all arguments
-while [[ $# -gt 0 ]]; do
-  case $1 in
-    --cleanup)
-      CLEANUP=true
-      shift
-      ;;
-    --password)
-      PASSWORD=$2
-      shift
-      shift
-      ;;
-    *)
-      POSITIONAL_ARGS+=("$1")
-      shift
-      ;;
-  esac
-done
-
-# Restore positional arguments in correct order
-set -- "${POSITIONAL_ARGS[@]}"
-
-# Assign positional arguments to variables
-IPS_FILE=$1
-USER=$2
-SSH_KEY=$3
-CONTEXT_NAME=${4:-default}
-K3S_TOKEN=mytoken # Any string can be used as the token
-# Create temporary askpass script for sudo
-ASKPASS_BLOCK="# Create temporary askpass script
-ASKPASS_SCRIPT=\$(mktemp)
-trap 'rm -f \$ASKPASS_SCRIPT' EXIT INT TERM ERR QUIT
-cat > \$ASKPASS_SCRIPT << EOF
-#!/bin/bash
-echo $PASSWORD
-EOF
-chmod 700 \$ASKPASS_SCRIPT
-# Use askpass
-export SUDO_ASKPASS=\$ASKPASS_SCRIPT
-"
-
-# Basic argument checks
-if [ -z "$IPS_FILE" ] || [ -z "$USER" ] || [ -z "$SSH_KEY" ]; then
-  >&2 echo -e "${RED}Error: Missing required arguments.${NC}"
-  >&2 echo "Usage: ./deploy_remote_cluster.sh ips.txt username path/to/ssh/key [context-name] [--cleanup] [--password password]"
-  exit 1
-fi
-
-# Check if SSH key exists
-if [ ! -f "$SSH_KEY" ]; then
-  >&2 echo -e "${RED}Error: SSH key not found: $SSH_KEY${NC}"
-  exit 1
-fi
-
-# Check if IPs file exists
-if [ ! -f "$IPS_FILE" ]; then
-  >&2 echo -e "${RED}Error: IPs file not found: $IPS_FILE${NC}"
-  exit 1
-fi
-
-# Get head node and worker nodes from the IPs file
-HEAD_NODE=$(head -n 1 "$IPS_FILE")
-WORKER_NODES=$(tail -n +2 "$IPS_FILE")
-
-# Check if the IPs file is empty or not formatted correctly
-if [ -z "$HEAD_NODE" ]; then
-  >&2 echo -e "${RED}Error: IPs file is empty or not formatted correctly.${NC}"
-  exit 1
-fi
-
-# Function to show a progress message
-progress_message() {
-  echo -e "${YELLOW}➜ $1${NC}"
-}
-
-# Step to display success
-success_message() {
-  echo -e "${GREEN}✔ $1${NC}"
-}
-
-# Function to run a command on a remote machine via SSH
-run_remote() {
-  local NODE_IP=$1
-  local CMD=$2
-  # echo -e "${YELLOW}Running command on $NODE_IP...${NC}"
-  ssh -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY" "$USER@$NODE_IP" "$CMD"
-}
-
-# Function to uninstall k3s and clean up the state on a remote machine
-cleanup_server_node() {
-  local NODE_IP=$1
-  echo -e "${YELLOW}Cleaning up head node $NODE_IP...${NC}"
-  run_remote "$NODE_IP" "
-    $ASKPASS_BLOCK
-    echo 'Uninstalling k3s...' &&
-    sudo -A /usr/local/bin/k3s-uninstall.sh || true &&
-    sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
-  "
-  echo -e "${GREEN}Node $NODE_IP cleaned up successfully.${NC}"
-}
-
-# Function to uninstall k3s and clean up the state on a remote machine
-cleanup_agent_node() {
-  local NODE_IP=$1
-  echo -e "${YELLOW}Cleaning up node $NODE_IP...${NC}"
-  run_remote "$NODE_IP" "
-    $ASKPASS_BLOCK
-    echo 'Uninstalling k3s...' &&
-    sudo -A /usr/local/bin/k3s-agent-uninstall.sh || true &&
-    sudo -A rm -rf /etc/rancher /var/lib/rancher /var/lib/kubelet /etc/kubernetes ~/.kube
-  "
-  echo -e "${GREEN}Node $NODE_IP cleaned up successfully.${NC}"
-}
-
-check_gpu() {
-  local NODE_IP=$1
-  if run_remote "$NODE_IP" "command -v nvidia-smi &> /dev/null && nvidia-smi --query-gpu=gpu_name --format=csv,noheader &> /dev/null"; then
-    return 0 # GPU detected
-  else
-    return 1 # No GPU detected
-  fi
-}
-
-# Pre-flight checks
-run_remote "$HEAD_NODE" "echo 'SSH connection successful'"
-# TODO: Add more pre-flight checks here, including checking if port 6443 is accessible
-
-# If --cleanup flag is set, uninstall k3s and exit
-if [ "$CLEANUP" == "true" ]; then
-  echo -e "${YELLOW}Starting cleanup...${NC}"
-
-  # Clean up head node
-  cleanup_server_node "$HEAD_NODE"
-
-  # Clean up worker nodes
-  for NODE in $WORKER_NODES; do
-    cleanup_agent_node "$NODE"
-  done
-
-  # Remove the context from local kubeconfig if it exists
-  if [ -f "$HOME/.kube/config" ]; then
-    progress_message "Removing context '$CONTEXT_NAME' from local kubeconfig..."
-    kubectl config delete-context "$CONTEXT_NAME" 2>/dev/null || true
-    kubectl config delete-cluster "$CONTEXT_NAME" 2>/dev/null || true
-    kubectl config delete-user "$CONTEXT_NAME" 2>/dev/null || true
-    # Update the current context to the first available context
-    kubectl config use-context $(kubectl config view -o jsonpath='{.contexts[0].name}') 2>/dev/null || true
-    success_message "Context '$CONTEXT_NAME' removed from local kubeconfig."
-  fi
-
-  echo -e "${GREEN}Cleanup completed successfully.${NC}"
-  exit 0
-fi
-
-# Step 1: Install k3s on the head node
-progress_message "Deploying Kubernetes on head node ($HEAD_NODE)..."
-run_remote "$HEAD_NODE" "
-  $ASKPASS_BLOCK
-  curl -sfL https://get.k3s.io | K3S_TOKEN=$K3S_TOKEN sudo -E -A sh - &&
-  mkdir -p ~/.kube &&
-  sudo -A cp /etc/rancher/k3s/k3s.yaml ~/.kube/config &&
-  sudo -A chown \$(id -u):\$(id -g) ~/.kube/config &&
-  for i in {1..3}; do
-    if kubectl wait --for=condition=ready node --all --timeout=2m --kubeconfig ~/.kube/config; then
-      break
-    else
-      echo 'Waiting for nodes to be ready...'
-      sleep 5
-    fi
-  done
-  if [ \$i -eq 3 ]; then
-    echo 'Failed to wait for nodes to be ready after 3 attempts'
-    exit 1
-  fi"
-success_message "K3s deployed on head node."
-
-# Check if head node has a GPU
-if check_gpu "$HEAD_NODE"; then
-  echo -e "${YELLOW}GPU detected on head node ($HEAD_NODE).${NC}"
-  INSTALL_GPU=true
-fi
-
-# Fetch the head node's internal IP (this will be passed to worker nodes)
-MASTER_ADDR=$(run_remote "$HEAD_NODE" "hostname -I | awk '{print \$1}'")
-
-echo -e "${GREEN}Master node internal IP: $MASTER_ADDR${NC}"
-
-# Step 2: Install k3s on worker nodes and join them to the master node
-for NODE in $WORKER_NODES; do
-  progress_message "Deploying Kubernetes on worker node ($NODE)..."
-  run_remote "$NODE" "
-    $ASKPASS_BLOCK
-    curl -sfL https://get.k3s.io | K3S_URL=https://$MASTER_ADDR:6443 K3S_TOKEN=$K3S_TOKEN sudo -E -A sh -"
-  success_message "Kubernetes deployed on worker node ($NODE)."
-
-  # Check if worker node has a GPU
-  if check_gpu "$NODE"; then
-    echo -e "${YELLOW}GPU detected on worker node ($NODE).${NC}"
-    INSTALL_GPU=true
-  fi
-done
-# Step 3: Configure local kubectl to connect to the cluster
-progress_message "Configuring local kubectl to connect to the cluster..."
-
-# Create temporary directory for kubeconfig operations
-TEMP_DIR=$(mktemp -d)
-TEMP_KUBECONFIG="$TEMP_DIR/kubeconfig"
-
-# Get the kubeconfig from remote server
-scp -o StrictHostKeyChecking=no -o IdentitiesOnly=yes -i "$SSH_KEY" "$USER@$HEAD_NODE":~/.kube/config "$TEMP_KUBECONFIG"
-
-# Create .kube directory if it doesn't exist
-mkdir -p "$HOME/.kube"
-
-# Create empty kubeconfig if it doesn't exist
-KUBECONFIG_FILE="$HOME/.kube/config"
-if [[ ! -f "$KUBECONFIG_FILE" ]]; then
-  touch "$KUBECONFIG_FILE"
-fi
-
-# Modify the temporary kubeconfig to update server address and context name
-awk -v context="$CONTEXT_NAME" '
-/^clusters:/ { in_cluster = 1 }
-/^users:/ { in_cluster = 0 }
-in_cluster && /^ *certificate-authority-data:/ { next }
-in_cluster && /^ *server:/ {
-  print "    server: https://'${HEAD_NODE}:6443'"
-  print "    insecure-skip-tls-verify: true"
-  next
-}
-/name: default/ { sub("name: default", "name: " context) }
-/cluster: default/ { sub("cluster: default", "cluster: " context) }
-/user: default/ { sub("user: default", "user: " context) }
-/current-context: default/ { sub("current-context: default", "current-context: " context) }
-{ print }
-' "$TEMP_KUBECONFIG" > "$TEMP_DIR/modified_config"
-
-# Merge the configurations using kubectl
-KUBECONFIG="$KUBECONFIG_FILE:$TEMP_DIR/modified_config" kubectl config view --flatten > "$TEMP_DIR/merged_config"
-mv "$TEMP_DIR/merged_config" "$KUBECONFIG_FILE"
-
-# Set the new context as the current context
-kubectl config use-context "$CONTEXT_NAME"
-
-# Clean up temporary files
-rm -rf "$TEMP_DIR"
-
-success_message "kubectl configured with new context '$CONTEXT_NAME'."
-
-echo "Cluster deployment completed. You can now run 'kubectl get nodes' to verify the setup."
-
-# Install GPU operator if a GPU was detected on any node
-if [ "$INSTALL_GPU" == "true" ]; then
-  echo -e "${YELLOW}GPU detected in the cluster. Installing Nvidia GPU Operator...${NC}"
-  run_remote "$HEAD_NODE" "
-    $ASKPASS_BLOCK
-    curl -fsSL -o get_helm.sh https://raw.githubusercontent.com/helm/helm/master/scripts/get-helm-3 &&
-    chmod 700 get_helm.sh &&
-    ./get_helm.sh &&
-    helm repo add nvidia https://helm.ngc.nvidia.com/nvidia && helm repo update &&
-    kubectl create namespace gpu-operator --kubeconfig ~/.kube/config || true &&
-    sudo -A ln -s /sbin/ldconfig /sbin/ldconfig.real || true &&
-    helm install gpu-operator -n gpu-operator --create-namespace nvidia/gpu-operator \
-      --set 'toolkit.env[0].name=CONTAINERD_CONFIG' \
-      --set 'toolkit.env[0].value=/var/lib/rancher/k3s/agent/etc/containerd/config.toml' \
-      --set 'toolkit.env[1].name=CONTAINERD_SOCKET' \
-      --set 'toolkit.env[1].value=/run/k3s/containerd/containerd.sock' \
-      --set 'toolkit.env[2].name=CONTAINERD_RUNTIME_CLASS' \
-      --set 'toolkit.env[2].value=nvidia' &&
-    echo 'Waiting for GPU operator installation...' &&
-    while ! kubectl describe nodes --kubeconfig ~/.kube/config | grep -q 'nvidia.com/gpu:'; do
-      echo 'Waiting for GPU operator...'
-      sleep 5
-    done
-    echo 'GPU operator installed successfully.'"
-  success_message "GPU Operator installed."
-else
-  echo -e "${YELLOW}No GPUs detected. Skipping GPU Operator installation.${NC}"
-fi
-
-# Configure SkyPilot
-progress_message "Configuring SkyPilot..."
-sky check kubernetes
-success_message "SkyPilot configured successfully."
-
-# Display final success message
-echo -e "${GREEN}==== 🎉 Kubernetes cluster deployment completed successfully 🎉 ====${NC}"
-echo "You can now interact with your Kubernetes cluster through SkyPilot: "
-echo "  • List available GPUs: sky show-gpus --cloud kubernetes"
-echo "  • Launch a GPU development pod: sky launch -c devbox --cloud kubernetes --gpus A100:1"
-echo "  • Connect to pod with SSH: ssh devbox"
-echo "  • Connect to pod with VSCode: code --remote ssh-remote+devbox '/'"
sky/utils/kubernetes/ssh_jump_lifecycle_manager.py
DELETED
@@ -1,191 +0,0 @@
-"""Manages lifecycle of ssh jump pod.
-
-This script runs inside ssh jump pod as the main process (PID 1).
-
-It terminates itself (by removing ssh jump service and pod via a call to
-kubeapi) if it does not see ray pods in the duration of 10 minutes. If the
-user re-launches a task before the duration is over, then ssh jump pod is being
-reused and will terminate itself when it sees that no ray clusters exist in
-that duration.
-
-To allow multiple users to the share the same SSH jump pod,
-this script also reloads SSH keys from the mounted secret volume on an
-interval and updates `~/.ssh/authorized_keys`.
-"""
-import datetime
-import os
-import subprocess
-import sys
-import threading
-import time
-
-from kubernetes import client
-from kubernetes import config
-
-# Load kube config
-config.load_incluster_config()
-
-v1 = client.CoreV1Api()
-
-current_name = os.getenv('MY_POD_NAME')
-current_namespace = os.getenv('MY_POD_NAMESPACE')
-
-# The amount of time in seconds where no Ray pods exist in which after that time
-# ssh jump pod terminates itself
-alert_threshold = int(os.getenv('ALERT_THRESHOLD', '600'))
-# The amount of time in seconds to wait between Ray pods existence checks
-retry_interval = int(os.getenv('RETRY_INTERVAL', '60'))
-# The amount of time in seconds to wait between SSH key reloads
-reload_interval = int(os.getenv('RELOAD_INTERVAL', '5'))
-
-# Ray pods are labeled with this value i.e., ssh jump name which is unique per
-# user (based on user hash)
-label_selector = f'skypilot-ssh-jump={current_name}'
-
-
-def poll(interval, leading=True):
-    """Decorator factory for polling function. To stop polling, return True.
-
-    Args:
-        interval (int): The amount of time to wait between function calls.
-        leading (bool): Whether to wait before (rather than after) calls.
-    """
-
-    def decorator(func):
-
-        def wrapper(*args, **kwargs):
-            while True:
-                if leading:
-                    time.sleep(interval)
-                done = func(*args, **kwargs)
-                if done:
-                    return
-                if not leading:
-                    time.sleep(interval)
-
-        return wrapper
-
-    return decorator
-
-
-# Flag to terminate the reload keys thread when the lifecycle thread
-# terminates.
-terminated = False
-
-
-@poll(interval=reload_interval, leading=False)
-def reload_keys():
-    """Reloads SSH keys from mounted secret volume."""
-
-    if terminated:
-        sys.stdout.write('[SSH Key Reloader] Terminated.\n')
-        return True
-
-    # Reload SSH keys from mounted secret volume if changed.
-    tmpfile = '/tmp/sky-ssh-keys'
-    try:
-        subprocess.check_output(
-            f'cat /etc/secret-volume/ssh-publickey* > {tmpfile}', shell=True)
-        try:
-            subprocess.check_output(f'diff {tmpfile} ~/.ssh/authorized_keys',
-                                    shell=True)
-            sys.stdout.write(
-                '[SSH Key Reloader] No keys changed, continuing.\n')
-        except subprocess.CalledProcessError as e:
-            if e.returncode == 1:
-                sys.stdout.write(
-                    '[SSH Key Reloader] Changes detected, reloading.\n')
-                subprocess.check_output(f'mv {tmpfile} ~/.ssh/authorized_keys',
-                                        shell=True)
-            else:
-                raise
-    except Exception as e:
-        sys.stdout.write(
-            f'[SSH Key Reloader][ERROR] Failed to reload SSH keys: {e}\n')
-        raise
-
-
-alert_delta = datetime.timedelta(seconds=alert_threshold)
-retry_interval_delta = datetime.timedelta(seconds=retry_interval)
-# Accumulated time of where no SkyPilot cluster exists. Compared
-# against alert_threshold.
-nocluster_delta = datetime.timedelta()
-
-
-@poll(interval=retry_interval)
-def manage_lifecycle():
-    """Manages lifecycle of ssh jump pod."""
-
-    global terminated, nocluster_delta
-
-    try:
-        ret = v1.list_namespaced_pod(current_namespace,
-                                     label_selector=label_selector)
-    except Exception as e:
-        sys.stdout.write('[Lifecycle] [ERROR] listing pods failed with '
-                         f'error: {e}\n')
-        raise
-
-    if not ret.items:
-        sys.stdout.write(
-            f'[Lifecycle] Did not find pods with label '
-            f'"{label_selector}" in namespace {current_namespace}\n')
-        nocluster_delta = nocluster_delta + retry_interval_delta
-        sys.stdout.write(
-            f'[Lifecycle] Time since no pods found: {nocluster_delta}, alert '
-            f'threshold: {alert_delta}\n')
-    else:
-        sys.stdout.write(
-            f'[Lifecycle] Found pods with label "{label_selector}" in '
-            f'namespace {current_namespace}\n')
-        # reset ..
-        nocluster_delta = datetime.timedelta()
-        sys.stdout.write(
-            f'[Lifecycle] nocluster_delta is reset: {nocluster_delta}\n')
-
-    if nocluster_delta >= alert_delta:
-        sys.stdout.write(
-            f'[Lifecycle] nocluster_delta: {nocluster_delta} crossed alert '
-            f'threshold: {alert_delta}. Time to terminate myself and my '
-            'service.\n')
-        try:
-            # ssh jump resources created under same name
-            v1.delete_namespaced_service(current_name, current_namespace)
-            v1.delete_namespaced_pod(current_name, current_namespace)
-        except Exception as e:
-            sys.stdout.write('[Lifecycle][ERROR] Deletion failed. Exiting '
-                             f'poll() with error: {e}\n')
-            raise
-
-        terminated = True
-        return True
-
-
-def main():
-    sys.stdout.write('SkyPilot SSH Jump Pod Lifecycle Manager\n')
-    sys.stdout.write(f'current_name: {current_name}\n')
-    sys.stdout.write(f'current_namespace: {current_namespace}\n')
-    sys.stdout.write(f'alert_threshold time: {alert_threshold}\n')
-    sys.stdout.write(f'retry_interval time: {retry_interval}\n')
-    sys.stdout.write(f'reload_interval time: {reload_interval}\n')
-    sys.stdout.write(f'label_selector: {label_selector}\n')
-
-    if not current_name or not current_namespace:
-        # Raise Exception with message to terminate pod
-        raise Exception('Missing environment variables MY_POD_NAME or '
-                        'MY_POD_NAMESPACE')
-
-    threads = [
-        threading.Thread(target=manage_lifecycle),
-        threading.Thread(target=reload_keys)
-    ]
-    sys.stdout.write(f'Polling with {len(threads)} threads.\n')
-    for t in threads:
-        t.start()
-    for t in threads:
-        t.join()
-    sys.stdout.write('Done.\n')
-
-
-if __name__ == '__main__':
-    main()
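With the defaults above (ALERT_THRESHOLD=600, RETRY_INTERVAL=60), the manager deletes its own service and pod after ten consecutive checks that find no matching ray pods, which is the "10 minutes" mentioned in the module docstring. A minimal sketch of that accounting, independent of Kubernetes:

import datetime

# Defaults from the deleted script above; both are env-overridable there.
alert_delta = datetime.timedelta(seconds=600)  # ALERT_THRESHOLD
retry_delta = datetime.timedelta(seconds=60)   # RETRY_INTERVAL
nocluster_delta = datetime.timedelta()

checks = 0
while nocluster_delta < alert_delta:
    nocluster_delta += retry_delta  # one polling iteration that found no pods
    checks += 1
print(checks)  # 10 -> the pod self-terminates on the 10th empty check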