skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/workspaces/server.py
ADDED
@@ -0,0 +1,101 @@
+"""REST API for workspace management."""
+
+import fastapi
+
+from sky.server.requests import executor
+from sky.server.requests import payloads
+from sky.server.requests import request_names
+from sky.server.requests import requests as api_requests
+from sky.workspaces import core
+
+router = fastapi.APIRouter()
+
+
+@router.get('')
+# pylint: disable=redefined-builtin
+async def get(request: fastapi.Request) -> None:
+    """Gets workspace config on the server."""
+    # Have to manually inject user info into the request body because the
+    # request body is not available in the GET endpoint.
+    auth_user = request.state.auth_user
+    auth_user_env_vars_kwargs = {
+        'env_vars': auth_user.to_env_vars()
+    } if auth_user else {}
+    request_body = payloads.RequestBody(**auth_user_env_vars_kwargs)
+
+    await executor.schedule_request_async(
+        request_id=request.state.request_id,
+        request_name=request_names.RequestName.WORKSPACES_GET,
+        request_body=request_body,
+        func=core.get_workspaces,
+        schedule_type=api_requests.ScheduleType.SHORT,
+    )
+
+
+@router.post('/update')
+async def update(request: fastapi.Request,
+                 update_workspace_body: payloads.UpdateWorkspaceBody) -> None:
+    """Updates a specific workspace configuration."""
+    await executor.schedule_request_async(
+        request_id=request.state.request_id,
+        request_name=request_names.RequestName.WORKSPACES_UPDATE,
+        request_body=update_workspace_body,
+        func=core.update_workspace,
+        schedule_type=api_requests.ScheduleType.SHORT,
+    )
+
+
+@router.post('/create')
+async def create(request: fastapi.Request,
+                 create_workspace_body: payloads.CreateWorkspaceBody) -> None:
+    """Creates a new workspace configuration."""
+    await executor.schedule_request_async(
+        request_id=request.state.request_id,
+        request_name=request_names.RequestName.WORKSPACES_CREATE,
+        request_body=create_workspace_body,
+        func=core.create_workspace,
+        schedule_type=api_requests.ScheduleType.SHORT,
+    )
+
+
+@router.post('/delete')
+async def delete(request: fastapi.Request,
+                 delete_workspace_body: payloads.DeleteWorkspaceBody) -> None:
+    """Deletes a workspace configuration."""
+    await executor.schedule_request_async(
+        request_id=request.state.request_id,
+        request_name=request_names.RequestName.WORKSPACES_DELETE,
+        request_body=delete_workspace_body,
+        func=core.delete_workspace,
+        schedule_type=api_requests.ScheduleType.SHORT,
+    )
+
+
+@router.get('/config')
+async def get_config(request: fastapi.Request) -> None:
+    """Gets the entire SkyPilot configuration."""
+    auth_user = request.state.auth_user
+    auth_user_env_vars_kwargs = {
+        'env_vars': auth_user.to_env_vars()
+    } if auth_user else {}
+    get_config_body = payloads.GetConfigBody(**auth_user_env_vars_kwargs)
+    await executor.schedule_request_async(
+        request_id=request.state.request_id,
+        request_name=request_names.RequestName.WORKSPACES_GET_CONFIG,
+        request_body=get_config_body,
+        func=core.get_config,
+        schedule_type=api_requests.ScheduleType.SHORT,
+    )
+
+
+@router.post('/config')
+async def update_config(request: fastapi.Request,
+                        update_config_body: payloads.UpdateConfigBody) -> None:
+    """Updates the entire SkyPilot configuration."""
+    await executor.schedule_request_async(
+        request_id=request.state.request_id,
+        request_name=request_names.RequestName.WORKSPACES_UPDATE_CONFIG,
+        request_body=update_config_body,
+        func=core.update_config,
+        schedule_type=api_requests.ScheduleType.SHORT,
+    )
sky/workspaces/utils.py
ADDED
@@ -0,0 +1,56 @@
+"""Utils for workspaces."""
+import collections
+from typing import Any, Dict, List
+
+from sky import global_user_state
+from sky import sky_logging
+
+logger = sky_logging.init_logger(__name__)
+
+
+def get_workspace_users(workspace_config: Dict[str, Any]) -> List[str]:
+    """Get the users that should have access to a workspace.
+
+    workspace_config is a dict with the following keys:
+    - private: bool
+    - allowed_users: list of user names or IDs
+
+    This function will automatically resolve the user names to IDs.
+
+    Args:
+        workspace_config: The configuration of the workspace.
+
+    Returns:
+        List of user IDs that should have access to the workspace.
+        For private workspaces, returns specific user IDs.
+        For public workspaces, returns ['*'] to indicate all users.
+    """
+    if workspace_config.get('private', False):
+        user_ids = []
+        workspace_user_name_or_ids = workspace_config.get('allowed_users', [])
+        all_users = global_user_state.get_all_users()
+        all_user_ids = {user.id for user in all_users}
+        all_user_map = collections.defaultdict(list)
+        for user in all_users:
+            all_user_map[user.name].append(user.id)
+
+        # Resolve user names to IDs
+        for user_name_or_id in workspace_user_name_or_ids:
+            if user_name_or_id in all_user_ids:
+                user_ids.append(user_name_or_id)
+            elif user_name_or_id in all_user_map:
+                if len(all_user_map[user_name_or_id]) > 1:
+                    user_ids_str = ', '.join(all_user_map[user_name_or_id])
+                    raise ValueError(
+                        f'User {user_name_or_id!r} has multiple IDs: '
+                        f'{user_ids_str}. Please specify the user '
+                        f'ID instead.')
+                user_ids.append(all_user_map[user_name_or_id][0])
+            else:
+                logger.warning(
+                    f'User {user_name_or_id!r} not found in all users')
+                continue
+        return user_ids
+    else:
+        # Public workspace - return '*' to indicate all users should have access
+        return ['*']
sky_templates/README.md
ADDED
|
File without changes

sky_templates/ray/start_cluster
ADDED

@@ -0,0 +1,183 @@
+#!/bin/bash
+# Starts a Ray cluster on a SkyPilot cluster.
+#
+# This script starts a Ray cluster using default Ray ports (6379, 8265),
+# which are different from SkyPilot's system Ray ports (6380, 8266).
+# This allows users to run their own Ray applications independently of
+# SkyPilot's internal Ray cluster.
+#
+# Environment Variables:
+#   RAY_HEAD_PORT=6379 - Ray head node port
+#   RAY_DASHBOARD_PORT=8265 - Ray dashboard port
+#   RAY_DASHBOARD_HOST=127.0.0.1 - Dashboard host (set to 0.0.0.0 to expose externally)
+#   RAY_DASHBOARD_AGENT_LISTEN_PORT= - (Optional) Dashboard agent listen port
+#   RAY_HEAD_IP_ADDRESS= - (Optional) Node IP address
+#   RAY_CMD=ray - (Optional) Command to invoke Ray (e.g., "uv run ray")
+#
+# Usage:
+#   ~/sky_templates/ray/start_cluster
+#
+#   # With custom configurations
+#   export RAY_DASHBOARD_HOST=0.0.0.0
+#   export RAY_DASHBOARD_PORT=8280
+#   ~/sky_templates/ray/start_cluster
+#
+#   # With uv
+#   export RAY_CMD="uv run ray"
+#   ~/sky_templates/ray/start_cluster
+
+set -e
+
+# Color codes for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+RAY_HEAD_PORT=${RAY_HEAD_PORT:-6379}
+RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265}
+RAY_DASHBOARD_HOST=${RAY_DASHBOARD_HOST:-127.0.0.1}
+RAY_DASHBOARD_AGENT_LISTEN_PORT=${RAY_DASHBOARD_AGENT_LISTEN_PORT:-}
+RAY_HEAD_IP_ADDRESS=${RAY_HEAD_IP_ADDRESS:-}
+
+RAY_CMD=${RAY_CMD:-ray}
+# Tokenize the command string into an array so multi-word commands
+# (e.g., "uv run ray") are handled safely when expanded later.
+eval "RAY_CMD_ARR=( ${RAY_CMD} )"
+
+# Convenience wrapper to invoke the configured Ray command with arbitrary args.
+run_ray() {
+    "${RAY_CMD_ARR[@]}" "$@"
+}
+
+echo -e "${GREEN}Starting Ray cluster...${NC}"
+
+# Ensure ray[default] is installed (we need [default] to do `ray list nodes`)
+# Pin to existing version if Ray is already installed to avoid upgrading existing version.
+RAY_VERSION=$(run_ray --version 2>/dev/null | cut -d' ' -f3 || echo "")
+if [ -n "${RAY_VERSION}" ]; then
+    # Pin to existing version.
+    VERSION_SPEC="==${RAY_VERSION}"
+else
+    echo -e "${YELLOW}Installing ray[default]...${NC}"
+    VERSION_SPEC=""
+fi
+
+# Pin click<8.3.0 to avoid incompatibility with Ray on Python 3.10
+# click 8.3.0 and 8.3.1 breaks Ray CLI due to deepcopy issues with sentinel values
+# See: https://github.com/ray-project/ray/issues/56747
+# TODO(kevin): Remove this once the issue is fixed in a future click release
+RAY_INSTALL_SPEC="ray[default]${VERSION_SPEC} click<8.3.0"
+uv pip install ${RAY_INSTALL_SPEC} || uv pip install --system ${RAY_INSTALL_SPEC}
+
+# Verify Ray is working
+if ! run_ray --version > /dev/null; then
+    echo -e "${RED}Error: Failed to install Ray.${NC}"
+    exit 1
+fi
+echo -e "${GREEN}Ray $(run_ray --version | cut -d' ' -f3) is installed.${NC}"
+
+RAY_ADDRESS="127.0.0.1:${RAY_HEAD_PORT}"
+if [ "${SKYPILOT_NODE_RANK}" -ne 0 ]; then
+    HEAD_IP=$(echo "${SKYPILOT_NODE_IPS}" | head -n1)
+    RAY_ADDRESS="${HEAD_IP}:${RAY_HEAD_PORT}"
+fi
+
+# Check if user-space Ray is already running
+if run_ray status --address="${RAY_ADDRESS}" &> /dev/null; then
+    echo -e "${YELLOW}Ray cluster is already running.${NC}"
+    run_ray status --address="${RAY_ADDRESS}"
+    exit 0
+fi
+
+TIMEOUT=300
+
+if [ "${SKYPILOT_NODE_RANK}" -eq 0 ]; then
+    echo -e "${GREEN}Starting Ray head node...${NC}"
+
+    RAY_START_CMD="start --head \
+        --port=${RAY_HEAD_PORT} \
+        --dashboard-port=${RAY_DASHBOARD_PORT} \
+        --dashboard-host=${RAY_DASHBOARD_HOST} \
+        --disable-usage-stats \
+        --include-dashboard=True"
+
+    # Add --num-gpus only if > 0
+    if [ "${SKYPILOT_NUM_GPUS_PER_NODE}" -gt 0 ]; then
+        RAY_START_CMD="${RAY_START_CMD} --num-gpus=${SKYPILOT_NUM_GPUS_PER_NODE}"
+    fi
+
+    # Add optional dashboard agent listen port if specified
+    if [ -n "${RAY_DASHBOARD_AGENT_LISTEN_PORT}" ]; then
+        RAY_START_CMD="${RAY_START_CMD} --dashboard-agent-listen-port=${RAY_DASHBOARD_AGENT_LISTEN_PORT}"
+    fi
+
+    # Add optional node IP address if specified
+    if [ -n "${RAY_HEAD_IP_ADDRESS}" ]; then
+        RAY_START_CMD="${RAY_START_CMD} --node-ip-address=${RAY_HEAD_IP_ADDRESS}"
+    fi
+
+    run_ray ${RAY_START_CMD}
+
+    start_time=$(date +%s)
+    while ! run_ray health-check --address="${RAY_ADDRESS}" &>/dev/null; do
+        if [ "$(( $(date +%s) - start_time ))" -ge "$TIMEOUT" ]; then
+            echo -e "${RED}Timed out waiting for head node. Exiting.${NC}" >&2
+            exit 1
+        fi
+        echo "Head node not healthy yet. Retrying in 1s..."
+        sleep 1
+    done
+
+    echo -e "${GREEN}Head node started successfully.${NC}"
+
+    # Wait for all worker nodes to join
+    if [ "${SKYPILOT_NUM_NODES}" -gt 1 ]; then
+        echo "Waiting for all ${SKYPILOT_NUM_NODES} nodes to join..."
+        start_time=$(date +%s)
+        while true; do
+            if [ "$(( $(date +%s) - start_time ))" -ge "${TIMEOUT}" ]; then
+                echo -e "${RED}Error: Timeout waiting for nodes.${NC}" >&2
+                exit 1
+            fi
+            ready_nodes=$(run_ray list nodes --format=json | python3 -c "import sys, json; print(len(json.load(sys.stdin)))")
+            if [ "${ready_nodes}" -ge "${SKYPILOT_NUM_NODES}" ]; then
+                break
+            fi
+            echo "Waiting... (${ready_nodes} / ${SKYPILOT_NUM_NODES} nodes ready)"
+            sleep 5
+        done
+        echo -e "${GREEN}All ${SKYPILOT_NUM_NODES} nodes have joined.${NC}"
+    fi
+
+    # Add sleep to after `ray start` to give ray enough time to daemonize
+    sleep 5
+else
+    echo -e "${GREEN}Starting Ray worker node...${NC}"
+
+    echo "Waiting for head node at ${RAY_ADDRESS}..."
+    start_time=$(date +%s)
+    while ! run_ray health-check --address="${RAY_ADDRESS}" &>/dev/null; do
+        if [ "$(( $(date +%s) - start_time ))" -ge "$TIMEOUT" ]; then
+            echo -e "${RED}Timed out waiting for head node. Exiting.${NC}" >&2
+            exit 1
+        fi
+        echo "Head node not healthy yet. Retrying in 1s..."
+        sleep 1
+    done
+
+    echo -e "${GREEN}Head node is healthy. Starting worker node...${NC}"
+    WORKER_CMD="start --address=${RAY_ADDRESS} --disable-usage-stats"
+
+    # Add --num-gpus only if > 0
+    if [ "${SKYPILOT_NUM_GPUS_PER_NODE}" -gt 0 ]; then
+        WORKER_CMD="${WORKER_CMD} --num-gpus=${SKYPILOT_NUM_GPUS_PER_NODE}"
+    fi
+
+    run_ray ${WORKER_CMD}
+
+    echo -e "${GREEN}Worker node started successfully.${NC}"
+
+    # Add sleep to after `ray start` to give ray enough time to daemonize
+    sleep 5
+fi
sky_templates/ray/stop_cluster
ADDED

@@ -0,0 +1,75 @@
+#!/bin/bash
+# Stops a user Ray cluster on a SkyPilot cluster.
+#
+# This script stops a Ray cluster running on custom ports (default 6379),
+# which is separate from SkyPilot's internal Ray cluster (port 6380).
+#
+# IMPORTANT: This script uses pkill to stop Ray processes, NOT 'ray stop',
+# as 'ray stop' can interfere with SkyPilot's internal operations.
+#
+# Environment Variables:
+#   RAY_HEAD_PORT=6379 - Ray head node port to stop
+#   RAY_CMD=ray - (Optional) Command to invoke Ray (e.g., "uv run ray")
+#
+# Usage:
+#   # Stop default Ray cluster (port 6379)
+#   ~/sky_templates/ray/stop_ray_cluster.sh
+#
+#   # Stop Ray cluster on custom port
+#   export RAY_HEAD_PORT=6385
+#   ~/sky_templates/ray/stop_ray_cluster.sh
+#
+#   # With uv
+#   export RAY_CMD="uv run ray"
+#   ~/sky_templates/ray/stop_ray_cluster.sh
+
+set -e
+
+# Color codes for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+RAY_HEAD_PORT=${RAY_HEAD_PORT:-6379}
+RAY_CMD=${RAY_CMD:-ray}
+# Tokenize the command string into an array so multi-word commands (e.g., "uv run ray")
+# are handled safely when expanded later.
+eval "RAY_CMD_ARR=( ${RAY_CMD} )"
+
+run_ray() {
+    "${RAY_CMD_ARR[@]}" "$@"
+}
+
+echo -e "${GREEN}Stopping Ray cluster on port ${RAY_HEAD_PORT}...${NC}"
+
+RAY_ADDRESS="127.0.0.1:${RAY_HEAD_PORT}"
+if [ "$SKYPILOT_NODE_RANK" -ne 0 ]; then
+    HEAD_IP=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
+    RAY_ADDRESS="${HEAD_IP}:${RAY_HEAD_PORT}"
+fi
+
+# Check if Ray is running
+if ! run_ray status --address="${RAY_ADDRESS}" &> /dev/null; then
+    echo -e "${YELLOW}No Ray cluster found running on port ${RAY_HEAD_PORT}.${NC}"
+    exit 0
+fi
+
+# Use pkill to stop Ray processes instead of 'ray stop'
+# This prevents interfering with SkyPilot's internal Ray cluster (port 6380)
+echo -e "${YELLOW}Killing Ray processes on port ${RAY_HEAD_PORT}...${NC}"
+
+pkill -f "ray.*[=:]${RAY_HEAD_PORT}" || true
+
+echo -e "${GREEN}Ray processes killed.${NC}"
+# Wait a moment for processes to terminate
+sleep 5
+
+# Verify Ray is stopped
+if run_ray status --address="${RAY_ADDRESS}" &> /dev/null; then
+    echo -e "${RED}Warning: Ray cluster may still be running. Try manually:${NC}"
+    echo -e "${RED}  pkill -9 -f 'ray.*[=:]${RAY_HEAD_PORT}'${NC}"
+    exit 1
+else
+    echo -e "${GREEN}Ray cluster successfully stopped.${NC}"
+fi
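Taken together with the new sky_templates entries in the file list above, these scripts are intended to be invoked from a task's run commands on the cluster nodes, with the head/worker split handled by SKYPILOT_NODE_RANK inside the scripts. The sketch below is a hedged illustration of that workflow using the SkyPilot Python SDK; the cluster name, the environment override, and the assumption that the templates are present at ~/sky_templates on every node are not taken from this diff.

# Hypothetical usage sketch, not part of the package diff above.
import sky

# Start a user-space Ray cluster (ports 6379/8265) on a 2-node SkyPilot
# cluster; start_cluster branches on SKYPILOT_NODE_RANK for head vs. workers.
start_task = sky.Task(
    name='user-ray-cluster',
    num_nodes=2,
    run='RAY_DASHBOARD_HOST=0.0.0.0 ~/sky_templates/ray/start_cluster',
)
sky.launch(start_task, cluster_name='my-ray')

# Later, stop only the user Ray cluster; SkyPilot's internal Ray (port 6380)
# is left untouched because stop_cluster uses pkill scoped to the user port.
stop_task = sky.Task(num_nodes=2, run='~/sky_templates/ray/stop_cluster')
sky.exec(stop_task, cluster_name='my-ray')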
|