skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/volumes/volume.py
CHANGED
@@ -13,6 +13,10 @@ VOLUME_TYPE_TO_CLOUD = {
     volume_lib.VolumeType.PVC: clouds.Kubernetes(),
     volume_lib.VolumeType.RUNPOD_NETWORK_VOLUME: clouds.RunPod(),
 }
+CLOUD_TO_VOLUME_TYPE = {
+    clouds.Kubernetes(): [volume_lib.VolumeType.PVC],
+    clouds.RunPod(): [volume_lib.VolumeType.RUNPOD_NETWORK_VOLUME],
+}
 
 
 class Volume:
@@ -25,7 +29,7 @@ class Volume:
                  infra: Optional[str] = None,
                  size: Optional[str] = None,
                  labels: Optional[Dict[str, str]] = None,
-
+                 use_existing: Optional[bool] = None,
                  config: Optional[Dict[str, Any]] = None):
         """Initialize a Volume instance.
 
@@ -35,6 +39,7 @@ class Volume:
             infra: Infrastructure specification
             size: Volume size
             labels: Volume labels
+            use_existing: Whether to use an existing volume
             config: Additional configuration
         """
         self.name = name
@@ -42,7 +47,7 @@ class Volume:
         self.infra = infra
         self.size = size
         self.labels = labels or {}
-        self.
+        self.use_existing = use_existing
         self.config = config or {}
 
         self.cloud: Optional[str] = None
@@ -70,17 +75,16 @@ class Volume:
                           infra=config.get('infra'),
                           size=config.get('size'),
                           labels=config.get('labels'),
-
+                          use_existing=config.get('use_existing'),
                           config=config.get('config', {}))
         if vt == volume_lib.VolumeType.RUNPOD_NETWORK_VOLUME:
-            return RunpodNetworkVolume(
-
-
-
-
-
-
-                config=config.get('config', {}))
+            return RunpodNetworkVolume(name=config.get('name'),
+                                       type=vol_type_val,
+                                       infra=config.get('infra'),
+                                       size=config.get('size'),
+                                       labels=config.get('labels'),
+                                       use_existing=config.get('use_existing'),
+                                       config=config.get('config', {}))
 
         raise ValueError(f'Invalid volume type: {vol_type_val}')
 
@@ -92,7 +96,7 @@ class Volume:
             'infra': self.infra,
             'size': self.size,
             'labels': self.labels,
-            '
+            'use_existing': self.use_existing,
             'config': self.config,
             'cloud': self.cloud,
             'region': self.region,
@@ -100,7 +104,7 @@ class Volume:
         }
 
     def _normalize_config(self) -> None:
-        """
+        """Normalize and validate the config."""
         # Validate schema
         common_utils.validate_schema(self.to_yaml_config(),
                                      schemas.get_volume_schema(),
@@ -115,8 +119,17 @@ class Volume:
         self.region = infra_info.region
         self.zone = infra_info.zone
 
-        #
-
+        # Set cloud from volume type if not specified
+        cloud_obj_from_type = VOLUME_TYPE_TO_CLOUD.get(
+            volume_lib.VolumeType(self.type))
+        if self.cloud:
+            cloud_obj = registry.CLOUD_REGISTRY.from_str(self.cloud)
+            assert cloud_obj is not None
+            if not cloud_obj.is_same_cloud(cloud_obj_from_type):
+                raise ValueError(
+                    f'Invalid cloud {self.cloud} for volume type {self.type}')
+        else:
+            self.cloud = str(cloud_obj_from_type)
 
     def _adjust_config(self) -> None:
         """Adjust the volume config (e.g., parse size)."""
@@ -132,41 +145,41 @@ class Volume:
         except ValueError as e:
             raise ValueError(f'Invalid size {self.size}: {e}') from e
 
-    def 
-        """
-
-
-        if 
-
-
-
-                raise ValueError(
-                    f'Invalid cloud {self.cloud} for volume type {self.type}')
-            else:
-                self.cloud = str(cloud_obj_from_type)
-            cloud_obj = cloud_obj_from_type
-        assert cloud_obj is not None
+    def validate(self, skip_cloud_compatibility: bool = False) -> None:
+        """Validates the volume."""
+        self.validate_name()
+        self.validate_size()
+        if not skip_cloud_compatibility:
+            self.validate_cloud_compatibility()
+        # Extra, type-specific validations
+        self._validate_config_extra()
 
-
-
+    def validate_name(self) -> None:
+        """Validates if the volume name is set."""
+        assert self.name is not None, 'Volume name must be set'
+
+    def validate_size(self) -> None:
+        """Validates that size is specified for new volumes."""
+        if not self.use_existing and not self.size:
+            raise ValueError('Size is required for new volumes. '
+                             'Please specify the size in the YAML file or '
+                             'use the --size flag.')
+
+    def validate_cloud_compatibility(self) -> None:
+        """Validates region, zone, name, labels with the cloud."""
+        cloud_obj = registry.CLOUD_REGISTRY.from_str(self.cloud)
+        assert cloud_obj is not None
 
         valid, err_msg = cloud_obj.is_volume_name_valid(self.name)
         if not valid:
             raise ValueError(f'Invalid volume name: {err_msg}')
 
-        if not self.resource_name and not self.size:
-            raise ValueError('Size is required for new volumes. '
-                             'Please specify the size in the YAML file or '
-                             'use the --size flag.')
         if self.labels:
             for key, value in self.labels.items():
                 valid, err_msg = cloud_obj.is_label_valid(key, value)
                 if not valid:
                     raise ValueError(f'{err_msg}')
 
-        # Extra, type-specific validations
-        self._validate_config_extra()
-
     # Hook methods for subclasses
     def _validate_config_extra(self) -> None:
         """Additional type-specific validation.
@@ -185,7 +198,7 @@ class RunpodNetworkVolume(Volume):
     """RunPod Network Volume."""
 
     def _validate_config_extra(self) -> None:
-        if self.size is not None:
+        if not self.use_existing and self.size is not None:
             try:
                 size_int = int(self.size)
                 if size_int < volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB:
@@ -196,8 +209,7 @@ class RunpodNetworkVolume(Volume):
                 raise ValueError(f'Invalid volume size {self.size!r}: '
                                  f'{e}') from e
             if not self.zone:
-                raise ValueError(
-
-                    'volume. Set the zone in the infra field.')
+                raise ValueError('RunPod DataCenterId is required for network '
+                                 'volumes. Set the zone in the infra field.')
 
             return
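The new `use_existing` flag above changes when `size` is mandatory: it is only required when a new volume is being created. A minimal, self-contained sketch of that rule follows (a hypothetical class for illustration, not SkyPilot's actual `sky.volumes` API):

```python
# Sketch of the validation split introduced above: size is only mandatory
# when a new volume is created, i.e. when use_existing is falsy.
from typing import Optional


class VolumeSketch:
    def __init__(self, name: str, size: Optional[str] = None,
                 use_existing: Optional[bool] = None):
        self.name = name
        self.size = size
        self.use_existing = use_existing

    def validate_name(self) -> None:
        assert self.name, 'Volume name must be set'

    def validate_size(self) -> None:
        # Mirrors the new rule: an existing volume may omit its size.
        if not self.use_existing and not self.size:
            raise ValueError('Size is required for new volumes.')

    def validate(self, skip_cloud_compatibility: bool = False) -> None:
        self.validate_name()
        self.validate_size()
        # Cloud-specific name/label checks would run here unless skipped.


VolumeSketch('data', use_existing=True).validate()  # OK: reuses an existing volume
VolumeSketch('scratch', size='100').validate()      # OK: new volume with a size
# VolumeSketch('bad').validate()                    # would raise ValueError
```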
sky/workspaces/core.py
CHANGED
@@ -14,6 +14,7 @@ from sky.backends import backend_utils
 from sky.skylet import constants
 from sky.usage import usage_lib
 from sky.users import permission
+from sky.users import rbac
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import config_utils
@@ -147,11 +148,15 @@ def _compare_workspace_configs(
     private_new = new_config.get('private', False)
     private_changed = private_old != private_new
 
+    admin_user_ids = permission.permission_service.get_users_for_role(
+        rbac.RoleName.ADMIN.value)
     # Get allowed users (resolve to user IDs for comparison)
     allowed_users_old = workspaces_utils.get_workspace_users(
         current_config) if private_old else []
+    allowed_users_old += admin_user_ids
     allowed_users_new = workspaces_utils.get_workspace_users(
         new_config) if private_new else []
+    allowed_users_new += admin_user_ids
 
     # Convert to sets for easier comparison
     old_users_set = set(allowed_users_old)
@@ -188,6 +193,24 @@ def _compare_workspace_configs(
         added_users=added_users)
 
 
+def _validate_workspace_config_changes_with_lock(
+        workspace_name: str, current_config: Dict[str, Any],
+        new_config: Dict[str, Any]) -> None:
+    lock_id = backend_utils.workspace_lock_id(workspace_name)
+    lock_timeout = backend_utils.WORKSPACE_LOCK_TIMEOUT_SECONDS
+    try:
+        with locks.get_lock(lock_id, lock_timeout):
+            # Validate the configuration changes based on active resources
+            _validate_workspace_config_changes(workspace_name, current_config,
+                                               new_config)
+    except locks.LockTimeout as e:
+        raise RuntimeError(
+            f'Failed to validate workspace {workspace_name!r} due to '
+            'a timeout when trying to access database. Please '
+            f'try again or manually remove the lock at {lock_id}. '
+            f'{common_utils.format_exception(e)}') from None
+
+
 def _validate_workspace_config_changes(workspace_name: str,
                                        current_config: Dict[str, Any],
                                        new_config: Dict[str, Any]) -> None:
@@ -232,7 +255,7 @@ def _validate_workspace_config_changes(workspace_name: str,
                     f' private. Checking that all active resources belong'
                     f' to allowed users.')
 
-        error_summary, missed_users_names = (
+        error_summary, missed_users_names, _ = (
             resource_checker.check_users_workspaces_active_resources(
                 config_comparison.allowed_users_new, [workspace_name]))
         if error_summary:
@@ -259,11 +282,35 @@ def _validate_workspace_config_changes(workspace_name: str,
                     f'Checking that removed users'
                     f' {config_comparison.removed_users} do not have'
                     f' active resources in workspace {workspace_name!r}.')
-
-
-
-
-
+        error_summary, missed_users_names, missed_user_dict = (
+            resource_checker.check_users_workspaces_active_resources(
+                config_comparison.allowed_users_new, [workspace_name]))
+        if error_summary:
+            error_user_ids = []
+            for user_id in config_comparison.removed_users:
+                if user_id in missed_user_dict:
+                    error_user_ids.append(user_id)
+            error_user_names = []
+            if error_user_ids:
+                error_user_names = [
+                    missed_user_dict[user_id]
+                    for user_id in error_user_ids
+                ]
+
+            error_msg = 'Cannot '
+            error_users_list = ', '.join(error_user_names)
+            if len(error_user_names) == 1:
+                error_msg += f'remove user {error_users_list!r} ' \
+                    f'from workspace {workspace_name!r} because the ' \
+                    f'user has {error_summary}'
+            else:
+                error_msg += f'remove users {error_users_list!r}' \
+                    f' from workspace {workspace_name!r} because the' \
+                    f' users have {error_summary}'
+            error_msg += ', but not in the allowed_users list.' \
+                ' Please either add the users to allowed_users or' \
+                ' ask them to terminate their resources.'
+            raise ValueError(error_msg)
     else:
         # Other configuration changes - check that workspace has no active
         # resources
@@ -310,20 +357,8 @@ def update_workspace(workspace_name: str, config: Dict[str,
                                                       default_value={})
     current_config = current_workspaces.get(workspace_name, {})
 
-
-
-    lock_timeout = backend_utils.WORKSPACE_LOCK_TIMEOUT_SECONDS
-    try:
-        with locks.get_lock(lock_id, lock_timeout):
-            # Validate the configuration changes based on active resources
-            _validate_workspace_config_changes(workspace_name,
-                                               current_config, config)
-    except locks.LockTimeout as e:
-        raise RuntimeError(
-            f'Failed to validate workspace {workspace_name!r} due to '
-            'a timeout when trying to access database. Please '
-            f'try again or manually remove the lock at {lock_id}. '
-            f'{common_utils.format_exception(e)}') from None
+    _validate_workspace_config_changes_with_lock(workspace_name, current_config,
+                                                 config)
 
     def update_workspace_fn(workspaces: Dict[str, Any]) -> None:
         """Function to update workspace inside the lock."""
@@ -510,7 +545,8 @@ def update_config(config: Dict[str, Any]) -> Dict[str, Any]:
         # If workspace configuration is changing, validate and mark for checking
         if current_workspace_config != new_workspace_config:
             _validate_workspace_config(workspace_name, new_workspace_config)
-
+            _validate_workspace_config_changes_with_lock(
+                workspace_name, current_workspace_config, new_workspace_config)
             users = workspaces_utils.get_workspace_users(new_workspace_config)
             workspaces_to_check_policy['update'][workspace_name] = users
 
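The refactor above factors the acquire-lock/validate/map-timeout-to-error sequence into `_validate_workspace_config_changes_with_lock`, so both `update_workspace` and `update_config` share it. A minimal, self-contained sketch of that pattern with an in-process lock (a hypothetical helper for illustration, not SkyPilot's `locks` module):

```python
# Sketch: run a per-workspace validation callback under a lock and convert a
# lock timeout into an actionable error for the caller.
import threading
from typing import Callable, Dict

_WORKSPACE_LOCKS: Dict[str, threading.Lock] = {}


def validate_with_lock(workspace: str, validate: Callable[[], None],
                       timeout: float = 10.0) -> None:
    lock = _WORKSPACE_LOCKS.setdefault(workspace, threading.Lock())
    if not lock.acquire(timeout=timeout):
        raise RuntimeError(
            f'Failed to validate workspace {workspace!r}: timed out waiting '
            'for the workspace lock. Please try again.')
    try:
        validate()
    finally:
        lock.release()


# Runs the (no-op) validation while holding the workspace lock.
validate_with_lock('default', lambda: None)
```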
sky/workspaces/server.py
CHANGED
@@ -4,6 +4,7 @@ import fastapi
 
 from sky.server.requests import executor
 from sky.server.requests import payloads
+from sky.server.requests import request_names
 from sky.server.requests import requests as api_requests
 from sky.workspaces import core
 
@@ -22,9 +23,9 @@ async def get(request: fastapi.Request) -> None:
     } if auth_user else {}
     request_body = payloads.RequestBody(**auth_user_env_vars_kwargs)
 
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.WORKSPACES_GET,
         request_body=request_body,
         func=core.get_workspaces,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -35,9 +36,9 @@ async def get(request: fastapi.Request) -> None:
 async def update(request: fastapi.Request,
                  update_workspace_body: payloads.UpdateWorkspaceBody) -> None:
     """Updates a specific workspace configuration."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.WORKSPACES_UPDATE,
         request_body=update_workspace_body,
         func=core.update_workspace,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -48,9 +49,9 @@ async def update(request: fastapi.Request,
 async def create(request: fastapi.Request,
                  create_workspace_body: payloads.CreateWorkspaceBody) -> None:
     """Creates a new workspace configuration."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.WORKSPACES_CREATE,
         request_body=create_workspace_body,
         func=core.create_workspace,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -61,9 +62,9 @@ async def create(request: fastapi.Request,
 async def delete(request: fastapi.Request,
                  delete_workspace_body: payloads.DeleteWorkspaceBody) -> None:
     """Deletes a workspace configuration."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.WORKSPACES_DELETE,
         request_body=delete_workspace_body,
         func=core.delete_workspace,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -78,9 +79,9 @@ async def get_config(request: fastapi.Request) -> None:
         'env_vars': auth_user.to_env_vars()
     } if auth_user else {}
     get_config_body = payloads.GetConfigBody(**auth_user_env_vars_kwargs)
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.WORKSPACES_GET_CONFIG,
         request_body=get_config_body,
         func=core.get_config,
         schedule_type=api_requests.ScheduleType.SHORT,
@@ -91,9 +92,9 @@ async def get_config(request: fastapi.Request) -> None:
 async def update_config(request: fastapi.Request,
                         update_config_body: payloads.UpdateConfigBody) -> None:
     """Updates the entire SkyPilot configuration."""
-    executor.
+    await executor.schedule_request_async(
         request_id=request.state.request_id,
-        request_name=
+        request_name=request_names.RequestName.WORKSPACES_UPDATE_CONFIG,
         request_body=update_config_body,
         func=core.update_config,
         schedule_type=api_requests.ScheduleType.SHORT,
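Every workspace endpoint now awaits the executor's async scheduling entry point and passes an enum member from the new sky/server/requests/request_names.py instead of a hand-written string (the old call name is truncated in the rendered diff). A minimal, self-contained sketch of that handler shape (hypothetical scheduler and enum values for illustration, not SkyPilot's executor API):

```python
# Sketch: an async endpoint hands work to a scheduler coroutine and identifies
# the request with an enum member rather than a bare string.
import asyncio
import enum
from typing import Any, Callable


class RequestName(enum.Enum):
    WORKSPACES_GET = 'workspaces.get'
    WORKSPACES_UPDATE = 'workspaces.update'


async def schedule_request_async(request_id: str, request_name: RequestName,
                                 func: Callable[[], Any]) -> None:
    # Stand-in for handing the work to a request executor.
    print(f'[{request_id}] scheduling {request_name.value}')
    await asyncio.to_thread(func)


async def get_workspaces_endpoint(request_id: str) -> None:
    await schedule_request_async(request_id,
                                 RequestName.WORKSPACES_GET,
                                 func=lambda: {'default': {}})


asyncio.run(get_workspaces_endpoint('req-1'))
```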
sky_templates/README.md
ADDED
File without changes

sky_templates/ray/start_cluster
ADDED
@@ -0,0 +1,183 @@
+#!/bin/bash
+# Starts a Ray cluster on a SkyPilot cluster.
+#
+# This script starts a Ray cluster using default Ray ports (6379, 8265),
+# which are different from SkyPilot's system Ray ports (6380, 8266).
+# This allows users to run their own Ray applications independently of
+# SkyPilot's internal Ray cluster.
+#
+# Environment Variables:
+#   RAY_HEAD_PORT=6379 - Ray head node port
+#   RAY_DASHBOARD_PORT=8265 - Ray dashboard port
+#   RAY_DASHBOARD_HOST=127.0.0.1 - Dashboard host (set to 0.0.0.0 to expose externally)
+#   RAY_DASHBOARD_AGENT_LISTEN_PORT= - (Optional) Dashboard agent listen port
+#   RAY_HEAD_IP_ADDRESS= - (Optional) Node IP address
+#   RAY_CMD=ray - (Optional) Command to invoke Ray (e.g., "uv run ray")
+#
+# Usage:
+#   ~/sky_templates/ray/start_cluster
+#
+#   # With custom configurations
+#   export RAY_DASHBOARD_HOST=0.0.0.0
+#   export RAY_DASHBOARD_PORT=8280
+#   ~/sky_templates/ray/start_cluster
+#
+#   # With uv
+#   export RAY_CMD="uv run ray"
+#   ~/sky_templates/ray/start_cluster
+
+set -e
+
+# Color codes for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+RAY_HEAD_PORT=${RAY_HEAD_PORT:-6379}
+RAY_DASHBOARD_PORT=${RAY_DASHBOARD_PORT:-8265}
+RAY_DASHBOARD_HOST=${RAY_DASHBOARD_HOST:-127.0.0.1}
+RAY_DASHBOARD_AGENT_LISTEN_PORT=${RAY_DASHBOARD_AGENT_LISTEN_PORT:-}
+RAY_HEAD_IP_ADDRESS=${RAY_HEAD_IP_ADDRESS:-}
+
+RAY_CMD=${RAY_CMD:-ray}
+# Tokenize the command string into an array so multi-word commands
+# (e.g., "uv run ray") are handled safely when expanded later.
+eval "RAY_CMD_ARR=( ${RAY_CMD} )"
+
+# Convenience wrapper to invoke the configured Ray command with arbitrary args.
+run_ray() {
+    "${RAY_CMD_ARR[@]}" "$@"
+}
+
+echo -e "${GREEN}Starting Ray cluster...${NC}"
+
+# Ensure ray[default] is installed (we need [default] to do `ray list nodes`)
+# Pin to existing version if Ray is already installed to avoid upgrading existing version.
+RAY_VERSION=$(run_ray --version 2>/dev/null | cut -d' ' -f3 || echo "")
+if [ -n "${RAY_VERSION}" ]; then
+    # Pin to existing version.
+    VERSION_SPEC="==${RAY_VERSION}"
+else
+    echo -e "${YELLOW}Installing ray[default]...${NC}"
+    VERSION_SPEC=""
+fi
+
+# Pin click<8.3.0 to avoid incompatibility with Ray on Python 3.10
+# click 8.3.0 and 8.3.1 breaks Ray CLI due to deepcopy issues with sentinel values
+# See: https://github.com/ray-project/ray/issues/56747
+# TODO(kevin): Remove this once the issue is fixed in a future click release
+RAY_INSTALL_SPEC="ray[default]${VERSION_SPEC} click<8.3.0"
+uv pip install ${RAY_INSTALL_SPEC} || uv pip install --system ${RAY_INSTALL_SPEC}
+
+# Verify Ray is working
+if ! run_ray --version > /dev/null; then
+    echo -e "${RED}Error: Failed to install Ray.${NC}"
+    exit 1
+fi
+echo -e "${GREEN}Ray $(run_ray --version | cut -d' ' -f3) is installed.${NC}"
+
+RAY_ADDRESS="127.0.0.1:${RAY_HEAD_PORT}"
+if [ "${SKYPILOT_NODE_RANK}" -ne 0 ]; then
+    HEAD_IP=$(echo "${SKYPILOT_NODE_IPS}" | head -n1)
+    RAY_ADDRESS="${HEAD_IP}:${RAY_HEAD_PORT}"
+fi
+
+# Check if user-space Ray is already running
+if run_ray status --address="${RAY_ADDRESS}" &> /dev/null; then
+    echo -e "${YELLOW}Ray cluster is already running.${NC}"
+    run_ray status --address="${RAY_ADDRESS}"
+    exit 0
+fi
+
+TIMEOUT=300
+
+if [ "${SKYPILOT_NODE_RANK}" -eq 0 ]; then
+    echo -e "${GREEN}Starting Ray head node...${NC}"
+
+    RAY_START_CMD="start --head \
+        --port=${RAY_HEAD_PORT} \
+        --dashboard-port=${RAY_DASHBOARD_PORT} \
+        --dashboard-host=${RAY_DASHBOARD_HOST} \
+        --disable-usage-stats \
+        --include-dashboard=True"
+
+    # Add --num-gpus only if > 0
+    if [ "${SKYPILOT_NUM_GPUS_PER_NODE}" -gt 0 ]; then
+        RAY_START_CMD="${RAY_START_CMD} --num-gpus=${SKYPILOT_NUM_GPUS_PER_NODE}"
+    fi
+
+    # Add optional dashboard agent listen port if specified
+    if [ -n "${RAY_DASHBOARD_AGENT_LISTEN_PORT}" ]; then
+        RAY_START_CMD="${RAY_START_CMD} --dashboard-agent-listen-port=${RAY_DASHBOARD_AGENT_LISTEN_PORT}"
+    fi
+
+    # Add optional node IP address if specified
+    if [ -n "${RAY_HEAD_IP_ADDRESS}" ]; then
+        RAY_START_CMD="${RAY_START_CMD} --node-ip-address=${RAY_HEAD_IP_ADDRESS}"
+    fi
+
+    run_ray ${RAY_START_CMD}
+
+    start_time=$(date +%s)
+    while ! run_ray health-check --address="${RAY_ADDRESS}" &>/dev/null; do
+        if [ "$(( $(date +%s) - start_time ))" -ge "$TIMEOUT" ]; then
+            echo -e "${RED}Timed out waiting for head node. Exiting.${NC}" >&2
+            exit 1
+        fi
+        echo "Head node not healthy yet. Retrying in 1s..."
+        sleep 1
+    done
+
+    echo -e "${GREEN}Head node started successfully.${NC}"
+
+    # Wait for all worker nodes to join
+    if [ "${SKYPILOT_NUM_NODES}" -gt 1 ]; then
+        echo "Waiting for all ${SKYPILOT_NUM_NODES} nodes to join..."
+        start_time=$(date +%s)
+        while true; do
+            if [ "$(( $(date +%s) - start_time ))" -ge "${TIMEOUT}" ]; then
+                echo -e "${RED}Error: Timeout waiting for nodes.${NC}" >&2
+                exit 1
+            fi
+            ready_nodes=$(run_ray list nodes --format=json | python3 -c "import sys, json; print(len(json.load(sys.stdin)))")
+            if [ "${ready_nodes}" -ge "${SKYPILOT_NUM_NODES}" ]; then
+                break
+            fi
+            echo "Waiting... (${ready_nodes} / ${SKYPILOT_NUM_NODES} nodes ready)"
+            sleep 5
+        done
+        echo -e "${GREEN}All ${SKYPILOT_NUM_NODES} nodes have joined.${NC}"
+    fi
+
+    # Add sleep to after `ray start` to give ray enough time to daemonize
+    sleep 5
+else
+    echo -e "${GREEN}Starting Ray worker node...${NC}"
+
+    echo "Waiting for head node at ${RAY_ADDRESS}..."
+    start_time=$(date +%s)
+    while ! run_ray health-check --address="${RAY_ADDRESS}" &>/dev/null; do
+        if [ "$(( $(date +%s) - start_time ))" -ge "$TIMEOUT" ]; then
+            echo -e "${RED}Timed out waiting for head node. Exiting.${NC}" >&2
+            exit 1
+        fi
+        echo "Head node not healthy yet. Retrying in 1s..."
+        sleep 1
+    done
+
+    echo -e "${GREEN}Head node is healthy. Starting worker node...${NC}"
+    WORKER_CMD="start --address=${RAY_ADDRESS} --disable-usage-stats"
+
+    # Add --num-gpus only if > 0
+    if [ "${SKYPILOT_NUM_GPUS_PER_NODE}" -gt 0 ]; then
+        WORKER_CMD="${WORKER_CMD} --num-gpus=${SKYPILOT_NUM_GPUS_PER_NODE}"
+    fi
+
+    run_ray ${WORKER_CMD}
+
+    echo -e "${GREEN}Worker node started successfully.${NC}"
+
+    # Add sleep to after `ray start` to give ray enough time to daemonize
+    sleep 5
+fi
sky_templates/ray/stop_cluster
ADDED
@@ -0,0 +1,75 @@
+#!/bin/bash
+# Stops a user Ray cluster on a SkyPilot cluster.
+#
+# This script stops a Ray cluster running on custom ports (default 6379),
+# which is separate from SkyPilot's internal Ray cluster (port 6380).
+#
+# IMPORTANT: This script uses pkill to stop Ray processes, NOT 'ray stop',
+# as 'ray stop' can interfere with SkyPilot's internal operations.
+#
+# Environment Variables:
+#   RAY_HEAD_PORT=6379 - Ray head node port to stop
+#   RAY_CMD=ray - (Optional) Command to invoke Ray (e.g., "uv run ray")
+#
+# Usage:
+#   # Stop default Ray cluster (port 6379)
+#   ~/sky_templates/ray/stop_ray_cluster.sh
+#
+#   # Stop Ray cluster on custom port
+#   export RAY_HEAD_PORT=6385
+#   ~/sky_templates/ray/stop_ray_cluster.sh
+#
+#   # With uv
+#   export RAY_CMD="uv run ray"
+#   ~/sky_templates/ray/stop_ray_cluster.sh
+
+set -e
+
+# Color codes for output
+RED='\033[0;31m'
+GREEN='\033[0;32m'
+YELLOW='\033[1;33m'
+NC='\033[0m' # No Color
+
+RAY_HEAD_PORT=${RAY_HEAD_PORT:-6379}
+RAY_CMD=${RAY_CMD:-ray}
+# Tokenize the command string into an array so multi-word commands (e.g., "uv run ray")
+# are handled safely when expanded later.
+eval "RAY_CMD_ARR=( ${RAY_CMD} )"
+
+run_ray() {
+    "${RAY_CMD_ARR[@]}" "$@"
+}
+
+echo -e "${GREEN}Stopping Ray cluster on port ${RAY_HEAD_PORT}...${NC}"
+
+RAY_ADDRESS="127.0.0.1:${RAY_HEAD_PORT}"
+if [ "$SKYPILOT_NODE_RANK" -ne 0 ]; then
+    HEAD_IP=$(echo "$SKYPILOT_NODE_IPS" | head -n1)
+    RAY_ADDRESS="${HEAD_IP}:${RAY_HEAD_PORT}"
+fi
+
+# Check if Ray is running
+if ! run_ray status --address="${RAY_ADDRESS}" &> /dev/null; then
+    echo -e "${YELLOW}No Ray cluster found running on port ${RAY_HEAD_PORT}.${NC}"
+    exit 0
+fi
+
+# Use pkill to stop Ray processes instead of 'ray stop'
+# This prevents interfering with SkyPilot's internal Ray cluster (port 6380)
+echo -e "${YELLOW}Killing Ray processes on port ${RAY_HEAD_PORT}...${NC}"
+
+pkill -f "ray.*[=:]${RAY_HEAD_PORT}" || true
+
+echo -e "${GREEN}Ray processes killed.${NC}"
+# Wait a moment for processes to terminate
+sleep 5
+
+# Verify Ray is stopped
+if run_ray status --address="${RAY_ADDRESS}" &> /dev/null; then
+    echo -e "${RED}Warning: Ray cluster may still be running. Try manually:${NC}"
+    echo -e "${RED}  pkill -9 -f 'ray.*[=:]${RAY_HEAD_PORT}'${NC}"
+    exit 1
+else
+    echo -e "${GREEN}Ray cluster successfully stopped.${NC}"
+fi