skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
"""Volume management core."""
|
|
2
|
+
|
|
3
|
+
import contextlib
|
|
4
|
+
import os
|
|
5
|
+
from typing import Any, Dict, Generator, List, Optional
|
|
6
|
+
import uuid
|
|
7
|
+
|
|
8
|
+
import filelock
|
|
9
|
+
|
|
10
|
+
from sky import global_user_state
|
|
11
|
+
from sky import models
|
|
12
|
+
from sky import provision
|
|
13
|
+
from sky import sky_logging
|
|
14
|
+
from sky.schemas.api import responses
|
|
15
|
+
from sky.utils import common_utils
|
|
16
|
+
from sky.utils import registry
|
|
17
|
+
from sky.utils import rich_utils
|
|
18
|
+
from sky.utils import status_lib
|
|
19
|
+
from sky.utils import ux_utils
|
|
20
|
+
|
|
21
|
+
logger = sky_logging.init_logger(__name__)
|
|
22
|
+
|
|
23
|
+
# File locks for volume management.
|
|
24
|
+
VOLUME_LOCK_PATH = os.path.expanduser('~/.sky/.{volume_name}.lock')
|
|
25
|
+
VOLUME_LOCK_TIMEOUT_SECONDS = 20
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def volume_refresh():
    """Reconciles the stored status of every non-ephemeral volume.

    For each volume returned by ``volume_list(is_ephemeral=False)``, the
    stored record is re-read under the per-volume lock. The stored status is
    set to IN_USE when the listing reported pods using the volume, and to
    READY otherwise. Volumes whose stored status already matches are left
    untouched; volumes whose record is no longer found are skipped with a
    warning.
    """
    for listed in volume_list(is_ephemeral=False):
        volume_name = listed.name
        usedby_pods = listed.usedby_pods
        with _volume_lock(volume_name):
            # Re-read inside the lock rather than trusting the listing.
            stored = global_user_state.get_volume_by_name(volume_name)
            if stored is None:
                logger.warning(f'Volume {volume_name} not found.')
                continue
            in_use = bool(usedby_pods)
            desired = (status_lib.VolumeStatus.IN_USE
                       if in_use else status_lib.VolumeStatus.READY)
            if stored.get('status') != desired:
                if in_use:
                    logger.info(f'Update volume {volume_name} '
                                f'status to IN_USE, usedby: {usedby_pods}')
                else:
                    logger.info(f'Update volume {volume_name} '
                                f'status to READY')
                global_user_state.update_volume_status(volume_name,
                                                       status=desired)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def volume_list(
        is_ephemeral: Optional[bool] = None) -> List[responses.VolumeRecord]:
    """Lists volumes together with their current usage.

    Args:
        is_ephemeral: Forwarded unchanged to
            ``global_user_state.get_volumes`` as its ``is_ephemeral``
            argument.

    Returns:
        One ``responses.VolumeRecord`` per stored volume that has a handle,
        built from these fields:
            name, launched_at, user_hash, user_name, workspace,
            last_attached_at, last_use, usedby_pods, usedby_clusters,
            is_ephemeral, status (the status enum's ``.value``, or '' when
            no status is stored), and, taken from the handle: type, cloud,
            region, zone, size, config, name_on_cloud.
        Volumes without a handle are logged with a warning and omitted.
    """
    with rich_utils.safe_status(ux_utils.spinner_message('Listing volumes')):
        volumes = global_user_state.get_volumes(is_ephemeral=is_ephemeral)

        # Bucket handles by cloud so usage is fetched once per cloud.
        cloud_to_configs: Dict[str, List[models.VolumeConfig]] = {}
        for entry in volumes:
            handle = entry.get('handle')
            if handle is None:
                nameless = entry.get('name')
                logger.warning(f'Volume {nameless} has no handle.')
                continue
            cloud_to_configs.setdefault(handle.cloud, []).append(handle)

        cloud_to_used_by_pods, cloud_to_used_by_clusters = {}, {}
        for cloud_key, handles in cloud_to_configs.items():
            pods_usage, clusters_usage = provision.get_all_volumes_usedby(
                cloud_key, handles)
            cloud_to_used_by_pods[cloud_key] = pods_usage
            cloud_to_used_by_clusters[cloud_key] = clusters_usage

        # user hash -> display name; unknown hashes fall back to ''.
        user_map = {u.id: u.name for u in global_user_state.get_all_users()}

        records = []
        for entry in volumes:
            volume_name = entry.get('name')
            fields = {
                'name': volume_name,
                'launched_at': entry.get('launched_at'),
                'user_hash': entry.get('user_hash'),
                'user_name': user_map.get(entry.get('user_hash'), ''),
                'workspace': entry.get('workspace'),
                'last_attached_at': entry.get('last_attached_at'),
                'last_use': entry.get('last_use'),
                'usedby_pods': [],
                'usedby_clusters': [],
                'is_ephemeral': entry.get('is_ephemeral', False),
            }
            stored_status = entry.get('status')
            fields['status'] = ('' if stored_status is None else
                                stored_status.value)

            handle = entry.get('handle')
            if handle is None:
                logger.warning(f'Volume {volume_name} has no handle.')
                continue

            cloud_key = handle.cloud
            pods, clusters = provision.map_all_volumes_usedby(
                cloud_key,
                cloud_to_used_by_pods[cloud_key],
                cloud_to_used_by_clusters[cloud_key],
                handle,
            )
            fields.update({
                'type': handle.type,
                'cloud': handle.cloud,
                'region': handle.region,
                'zone': handle.zone,
                'size': handle.size,
                'config': handle.config,
                'name_on_cloud': handle.name_on_cloud,
                'usedby_pods': pods,
                'usedby_clusters': clusters,
            })
            records.append(responses.VolumeRecord(**fields))
        return records
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def volume_delete(names: List[str], ignore_not_found: bool = False) -> None:
    """Delete the named volumes one by one, in the order given.

    Each volume is looked up in the global user state and checked for
    current users through the provisioner. Only then are its cloud resource
    and its state record removed, under the per-volume lock. The first
    failure aborts the loop; nothing is rolled back for volumes that were
    already deleted.

    Args:
        names: Names of the volumes to delete.
        ignore_not_found: When True, a name without a volume record is
            skipped instead of raising.

    Raises:
        ValueError: If a volume does not exist (and ``ignore_not_found`` is
            False), has no handle, or is still used by clusters or pods.
    """
    spinner = ux_utils.spinner_message('Deleting volumes')
    with rich_utils.safe_status(spinner):
        for name in names:
            record = global_user_state.get_volume_by_name(name)
            if record is None:
                if not ignore_not_found:
                    raise ValueError(f'Volume {name} not found.')
                continue

            handle = record.get('handle')
            if handle is None:
                raise ValueError(f'Volume {name} has no handle.')

            cloud = handle.cloud
            pods, clusters = provision.get_volume_usedby(cloud, handle)
            # Cluster usage is reported in preference to pod usage.
            if clusters:
                joined = ', '.join(clusters)
                noun = 'clusters' if len(clusters) > 1 else 'cluster'
                raise ValueError(f'Volume {name} is used by {noun}'
                                 f' {joined}.')
            if pods:
                joined = ', '.join(pods)
                noun = 'pods' if len(pods) > 1 else 'pod'
                raise ValueError(
                    f'Volume {name} is used by {noun} {joined}.')

            logger.debug(f'Deleting volume {name} with config {handle}')
            # Remove the cloud resource first, then the local record.
            with _volume_lock(name):
                provision.delete_volume(cloud, handle)
                global_user_state.delete_volume(name)
    logger.info(f'Deleted volumes: {names}')
|
|
188
|
+
|
|
189
|
+
|
|
190
|
+
def volume_apply(
    name: str,
    volume_type: str,
    cloud: str,
    region: Optional[str],
    zone: Optional[str],
    size: Optional[str],
    config: Dict[str, Any],
    labels: Optional[Dict[str, str]] = None,
    use_existing: Optional[bool] = None,
    is_ephemeral: bool = False,
) -> None:
    """Create a new volume, or register an existing one, under ``name``.

    The region/zone are validated by the cloud, a ``VolumeConfig`` is built,
    and, while holding the per-volume lock, the volume is applied through
    the provisioner and recorded in the global user state with status
    ``READY``. If a record with the same name already exists, the function
    logs that fact and returns without applying anything.

    Args:
        name: The name of the volume.
        volume_type: The type of the volume.
        cloud: The cloud of the volume.
        region: The region of the volume.
        zone: The zone of the volume.
        size: The size of the volume.
        config: The configuration of the volume.
        labels: The labels of the volume.
        use_existing: Whether to use an existing volume. When truthy, the
            name on the cloud is ``name`` itself; otherwise a name is
            generated and suffixed with a short random id.
        is_ephemeral: Whether the volume is ephemeral.
    """
    with rich_utils.safe_status(ux_utils.spinner_message('Creating volume')):
        cloud_obj = registry.CLOUD_REGISTRY.from_str(cloud)
        assert cloud_obj is not None
        region, zone = cloud_obj.validate_region_zone(region, zone)

        if not use_existing:
            # Reuse the method for cluster name on cloud to
            # generate the storage name on cloud.
            suffix = str(uuid.uuid4())[:6]
            base_name = common_utils.make_cluster_name_on_cloud(
                name, max_length=cloud_obj.max_cluster_name_length())
            name_on_cloud = base_name + '-' + suffix
        else:
            name_on_cloud = name

        volume_config = models.VolumeConfig(
            name=name,
            type=volume_type,
            cloud=str(cloud_obj),
            region=region,
            zone=zone,
            size=size,
            config=config,
            name_on_cloud=name_on_cloud,
            labels=labels,
        )
        logger.debug(f'Creating volume {name} on cloud {cloud} '
                     f'with config {volume_config}')

        with _volume_lock(name):
            # Checked under the lock so an existing record is never
            # overwritten.
            if global_user_state.get_volume_by_name(name) is not None:
                logger.info(f'Volume {name} already exists.')
                return
            applied_config = provision.apply_volume(cloud, volume_config)
            global_user_state.add_volume(
                name,
                applied_config,
                status_lib.VolumeStatus.READY,
                is_ephemeral,
            )
        logger.info(f'Created volume {name} on cloud {cloud}')
|
|
255
|
+
|
|
256
|
+
|
|
257
|
+
@contextlib.contextmanager
def _volume_lock(volume_name: str) -> Generator[None, None, None]:
    """Hold the file lock of ``volume_name`` for the duration of the block.

    The lock file path is ``VOLUME_LOCK_PATH`` formatted with the volume
    name, and acquisition waits at most ``VOLUME_LOCK_TIMEOUT_SECONDS``.

    Args:
        volume_name: Name of the volume whose lock should be held.

    Raises:
        RuntimeError: If a ``filelock.Timeout`` is raised while acquiring
            the lock. Because the ``yield`` sits inside the ``try``, a
            ``filelock.Timeout`` escaping from the managed block is
            converted as well.
    """
    lock_path = VOLUME_LOCK_PATH.format(volume_name=volume_name)
    try:
        with filelock.FileLock(lock_path, VOLUME_LOCK_TIMEOUT_SECONDS):
            yield
    except filelock.Timeout as e:
        # The message used to say "update user", which was misleading for a
        # volume lock; name the volume that could not be locked instead.
        raise RuntimeError(
            f'Failed to update volume {volume_name} due to a timeout '
            f'when trying to acquire the lock at '
            f'{lock_path}. '
            'Please try again or manually remove the lock '
            f'file if you believe it is stale.') from e
|
|
@@ -0,0 +1,124 @@
|
|
|
1
|
+
"""REST API for storage management."""
|
|
2
|
+
|
|
3
|
+
import fastapi
|
|
4
|
+
|
|
5
|
+
from sky import clouds
|
|
6
|
+
from sky import exceptions
|
|
7
|
+
from sky import sky_logging
|
|
8
|
+
from sky.server.requests import executor
|
|
9
|
+
from sky.server.requests import payloads
|
|
10
|
+
from sky.server.requests import request_names
|
|
11
|
+
from sky.server.requests import requests as requests_lib
|
|
12
|
+
from sky.utils import registry
|
|
13
|
+
from sky.utils import volume as volume_utils
|
|
14
|
+
from sky.volumes.server import core
|
|
15
|
+
|
|
16
|
+
logger = sky_logging.init_logger(__name__)
|
|
17
|
+
|
|
18
|
+
router = fastapi.APIRouter()
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
@router.get('')
async def volume_list(request: fastapi.Request) -> None:
    """Schedule a short request that runs ``core.volume_list``.

    When the incoming request carries an authenticated user
    (``request.state.auth_user``), that user's environment variables are
    forwarded in the request body; otherwise an empty body is used.
    """
    body_kwargs = {}
    user = request.state.auth_user
    if user:
        body_kwargs['env_vars'] = user.to_env_vars()
    body = payloads.RequestBody(**body_kwargs)

    await executor.schedule_request_async(
        request_id=request.state.request_id,
        request_name=request_names.RequestName.VOLUME_LIST,
        request_body=body,
        func=core.volume_list,
        schedule_type=requests_lib.ScheduleType.SHORT,
    )
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@router.post('/delete')
async def volume_delete(request: fastapi.Request,
                        volume_delete_body: payloads.VolumeDeleteBody) -> None:
    """Schedule a long request that runs ``core.volume_delete``.

    The posted body is handed to the executor unchanged as the request body.
    """
    request_id = request.state.request_id
    request_name = request_names.RequestName.VOLUME_DELETE
    schedule_type = requests_lib.ScheduleType.LONG
    await executor.schedule_request_async(
        request_id=request_id,
        request_name=request_name,
        request_body=volume_delete_body,
        func=core.volume_delete,
        schedule_type=schedule_type,
    )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
@router.post('/validate')
async def volume_validate(
        _: fastapi.Request,
        volume_validate_body: payloads.VolumeValidateBody) -> None:
    """Validate a volume specification without creating anything.

    The body is turned into a YAML-style config dict, a ``Volume`` is built
    from it and its ``validate()`` is run. Validation happens inline in the
    handler; no request is scheduled on the executor.

    Raises:
        fastapi.HTTPException: With status 400 and the serialized original
            exception as detail when building or validating the volume
            fails for any reason.
    """
    # Deferred import kept from the original code; the reason is not visible
    # here (presumably import cost or a circular import - TODO confirm).
    # pylint: disable=import-outside-toplevel
    from sky.volumes import volume as volume_lib

    try:
        volume_config = {
            'name': volume_validate_body.name,
            'type': volume_validate_body.volume_type,
            'infra': volume_validate_body.infra,
            'size': volume_validate_body.size,
            'labels': volume_validate_body.labels,
            'config': volume_validate_body.config,
            'use_existing': volume_validate_body.use_existing,
        }
        volume = volume_lib.Volume.from_yaml_config(volume_config)
        volume.validate()
    except Exception as e:  # pylint: disable=broad-except
        requests_lib.set_exception_stacktrace(e)
        # Chain explicitly so the HTTP error records the validation failure
        # as its direct cause.
        raise fastapi.HTTPException(
            status_code=400,
            detail=exceptions.serialize_exception(e)) from e
|
|
75
|
+
|
|
76
|
+
|
|
77
|
+
@router.post('/apply')
async def volume_apply(request: fastapi.Request,
                       volume_apply_body: payloads.VolumeApplyBody) -> None:
    """Check an apply request and schedule ``core.volume_apply`` for it.

    Checks performed before scheduling:
      * the volume type is one of ``volume_utils.VolumeType``;
      * the cloud name resolves in the cloud registry;
      * PVC volumes target Kubernetes and carry a supported access mode
        (defaulting to ReadWriteOnce when none is given);
      * RunPod network volumes target RunPod.

    ``use_existing`` and the default access mode are written into the
    body's config dict so that the scheduled request sees them.

    Raises:
        fastapi.HTTPException: With status 400 when any check fails.
    """
    volume_cloud = volume_apply_body.cloud
    volume_type = volume_apply_body.volume_type
    volume_config = volume_apply_body.config
    if volume_config is None:
        # Attach the new dict to the body. Otherwise the defaults filled in
        # below would land in a throwaway dict and the scheduled request
        # would still carry ``config=None``.
        volume_config = {}
        volume_apply_body.config = volume_config
    volume_config['use_existing'] = volume_apply_body.use_existing

    supported_volume_types = [vt.value for vt in volume_utils.VolumeType]
    if volume_type not in supported_volume_types:
        raise fastapi.HTTPException(
            status_code=400, detail=f'Invalid volume type: {volume_type}')
    cloud = registry.CLOUD_REGISTRY.from_str(volume_cloud)
    if cloud is None:
        raise fastapi.HTTPException(status_code=400,
                                    detail=f'Invalid cloud: {volume_cloud}')
    if volume_type == volume_utils.VolumeType.PVC.value:
        if not cloud.is_same_cloud(clouds.Kubernetes()):
            raise fastapi.HTTPException(
                status_code=400,
                detail='PVC storage is only supported on Kubernetes')
        supported_access_modes = [
            mode.value for mode in volume_utils.VolumeAccessMode
        ]
        access_mode = volume_config.get('access_mode')
        if access_mode is None:
            volume_config['access_mode'] = (
                volume_utils.VolumeAccessMode.READ_WRITE_ONCE.value)
        elif access_mode not in supported_access_modes:
            raise fastapi.HTTPException(
                status_code=400, detail=f'Invalid access mode: {access_mode}')
    elif volume_type == volume_utils.VolumeType.RUNPOD_NETWORK_VOLUME.value:
        if not cloud.is_same_cloud(clouds.RunPod()):
            raise fastapi.HTTPException(
                status_code=400,
                detail='Runpod network volume is only supported on Runpod')
    await executor.schedule_request_async(
        request_id=request.state.request_id,
        request_name=request_names.RequestName.VOLUME_APPLY,
        request_body=volume_apply_body,
        func=core.volume_apply,
        schedule_type=requests_lib.ScheduleType.LONG,
    )
|
sky/volumes/volume.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
1
|
+
"""Volume types and access modes."""
|
|
2
|
+
from typing import Any, Dict, Optional
|
|
3
|
+
|
|
4
|
+
from sky import clouds
|
|
5
|
+
from sky.utils import common_utils
|
|
6
|
+
from sky.utils import infra_utils
|
|
7
|
+
from sky.utils import registry
|
|
8
|
+
from sky.utils import resources_utils
|
|
9
|
+
from sky.utils import schemas
|
|
10
|
+
from sky.utils import volume as volume_lib
|
|
11
|
+
|
|
12
|
+
# Cloud object associated with each volume type. Volume._normalize_config
# uses it to default a volume's cloud, or to check the cloud given in `infra`.
VOLUME_TYPE_TO_CLOUD = {
    volume_lib.VolumeType.PVC: clouds.Kubernetes(),
    volume_lib.VolumeType.RUNPOD_NETWORK_VOLUME: clouds.RunPod(),
}
# Volume types listed per cloud (the reverse direction of the mapping above).
# NOTE(review): keys are cloud instances, so lookups presumably rely on the
# cloud classes' hashing/equality - confirm before looking up with a freshly
# constructed cloud object.
CLOUD_TO_VOLUME_TYPE = {
    clouds.Kubernetes(): [volume_lib.VolumeType.PVC],
    clouds.RunPod(): [volume_lib.VolumeType.RUNPOD_NETWORK_VOLUME],
}
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Volume:
    """Volume specification.

    Holds the user-facing fields of a volume (name, type, infra, size,
    labels, use_existing, config) together with the cloud, region and zone
    resolved from ``infra``. Constructing an instance validates the fields
    against the volume schema and normalizes them (see
    ``_normalize_config``).
    """

    def __init__(
            self,
            name: Optional[str] = None,
            type: Optional[str] = None,  # pylint: disable=redefined-builtin
            infra: Optional[str] = None,
            size: Optional[str] = None,
            labels: Optional[Dict[str, str]] = None,
            use_existing: Optional[bool] = None,
            config: Optional[Dict[str, Any]] = None):
        """Initialize a Volume instance.

        Args:
            name: Volume name
            type: Volume type (e.g., 'k8s-pvc')
            infra: Infrastructure specification
            size: Volume size
            labels: Volume labels
            use_existing: Whether to use an existing volume
            config: Additional configuration

        Raises:
            ValueError: If the size cannot be parsed or is zero, if the
                cloud given in ``infra`` does not match the volume type, or
                if the volume type is not mapped to any cloud. Schema
                validation errors are raised by the schema helper.
        """
        self.name = name
        self.type = type
        self.infra = infra
        self.size = size
        self.labels = labels or {}
        self.use_existing = use_existing
        self.config = config or {}

        # Filled in by _normalize_config() from `infra` / the volume type.
        self.cloud: Optional[str] = None
        self.region: Optional[str] = None
        self.zone: Optional[str] = None

        self._normalize_config()

    @classmethod
    def from_yaml_config(cls, config: Dict[str, Any]) -> 'Volume':
        """Create a Volume subclass instance from a dictionary via factory.

        Args:
            config: Mapping with the keys 'name', 'type', 'infra', 'size',
                'labels', 'use_existing' and 'config'; missing keys are
                treated as None (or ``{}`` for 'config').

        Returns:
            A ``PVCVolume`` or ``RunpodNetworkVolume`` depending on 'type'.

        Raises:
            ValueError: If 'type' is missing, is not a valid volume type, or
                has no Volume subclass.
        """
        vol_type_val = config.get('type')
        try:
            vt = (volume_lib.VolumeType(vol_type_val)
                  if vol_type_val is not None else None)
        except Exception:  # pylint: disable=broad-except
            vt = None

        # One table instead of one copy of the constructor call per type.
        # The subclasses are resolved at call time, after the module has
        # been fully loaded.
        subclass_by_type = {
            volume_lib.VolumeType.PVC: PVCVolume,
            volume_lib.VolumeType.RUNPOD_NETWORK_VOLUME: RunpodNetworkVolume,
        }
        volume_cls = subclass_by_type.get(vt) if vt is not None else None
        if volume_cls is None:
            raise ValueError(f'Invalid volume type: {vol_type_val}')

        return volume_cls(name=config.get('name'),
                          type=vol_type_val,
                          infra=config.get('infra'),
                          size=config.get('size'),
                          labels=config.get('labels'),
                          use_existing=config.get('use_existing'),
                          config=config.get('config', {}))

    def to_yaml_config(self) -> Dict[str, Any]:
        """Convert the Volume to a dictionary."""
        return {
            'name': self.name,
            'type': self.type,
            'infra': self.infra,
            'size': self.size,
            'labels': self.labels,
            'use_existing': self.use_existing,
            'config': self.config,
            'cloud': self.cloud,
            'region': self.region,
            'zone': self.zone,
        }

    def _normalize_config(self) -> None:
        """Normalize and validate the config.

        Raises:
            ValueError: If the size is invalid, the volume type is not
                mapped to any cloud, or the cloud from ``infra`` differs
                from the cloud of the volume type.
        """
        # Validate schema
        common_utils.validate_schema(self.to_yaml_config(),
                                     schemas.get_volume_schema(),
                                     'Invalid volumes config: ')

        # Adjust the volume config (e.g., parse size)
        self._adjust_config()

        # Resolve the infrastructure options to cloud, region, zone
        infra_info = infra_utils.InfraInfo.from_str(self.infra)
        self.cloud = infra_info.cloud
        self.region = infra_info.region
        self.zone = infra_info.zone

        # Set cloud from volume type if not specified
        cloud_obj_from_type = VOLUME_TYPE_TO_CLOUD.get(
            volume_lib.VolumeType(self.type))
        if cloud_obj_from_type is None:
            # Without this guard a type missing from VOLUME_TYPE_TO_CLOUD
            # would leave self.cloud set to the literal string 'None'.
            raise ValueError(
                f'Volume type {self.type} is not mapped to any cloud')
        if self.cloud:
            cloud_obj = registry.CLOUD_REGISTRY.from_str(self.cloud)
            assert cloud_obj is not None
            if not cloud_obj.is_same_cloud(cloud_obj_from_type):
                raise ValueError(
                    f'Invalid cloud {self.cloud} for volume type {self.type}')
        else:
            self.cloud = str(cloud_obj_from_type)

    def _adjust_config(self) -> None:
        """Adjust the volume config (e.g., parse size).

        Raises:
            ValueError: If the size cannot be parsed or parses to '0'.
        """
        if self.size is None:
            return
        try:
            size = resources_utils.parse_memory_resource(self.size,
                                                         'size',
                                                         allow_rounding=True)
            if size == '0':
                raise ValueError('Size must be no less than 1Gi')
            self.size = size
        except ValueError as e:
            # Also wraps the "no less than 1Gi" error raised just above, so
            # the message always starts with the offending size.
            raise ValueError(f'Invalid size {self.size}: {e}') from e

    def validate(self, skip_cloud_compatibility: bool = False) -> None:
        """Validates the volume.

        Args:
            skip_cloud_compatibility: When True, the name/label checks
                against the cloud are not run.
        """
        self.validate_name()
        self.validate_size()
        if not skip_cloud_compatibility:
            self.validate_cloud_compatibility()
        # Extra, type-specific validations
        self._validate_config_extra()

    def validate_name(self) -> None:
        """Validates if the volume name is set."""
        # NOTE(review): this is an assert, so the check disappears under
        # `python -O`; kept as-is so the raised exception type is unchanged.
        assert self.name is not None, 'Volume name must be set'

    def validate_size(self) -> None:
        """Validates that size is specified for new volumes."""
        if not self.use_existing and not self.size:
            raise ValueError('Size is required for new volumes. '
                             'Please specify the size in the YAML file or '
                             'use the --size flag.')

    def validate_cloud_compatibility(self) -> None:
        """Validates the volume name and labels with the cloud.

        Raises:
            ValueError: If the cloud rejects the volume name or any label.
        """
        cloud_obj = registry.CLOUD_REGISTRY.from_str(self.cloud)
        assert cloud_obj is not None

        valid, err_msg = cloud_obj.is_volume_name_valid(self.name)
        if not valid:
            raise ValueError(f'Invalid volume name: {err_msg}')

        if self.labels:
            for key, value in self.labels.items():
                valid, err_msg = cloud_obj.is_label_valid(key, value)
                if not valid:
                    raise ValueError(f'{err_msg}')

    # Hook methods for subclasses
    def _validate_config_extra(self) -> None:
        """Additional type-specific validation.

        Subclasses can override to enforce stricter rules.
        """
        return
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
class PVCVolume(Volume):
    """Volume backed by a Kubernetes PVC.

    Inherits everything from ``Volume`` and adds no validation of its own.
    """
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
class RunpodNetworkVolume(Volume):
    """Network volume hosted on RunPod."""

    def _validate_config_extra(self) -> None:
        """Apply the RunPod-specific checks.

        For a new volume with a size, the size must convert to an integer
        of at least ``MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB``. A zone (the
        RunPod data center id) is always required.

        Raises:
            ValueError: If the size is not an integer or is too small, or
                if no zone is set.
        """
        is_new_with_size = (self.size is not None and not self.use_existing)
        if is_new_with_size:
            try:
                if (int(self.size) <
                        volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB):
                    raise ValueError(
                        f'RunPod network volume size must be at least '
                        f'{volume_lib.MIN_RUNPOD_NETWORK_VOLUME_SIZE_GB}GB.')
            except Exception as err:  # pylint: disable=broad-except
                # Both a failed int() conversion and the minimum-size error
                # above are reported with the offending size in front.
                raise ValueError(f'Invalid volume size {self.size!r}: '
                                 f'{err}') from err

        if self.zone:
            return
        raise ValueError('RunPod DataCenterId is required for network '
                         'volumes. Set the zone in the infra field.')
|
|
File without changes
|