skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,282 @@
|
|
|
1
|
+
"""Kubernetes pvc provisioning."""
|
|
2
|
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
3
|
+
|
|
4
|
+
from sky import global_user_state
|
|
5
|
+
from sky import models
|
|
6
|
+
from sky import sky_logging
|
|
7
|
+
from sky.adaptors import kubernetes
|
|
8
|
+
from sky.provision import constants
|
|
9
|
+
from sky.provision.kubernetes import config as config_lib
|
|
10
|
+
from sky.provision.kubernetes import constants as k8s_constants
|
|
11
|
+
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
12
|
+
from sky.utils import volume as volume_lib
|
|
13
|
+
|
|
14
|
+
logger = sky_logging.init_logger(__name__)
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def _get_context_namespace(config: models.VolumeConfig) -> Tuple[str, str]:
    """Resolves the Kubernetes context and namespace for a volume.

    Values missing from ``config`` are looked up through
    ``kubernetes_utils`` and written back onto ``config`` (``config.region``
    and ``config.config['namespace']``), so later calls see them.

    Args:
        config: Volume configuration; may be mutated as described above.

    Returns:
        A ``(context, namespace)`` tuple.
    """
    context = config.region
    if context is None:
        # No region recorded: fall back to the current kubeconfig context
        # and remember it on the config.
        context = kubernetes_utils.get_current_kube_config_context_name()
        config.region = context

    namespace = config.config.get('namespace')
    if namespace is not None:
        return context, namespace

    # No namespace recorded: use the one configured for the context and
    # remember it on the config.
    namespace = kubernetes_utils.get_kube_config_context_namespace(context)
    config.config['namespace'] = namespace
    return context, namespace
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
                            pod_spec: Dict[str, Any]) -> None:
    """Rejects a pod spec that mounts a single-writer PVC already in use.

    For every volume in ``pod_spec['spec']['volumes']`` that references a
    PVC, the PVC is read from the cluster. If its first access mode is
    ReadWriteOnce or ReadWriteOncePod and some pod already uses it, an error
    is raised.

    Args:
        context: Kubernetes context.
        namespace: Kubernetes namespace holding the PVCs.
        pod_spec: Pod manifest as a dict.

    Raises:
        config_lib.KubernetesError: If such a PVC is already used by pods.
    """
    pod_volumes = pod_spec.get('spec', {}).get('volumes', [])
    if not pod_volumes:
        return

    exclusive_modes = [
        volume_lib.VolumeAccessMode.READ_WRITE_ONCE.value,
        volume_lib.VolumeAccessMode.READ_WRITE_ONCE_POD.value,
    ]
    for pod_volume in pod_volumes:
        claim = pod_volume.get('persistentVolumeClaim', {})
        pvc_name = claim.get('claimName')
        if pvc_name:
            core_api = kubernetes.core_api(context)
            pvc = core_api.read_namespaced_persistent_volume_claim(
                name=pvc_name, namespace=namespace)
            # Only the first access mode of the PVC is inspected.
            access_mode = pvc.spec.access_modes[0]
            if access_mode in exclusive_modes:
                usedby_pods, _ = _get_volume_usedby(context, namespace,
                                                    pvc_name)
                if usedby_pods:
                    raise config_lib.KubernetesError(
                        f'Volume {pvc_name} with access '
                        f'mode {access_mode} is already '
                        f'in use by Pods {usedby_pods}.')
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
    """Creates or registers a volume.

    Builds the PVC spec for ``config``, verifies that the requested storage
    class (if the spec names one) can be read from the cluster, and then
    passes the spec to ``create_persistent_volume_claim``.

    Args:
        config: Volume configuration. Resolving the context and namespace
            may fill in ``config.region`` and ``config.config['namespace']``.

    Returns:
        The same ``config`` object that was passed in.

    Raises:
        config_lib.KubernetesError: If reading the storage class fails.
    """
    context, namespace = _get_context_namespace(config)
    pvc_spec = _get_pvc_spec(namespace, config)
    # Check if the storage class exists
    storage_class_name = pvc_spec['spec'].get('storageClassName')
    if storage_class_name is not None:
        try:
            kubernetes.storage_api(context).read_storage_class(
                name=storage_class_name)
        except kubernetes.api_exception() as e:
            # Chain explicitly so the traceback shows the API error as the
            # cause rather than as an error raised while handling another.
            raise config_lib.KubernetesError(
                f'Check storage class {storage_class_name} error: {e}') from e
    create_persistent_volume_claim(namespace, context, pvc_spec)
    return config
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
    """Deletes the PVC that backs a volume.

    The deletion goes through
    ``kubernetes_utils.delete_k8s_resource_with_retry``.

    Args:
        config: Volume configuration; ``config.name_on_cloud`` is the PVC
            name.

    Returns:
        The same ``config`` object that was passed in.
    """
    context, namespace = _get_context_namespace(config)
    pvc_name = config.name_on_cloud

    def _delete_pvc(pvc_name=pvc_name):
        # Issues the actual delete call; the PVC name is bound as a default
        # argument so the callable can be invoked without arguments.
        core_api = kubernetes.core_api(context)
        return core_api.delete_namespaced_persistent_volume_claim(
            name=pvc_name,
            namespace=namespace,
            _request_timeout=config_lib.DELETION_TIMEOUT)

    kubernetes_utils.delete_k8s_resource_with_retry(delete_func=_delete_pvc,
                                                    resource_type='pvc',
                                                    resource_name=pvc_name)
    logger.info(f'Deleted PVC {pvc_name} in namespace {namespace}')
    return config
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _get_volume_usedby(
    context: Optional[str],
    namespace: str,
    pvc_name: str,
) -> Tuple[List[str], List[str]]:
    """Gets the usedby resources of a volume.

    This function returns the pods and clusters that are using the volume.
    The usedby_pods is accurate, which also includes the Pods that are not
    managed by SkyPilot.

    Pods whose phase is listed in ``k8s_constants.PVC_NOT_HOLD_POD_PHASES``
    are excluded by the field selector.

    Args:
        context: Kubernetes context
        namespace: Kubernetes namespace
        pvc_name: PVC name

    Returns:
        usedby_pods: List of pods using the volume. These may include pods
          not created by SkyPilot.
        usedby_clusters: List of clusters using the volume. A cluster is
          appended once per matching pod volume, so it may appear more
          than once.
    """
    usedby_pods = []
    usedby_clusters = []
    field_selector = ','.join([
        f'status.phase!={phase}'
        for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES
    ])
    cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
    # Get all pods in the namespace
    pods = kubernetes.core_api(context).list_namespaced_pod(
        namespace=namespace, field_selector=field_selector)
    for pod in pods.items:
        if pod.spec.volumes is None:
            continue
        for volume in pod.spec.volumes:
            if volume.persistent_volume_claim is None:
                continue
            if volume.persistent_volume_claim.claim_name != pvc_name:
                continue
            usedby_pods.append(pod.metadata.name)
            # Pods without any labels report ``labels`` as None; such pods
            # still count as users of the volume but map to no cluster.
            labels = pod.metadata.labels or {}
            # Get the real cluster name
            cluster_name_on_cloud = labels.get(
                constants.TAG_SKYPILOT_CLUSTER_NAME)
            if cluster_name_on_cloud is None:
                continue
            cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
            if cluster_name is not None:
                usedby_clusters.append(cluster_name)
    if usedby_pods:
        logger.debug(f'Volume {pvc_name} is used by Pods {usedby_pods}'
                     f' and clusters {usedby_clusters}')
    return usedby_pods, usedby_clusters
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def _get_cluster_name_on_cloud_to_cluster_name_map() -> Dict[str, str]:
    """Builds a lookup from a cluster's name on the cloud to its name.

    Cluster records come from ``global_user_state.get_clusters()``; records
    whose ``handle`` is None are skipped.
    """
    return {
        record['handle'].cluster_name_on_cloud: record['name']
        for record in global_user_state.get_clusters()
        if record['handle'] is not None
    }
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
def get_volume_usedby(
    config: models.VolumeConfig,) -> Tuple[List[str], List[str]]:
    """Gets the resources that are using a single volume.

    Args:
        config: Config of the volume to look up; its name_on_cloud is used
            as the PVC name.

    Returns:
        The (usedby_pods, usedby_clusters) tuple produced by
        _get_volume_usedby() for the volume's context and namespace.
    """
    k8s_context, k8s_namespace = _get_context_namespace(config)
    return _get_volume_usedby(k8s_context, k8s_namespace,
                              config.name_on_cloud)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def get_all_volumes_usedby(
    configs: List[models.VolumeConfig],
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Gets the pods and clusters that use each of the given volumes.

    Pods are listed once per (context, namespace) pair found in `configs`,
    instead of once per volume.

    Args:
        configs: Configs of the volumes to look up.

    Returns:
        A tuple (used_by_pods, used_by_clusters). Both are nested dicts keyed
        context -> namespace -> PVC name. The innermost values are the names
        of the pods, respectively the clusters, that mount that PVC. This is
        the layout map_all_volumes_usedby() reads.
    """
    # Skip pods in phases listed in PVC_NOT_HOLD_POD_PHASES.
    field_selector = ','.join(
        f'status.phase!={phase}'
        for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES)
    label_selector = 'parent=skypilot'

    context_to_namespaces: Dict[str, Set[str]] = {}
    pvc_names = set()
    for config in configs:
        context, namespace = _get_context_namespace(config)
        context_to_namespaces.setdefault(context, set()).add(namespace)
        pvc_names.add(config.name_on_cloud)

    cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()

    used_by_pods: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
    used_by_clusters: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
    for context, namespaces in context_to_namespaces.items():
        used_by_pods[context] = {}
        used_by_clusters[context] = {}
        for namespace in namespaces:
            pods_by_volume: Dict[str, List[str]] = {}
            clusters_by_volume: Dict[str, List[str]] = {}
            used_by_pods[context][namespace] = pods_by_volume
            used_by_clusters[context][namespace] = clusters_by_volume
            # Get all SkyPilot pods in the namespace.
            pods = kubernetes.core_api(context).list_namespaced_pod(
                namespace=namespace,
                field_selector=field_selector,
                label_selector=label_selector)
            for pod in pods.items:
                if pod.spec.volumes is None:
                    continue
                for volume in pod.spec.volumes:
                    if volume.persistent_volume_claim is None:
                        continue
                    volume_name = volume.persistent_volume_claim.claim_name
                    if volume_name not in pvc_names:
                        continue
                    pods_by_volume.setdefault(volume_name,
                                              []).append(pod.metadata.name)
                    # Resolve the real cluster name from the pod's label.
                    cluster_name_on_cloud = pod.metadata.labels.get(
                        constants.TAG_SKYPILOT_CLUSTER_NAME)
                    if cluster_name_on_cloud is None:
                        continue
                    cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
                    if cluster_name is None:
                        continue
                    # Key by the PVC name (not by the cluster name) so that
                    # the lookup by PVC name in map_all_volumes_usedby()
                    # actually finds the clusters using the volume.
                    clusters_by_volume.setdefault(volume_name,
                                                  []).append(cluster_name)
    return used_by_pods, used_by_clusters
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
def map_all_volumes_usedby(
        used_by_pods: Dict[str, Any], used_by_clusters: Dict[str, Any],
        config: models.VolumeConfig) -> Tuple[List[str], List[str]]:
    """Picks one volume's usedby entries out of the bulk usedby mappings.

    Args:
        used_by_pods: Nested dict that is indexed here by context, then
            namespace, then PVC name.
        used_by_clusters: Nested dict indexed the same way.
        config: Config of the volume to look up; its name_on_cloud is the
            PVC name.

    Returns:
        A tuple of the entries found in used_by_pods and used_by_clusters.
        Each falls back to an empty list when any level of the lookup is
        missing.
    """
    context, namespace = _get_context_namespace(config)
    pvc_name = config.name_on_cloud

    def _entries_for_volume(usage: Dict[str, Any]) -> List[str]:
        # Walk context -> namespace -> PVC name, tolerating missing levels.
        per_context = usage.get(context, {})
        per_namespace = per_context.get(namespace, {})
        return per_namespace.get(pvc_name, [])

    return (_entries_for_volume(used_by_pods),
            _entries_for_volume(used_by_clusters))
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def create_persistent_volume_claim(namespace: str, context: Optional[str],
                                   pvc_spec: Dict[str, Any]) -> None:
    """Creates the PVC described by pvc_spec unless it already exists.

    Args:
        namespace: Namespace to look up and create the PVC in.
        context: Context passed to kubernetes.core_api().
        pvc_spec: PVC body; its ['metadata']['name'] is the PVC name.

    Raises:
        The exception type returned by kubernetes.api_exception(): re-raised
        when the existence check fails with a status other than 404.
    """
    pvc_name = pvc_spec['metadata']['name']
    try:
        kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
            name=pvc_name, namespace=namespace)
    except kubernetes.api_exception() as e:
        # Only "not found" means the PVC still has to be created.
        if e.status != 404:
            raise
    else:
        # The read succeeded, so there is nothing to create.
        logger.debug(f'PVC {pvc_name} already exists')
        return
    kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
        namespace=namespace, body=pvc_spec)
    logger.info(f'Created PVC {pvc_name} in namespace {namespace}')
|
|
249
|
+
|
|
250
|
+
|
|
251
|
+
def _get_pvc_spec(namespace: str,
                  config: models.VolumeConfig) -> Dict[str, Any]:
    """Builds the PVC spec dict for the given volume config.

    Args:
        namespace: Namespace written into the PVC metadata.
        config: Volume config. Reads config.config ('access_mode' and the
            optional 'storage_class_name'), size, name, name_on_cloud and
            labels.

    Returns:
        A dict with 'metadata' (name, namespace, labels) and 'spec'
        (accessModes, resources, and storageClassName when a storage class
        is configured).
    """
    volume_options = config.config
    access_mode = volume_options.get('access_mode')
    size = config.size
    # The previous code assumes that the access_mode and size are always set.
    assert access_mode is not None
    assert size is not None

    # Labels from the config are applied last, so they win on key clashes.
    labels = {
        'parent': 'skypilot',
        'skypilot-name': config.name,
    }
    if config.labels:
        labels.update(config.labels)

    claim_spec: Dict[str, Any] = {
        'accessModes': [access_mode],
        'resources': {
            'requests': {
                'storage': f'{size}Gi'
            }
        },
    }
    storage_class = volume_options.get('storage_class_name')
    if storage_class is not None:
        claim_spec['storageClassName'] = storage_class

    return {
        'metadata': {
            'name': config.name_on_cloud,
            'namespace': namespace,
            'labels': labels,
        },
        'spec': claim_spec,
    }
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Lambda Cloud instance provisioning."""
|
|
2
2
|
|
|
3
3
|
import time
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
|
|
6
6
|
from sky import sky_logging
|
|
7
7
|
from sky.provision import common
|
|
@@ -68,9 +68,10 @@ def _get_private_ip(instance_info: Dict[str, Any], single_node: bool) -> str:
|
|
|
68
68
|
return private_ip
|
|
69
69
|
|
|
70
70
|
|
|
71
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
71
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
72
72
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
73
73
|
"""Runs instances for the given cluster"""
|
|
74
|
+
del cluster_name # unused
|
|
74
75
|
lambda_client = _get_lambda_client()
|
|
75
76
|
pending_status = ['booting']
|
|
76
77
|
while True:
|
|
@@ -106,34 +107,35 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
106
107
|
created_instance_ids = []
|
|
107
108
|
remote_ssh_key_name = config.authentication_config['remote_key_name']
|
|
108
109
|
|
|
109
|
-
def
|
|
110
|
+
def launch_node(node_type: str) -> str:
|
|
110
111
|
try:
|
|
111
112
|
instance_ids = lambda_client.create_instances(
|
|
112
113
|
instance_type=config.node_config['InstanceType'],
|
|
113
114
|
region=region,
|
|
114
115
|
name=f'{cluster_name_on_cloud}-{node_type}',
|
|
115
|
-
|
|
116
|
+
# Quantity cannot actually be greater than 1; see:
|
|
117
|
+
# https://github.com/skypilot-org/skypilot/issues/7084
|
|
118
|
+
quantity=1,
|
|
116
119
|
ssh_key_name=remote_ssh_key_name,
|
|
117
120
|
)
|
|
118
|
-
logger.info(f'Launched {
|
|
119
|
-
f'
|
|
120
|
-
return instance_ids
|
|
121
|
+
logger.info(f'Launched {node_type} node, '
|
|
122
|
+
f'instance_id: {instance_ids[0]}')
|
|
123
|
+
return instance_ids[0]
|
|
121
124
|
except Exception as e:
|
|
122
125
|
logger.warning(f'run_instances error: {e}')
|
|
123
126
|
raise
|
|
124
127
|
|
|
125
128
|
if head_instance_id is None:
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
created_instance_ids.append(instance_ids[0])
|
|
129
|
-
head_instance_id = instance_ids[0]
|
|
129
|
+
head_instance_id = launch_node('head')
|
|
130
|
+
created_instance_ids.append(head_instance_id)
|
|
130
131
|
|
|
131
132
|
assert head_instance_id is not None, 'head_instance_id should not be None'
|
|
132
133
|
|
|
133
134
|
worker_node_count = to_start_count - 1
|
|
134
135
|
if worker_node_count > 0:
|
|
135
|
-
|
|
136
|
-
|
|
136
|
+
for _ in range(worker_node_count):
|
|
137
|
+
worker_instance_id = launch_node('worker')
|
|
138
|
+
created_instance_ids.append(worker_instance_id)
|
|
137
139
|
|
|
138
140
|
while True:
|
|
139
141
|
instances = _filter_instances(cluster_name_on_cloud, ['active'])
|
|
@@ -226,11 +228,14 @@ def get_cluster_info(
|
|
|
226
228
|
|
|
227
229
|
|
|
228
230
|
def query_instances(
|
|
231
|
+
cluster_name: str,
|
|
229
232
|
cluster_name_on_cloud: str,
|
|
230
233
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
231
234
|
non_terminated_only: bool = True,
|
|
232
|
-
|
|
235
|
+
retry_if_missing: bool = False,
|
|
236
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
233
237
|
"""See sky/provision/__init__.py"""
|
|
238
|
+
del cluster_name, retry_if_missing # unused
|
|
234
239
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
235
240
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
236
241
|
|
|
@@ -240,12 +245,13 @@ def query_instances(
|
|
|
240
245
|
'unhealthy': status_lib.ClusterStatus.INIT,
|
|
241
246
|
'terminating': None,
|
|
242
247
|
}
|
|
243
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
248
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
249
|
+
Optional[str]]] = {}
|
|
244
250
|
for instance_id, instance in instances.items():
|
|
245
251
|
status = status_map.get(instance['status'])
|
|
246
252
|
if non_terminated_only and status is None:
|
|
247
253
|
continue
|
|
248
|
-
statuses[instance_id] = status
|
|
254
|
+
statuses[instance_id] = (status, None)
|
|
249
255
|
return statuses
|
|
250
256
|
|
|
251
257
|
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
"""Constants used by the Nebius provisioner."""
|
|
2
|
+
|
|
3
|
+
VERSION = 'v1'

# Instance platforms that support InfiniBand.
INFINIBAND_INSTANCE_PLATFORMS = [
    'gpu-h100-sxm',
    'gpu-h200-sxm',
]

# Environment variables that configure NCCL and UCX for InfiniBand.
INFINIBAND_ENV_VARS = {
    'NCCL_IB_HCA': 'mlx5',
    'UCX_NET_DEVICES': ('mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,'
                        'mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1')
}

# pylint: disable=line-too-long
INFINIBAND_IMAGE_ID = 'docker:cr.eu-north1.nebius.cloud/nebius-benchmarks/nccl-tests:2.23.4-ubu22.04-cu12.4'

# Options passed to `docker run` for InfiniBand support.
INFINIBAND_DOCKER_OPTIONS = ['--device=/dev/infiniband', '--cap-add=IPC_LOCK']

# InfiniBand fabrics available per (platform, region) pair.
# Based on Nebius documentation.
INFINIBAND_FABRIC_MAPPING = {
    # H100 platforms
    ('gpu-h100-sxm', 'eu-north1'): [
        'fabric-2', 'fabric-3', 'fabric-4', 'fabric-6'
    ],

    # H200 platforms
    ('gpu-h200-sxm', 'eu-north1'): ['fabric-7'],
    ('gpu-h200-sxm', 'eu-west1'): ['fabric-5'],
    ('gpu-h200-sxm', 'us-central1'): ['us-central1-a'],
}


def get_default_fabric(platform: str, region: str) -> str:
    """Returns the first fabric listed for a platform and region.

    When the (platform, region) pair has no fabrics in
    INFINIBAND_FABRIC_MAPPING, the fabrics of ('gpu-h100-sxm', 'eu-north1')
    are used instead.

    Args:
        platform: Instance platform, e.g. 'gpu-h100-sxm'.
        region: Region name, e.g. 'eu-north1'.

    Returns:
        The first fabric of the selected list.

    Raises:
        ValueError: If neither the requested pair nor the fallback pair has
            any fabric.
    """
    candidates = INFINIBAND_FABRIC_MAPPING.get((platform, region), [])
    if candidates:
        return candidates[0]
    # Select north europe region as default
    candidates = INFINIBAND_FABRIC_MAPPING.get(('gpu-h100-sxm', 'eu-north1'),
                                               [])
    if candidates:
        return candidates[0]
    raise ValueError(
        f'No InfiniBand fabric available for platform {platform} '
        f'in region {region}')
|
sky/provision/nebius/instance.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
"""Nebius instance provisioning."""
|
|
2
2
|
import time
|
|
3
|
-
from typing import Any, Dict, List, Optional
|
|
3
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
4
4
|
|
|
5
5
|
from sky import sky_logging
|
|
6
6
|
from sky.provision import common
|
|
@@ -65,9 +65,10 @@ def _wait_until_no_pending(region: str, cluster_name_on_cloud: str) -> None:
|
|
|
65
65
|
f' to be ready.')
|
|
66
66
|
|
|
67
67
|
|
|
68
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
68
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
69
69
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
70
70
|
"""Runs instances for the given cluster."""
|
|
71
|
+
del cluster_name # unused
|
|
71
72
|
_wait_until_no_pending(region, cluster_name_on_cloud)
|
|
72
73
|
running_instances = _filter_instances(region, cluster_name_on_cloud,
|
|
73
74
|
['RUNNING'])
|
|
@@ -124,6 +125,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
124
125
|
node_type = 'head' if head_instance_id is None else 'worker'
|
|
125
126
|
try:
|
|
126
127
|
platform, preset = config.node_config['InstanceType'].split('_')
|
|
128
|
+
|
|
127
129
|
instance_id = utils.launch(
|
|
128
130
|
cluster_name_on_cloud=cluster_name_on_cloud,
|
|
129
131
|
node_type=node_type,
|
|
@@ -132,7 +134,14 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
132
134
|
region=region,
|
|
133
135
|
image_family=config.node_config['ImageId'],
|
|
134
136
|
disk_size=config.node_config['DiskSize'],
|
|
135
|
-
user_data=config.node_config['UserData']
|
|
137
|
+
user_data=config.node_config['UserData'],
|
|
138
|
+
use_spot=config.node_config['use_spot'],
|
|
139
|
+
associate_public_ip_address=(
|
|
140
|
+
not config.provider_config['use_internal_ips']),
|
|
141
|
+
use_static_ip_address=config.provider_config.get(
|
|
142
|
+
'use_static_ip_address', False),
|
|
143
|
+
filesystems=config.node_config.get('filesystems', []),
|
|
144
|
+
network_tier=config.node_config.get('network_tier'))
|
|
136
145
|
except Exception as e: # pylint: disable=broad-except
|
|
137
146
|
logger.warning(f'run_instances error: {e}')
|
|
138
147
|
raise
|
|
@@ -241,11 +250,14 @@ def get_cluster_info(
|
|
|
241
250
|
|
|
242
251
|
|
|
243
252
|
def query_instances(
|
|
253
|
+
cluster_name: str,
|
|
244
254
|
cluster_name_on_cloud: str,
|
|
245
255
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
246
256
|
non_terminated_only: bool = True,
|
|
247
|
-
|
|
257
|
+
retry_if_missing: bool = False,
|
|
258
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
248
259
|
"""See sky/provision/__init__.py"""
|
|
260
|
+
del cluster_name, retry_if_missing # unused
|
|
249
261
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
250
262
|
instances = _filter_instances(provider_config['region'],
|
|
251
263
|
cluster_name_on_cloud, None)
|
|
@@ -257,12 +269,13 @@ def query_instances(
|
|
|
257
269
|
'STOPPING': status_lib.ClusterStatus.STOPPED,
|
|
258
270
|
'DELETING': status_lib.ClusterStatus.STOPPED,
|
|
259
271
|
}
|
|
260
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
272
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
273
|
+
Optional[str]]] = {}
|
|
261
274
|
for inst_id, inst in instances.items():
|
|
262
275
|
status = status_map[inst['status']]
|
|
263
276
|
if non_terminated_only and status is None:
|
|
264
277
|
continue
|
|
265
|
-
statuses[inst_id] = status
|
|
278
|
+
statuses[inst_id] = (status, None)
|
|
266
279
|
return statuses
|
|
267
280
|
|
|
268
281
|
|