skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/provision/nebius/utils.py
CHANGED
|
@@ -1,16 +1,21 @@
|
|
|
1
1
|
"""Nebius library wrapper for SkyPilot."""
|
|
2
2
|
import time
|
|
3
|
-
from typing import Any, Dict
|
|
3
|
+
from typing import Any, Dict, List, Optional
|
|
4
4
|
import uuid
|
|
5
5
|
|
|
6
6
|
from sky import sky_logging
|
|
7
|
+
from sky import skypilot_config
|
|
7
8
|
from sky.adaptors import nebius
|
|
9
|
+
from sky.provision.nebius import constants as nebius_constants
|
|
8
10
|
from sky.utils import common_utils
|
|
11
|
+
from sky.utils import resources_utils
|
|
9
12
|
|
|
10
13
|
logger = sky_logging.init_logger(__name__)
|
|
11
14
|
|
|
12
15
|
POLL_INTERVAL = 5
|
|
13
16
|
|
|
17
|
+
_MAX_OPERATIONS_TO_FETCH = 1000
|
|
18
|
+
|
|
14
19
|
|
|
15
20
|
def retry(func):
|
|
16
21
|
"""Decorator to retry a function."""
|
|
@@ -33,68 +38,43 @@ def retry(func):
|
|
|
33
38
|
|
|
34
39
|
def get_project_by_region(region: str) -> str:
|
|
35
40
|
service = nebius.iam().ProjectServiceClient(nebius.sdk())
|
|
36
|
-
projects =
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
# information directly from the project. Additionally, there is only one
|
|
41
|
-
# project per region, and projects cannot be created at this time.
|
|
42
|
-
# The region is determined from the project ID using a region-specific
|
|
43
|
-
# identifier embedded in it.
|
|
44
|
-
# Project id looks like project-e00xxxxxxxxxxxxxx where
|
|
45
|
-
# e00 - id of region 'eu-north1'
|
|
46
|
-
# e01 - id of region 'eu-west1'
|
|
47
|
-
region_ids = {'eu-north1': 'e00', 'eu-west1': 'e01'}
|
|
48
|
-
# TODO(SalikovAlex): fix when info about region will be in projects list
|
|
49
|
-
# Currently, Nebius cloud supports 2 regions. We manually enumerate
|
|
50
|
-
# them here. Reference: https://docs.nebius.com/overview/regions
|
|
41
|
+
projects = nebius.sync_call(
|
|
42
|
+
service.list(
|
|
43
|
+
nebius.iam().ListProjectsRequest(parent_id=nebius.get_tenant_id()),
|
|
44
|
+
timeout=nebius.READ_TIMEOUT))
|
|
51
45
|
|
|
52
46
|
# Check is there project if in config
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
logger.warning(
|
|
58
|
-
f'Can\'t use customized NEBIUS_PROJECT_ID ({preferable_project_id})'
|
|
59
|
-
f' for region {region}. Please check if the project ID is correct.')
|
|
47
|
+
project_id = skypilot_config.get_effective_region_config(
|
|
48
|
+
cloud='nebius', region=region, keys=('project_id',), default_value=None)
|
|
49
|
+
if project_id is not None:
|
|
50
|
+
return project_id
|
|
60
51
|
for project in projects.items:
|
|
61
|
-
if project.
|
|
52
|
+
if project.status.region == region:
|
|
62
53
|
return project.metadata.id
|
|
63
54
|
raise Exception(f'No project found for region "{region}".')
|
|
64
55
|
|
|
65
56
|
|
|
66
|
-
def get_or_create_gpu_cluster(name: str,
|
|
57
|
+
def get_or_create_gpu_cluster(name: str, project_id: str, fabric: str) -> str:
|
|
67
58
|
"""Creates a GPU cluster.
|
|
68
|
-
When creating a GPU cluster, select an InfiniBand fabric for it:
|
|
69
|
-
|
|
70
|
-
fabric-2, fabric-3 or fabric-4 for projects in the eu-north1 region.
|
|
71
|
-
fabric-5 for projects in the eu-west1 region.
|
|
72
|
-
|
|
73
59
|
https://docs.nebius.com/compute/clusters/gpu
|
|
74
60
|
"""
|
|
75
|
-
project_id = get_project_by_region(region)
|
|
76
61
|
service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
|
|
77
62
|
try:
|
|
78
|
-
cluster =
|
|
79
|
-
|
|
80
|
-
name=name,
|
|
81
|
-
)).wait()
|
|
82
|
-
cluster_id = cluster.metadata.id
|
|
83
|
-
except nebius.request_error() as no_cluster_found_error:
|
|
84
|
-
if region == 'eu-north1':
|
|
85
|
-
fabric = 'fabric-4'
|
|
86
|
-
elif region == 'eu-west1':
|
|
87
|
-
fabric = 'fabric-5'
|
|
88
|
-
else:
|
|
89
|
-
raise RuntimeError(
|
|
90
|
-
f'Unsupported region {region}.') from no_cluster_found_error
|
|
91
|
-
cluster = service.create(nebius.compute().CreateGpuClusterRequest(
|
|
92
|
-
metadata=nebius.nebius_common().ResourceMetadata(
|
|
63
|
+
cluster = nebius.sync_call(
|
|
64
|
+
service.get_by_name(nebius.nebius_common().GetByNameRequest(
|
|
93
65
|
parent_id=project_id,
|
|
94
66
|
name=name,
|
|
95
|
-
)
|
|
96
|
-
|
|
97
|
-
|
|
67
|
+
)))
|
|
68
|
+
cluster_id = cluster.metadata.id
|
|
69
|
+
except nebius.request_error():
|
|
70
|
+
cluster = nebius.sync_call(
|
|
71
|
+
service.create(nebius.compute().CreateGpuClusterRequest(
|
|
72
|
+
metadata=nebius.nebius_common().ResourceMetadata(
|
|
73
|
+
parent_id=project_id,
|
|
74
|
+
name=name,
|
|
75
|
+
),
|
|
76
|
+
spec=nebius.compute().GpuClusterSpec(
|
|
77
|
+
infiniband_fabric=fabric))))
|
|
98
78
|
cluster_id = cluster.resource_id
|
|
99
79
|
return cluster_id
|
|
100
80
|
|
|
@@ -104,14 +84,16 @@ def delete_cluster(name: str, region: str) -> None:
|
|
|
104
84
|
project_id = get_project_by_region(region)
|
|
105
85
|
service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
|
|
106
86
|
try:
|
|
107
|
-
cluster =
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
87
|
+
cluster = nebius.sync_call(
|
|
88
|
+
service.get_by_name(nebius.nebius_common().GetByNameRequest(
|
|
89
|
+
parent_id=project_id,
|
|
90
|
+
name=name,
|
|
91
|
+
)))
|
|
111
92
|
cluster_id = cluster.metadata.id
|
|
112
93
|
logger.debug(f'Found GPU Cluster : {cluster_id}.')
|
|
113
|
-
|
|
114
|
-
|
|
94
|
+
nebius.sync_call(
|
|
95
|
+
service.delete(
|
|
96
|
+
nebius.compute().DeleteGpuClusterRequest(id=cluster_id)))
|
|
115
97
|
logger.debug(f'Deleted GPU Cluster : {cluster_id}.')
|
|
116
98
|
except nebius.request_error():
|
|
117
99
|
logger.debug('GPU Cluster does not exist.')
|
|
@@ -120,13 +102,23 @@ def delete_cluster(name: str, region: str) -> None:
|
|
|
120
102
|
def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
|
|
121
103
|
"""Lists instances associated with API key."""
|
|
122
104
|
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
105
|
+
page_token = ''
|
|
106
|
+
instances = []
|
|
107
|
+
while True:
|
|
108
|
+
result = nebius.sync_call(
|
|
109
|
+
service.list(nebius.compute().ListInstancesRequest(
|
|
110
|
+
parent_id=project_id,
|
|
111
|
+
page_size=100,
|
|
112
|
+
page_token=page_token,
|
|
113
|
+
),
|
|
114
|
+
timeout=nebius.READ_TIMEOUT))
|
|
115
|
+
instances.extend(result.items)
|
|
116
|
+
if not result.next_page_token: # "" means no more pages
|
|
117
|
+
break
|
|
118
|
+
page_token = result.next_page_token
|
|
127
119
|
|
|
128
120
|
instance_dict: Dict[str, Dict[str, Any]] = {}
|
|
129
|
-
for instance in instances
|
|
121
|
+
for instance in instances:
|
|
130
122
|
info = {}
|
|
131
123
|
info['status'] = instance.status.state.name
|
|
132
124
|
info['name'] = instance.metadata.name
|
|
@@ -142,12 +134,13 @@ def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
|
|
|
142
134
|
|
|
143
135
|
def stop(instance_id: str) -> None:
|
|
144
136
|
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
145
|
-
|
|
137
|
+
nebius.sync_call(
|
|
138
|
+
service.stop(nebius.compute().StopInstanceRequest(id=instance_id)))
|
|
146
139
|
retry_count = 0
|
|
147
140
|
while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_STOP:
|
|
148
141
|
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
149
|
-
instance =
|
|
150
|
-
id=instance_id,))
|
|
142
|
+
instance = nebius.sync_call(
|
|
143
|
+
service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
|
|
151
144
|
if instance.status.state.name == 'STOPPED':
|
|
152
145
|
break
|
|
153
146
|
time.sleep(POLL_INTERVAL)
|
|
@@ -164,12 +157,13 @@ def stop(instance_id: str) -> None:
|
|
|
164
157
|
|
|
165
158
|
def start(instance_id: str) -> None:
|
|
166
159
|
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
167
|
-
|
|
160
|
+
nebius.sync_call(
|
|
161
|
+
service.start(nebius.compute().StartInstanceRequest(id=instance_id)))
|
|
168
162
|
retry_count = 0
|
|
169
163
|
while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_START:
|
|
170
164
|
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
171
|
-
instance =
|
|
172
|
-
id=instance_id,))
|
|
165
|
+
instance = nebius.sync_call(
|
|
166
|
+
service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
|
|
173
167
|
if instance.status.state.name == 'RUNNING':
|
|
174
168
|
break
|
|
175
169
|
time.sleep(POLL_INTERVAL)
|
|
@@ -184,9 +178,19 @@ def start(instance_id: str) -> None:
|
|
|
184
178
|
f' to be ready.')
|
|
185
179
|
|
|
186
180
|
|
|
187
|
-
def launch(cluster_name_on_cloud: str,
|
|
188
|
-
|
|
189
|
-
|
|
181
|
+
def launch(cluster_name_on_cloud: str,
|
|
182
|
+
node_type: str,
|
|
183
|
+
platform: str,
|
|
184
|
+
preset: str,
|
|
185
|
+
region: str,
|
|
186
|
+
image_family: str,
|
|
187
|
+
disk_size: int,
|
|
188
|
+
user_data: str,
|
|
189
|
+
associate_public_ip_address: bool,
|
|
190
|
+
filesystems: List[Dict[str, Any]],
|
|
191
|
+
use_static_ip_address: bool = False,
|
|
192
|
+
use_spot: bool = False,
|
|
193
|
+
network_tier: Optional[resources_utils.NetworkTier] = None) -> str:
|
|
190
194
|
# Each node must have a unique name to avoid conflicts between
|
|
191
195
|
# multiple worker VMs. To ensure uniqueness,a UUID is appended
|
|
192
196
|
# to the node name.
|
|
@@ -196,34 +200,59 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
|
|
|
196
200
|
|
|
197
201
|
disk_name = 'disk-' + instance_name
|
|
198
202
|
cluster_id = None
|
|
203
|
+
project_id = get_project_by_region(region)
|
|
199
204
|
# 8 GPU virtual machines can be grouped into a GPU cluster.
|
|
200
205
|
# The GPU clusters are built with InfiniBand secure high-speed networking.
|
|
201
206
|
# https://docs.nebius.com/compute/clusters/gpu
|
|
202
|
-
if platform in
|
|
207
|
+
if platform in nebius_constants.INFINIBAND_INSTANCE_PLATFORMS:
|
|
203
208
|
if preset == '8gpu-128vcpu-1600gb':
|
|
204
|
-
|
|
205
|
-
|
|
209
|
+
fabric = skypilot_config.get_effective_region_config(
|
|
210
|
+
cloud='nebius',
|
|
211
|
+
region=region,
|
|
212
|
+
keys=('fabric',),
|
|
213
|
+
default_value=None)
|
|
214
|
+
|
|
215
|
+
# Auto-select fabric if network_tier=best and no fabric configured
|
|
216
|
+
if (fabric is None and
|
|
217
|
+
str(network_tier) == str(resources_utils.NetworkTier.BEST)):
|
|
218
|
+
try:
|
|
219
|
+
fabric = nebius_constants.get_default_fabric(
|
|
220
|
+
platform, region)
|
|
221
|
+
logger.info(f'Auto-selected InfiniBand fabric {fabric} '
|
|
222
|
+
f'for {platform} in {region}')
|
|
223
|
+
except ValueError as e:
|
|
224
|
+
logger.warning(
|
|
225
|
+
f'InfiniBand fabric auto-selection failed: {e}')
|
|
226
|
+
|
|
227
|
+
if fabric is None:
|
|
228
|
+
logger.warning(
|
|
229
|
+
f'Set up fabric for region {region} in ~/.sky/config.yaml '
|
|
230
|
+
'to use GPU clusters.')
|
|
231
|
+
else:
|
|
232
|
+
cluster_id = get_or_create_gpu_cluster(cluster_name_on_cloud,
|
|
233
|
+
project_id, fabric)
|
|
206
234
|
|
|
207
|
-
project_id = get_project_by_region(region)
|
|
208
235
|
service = nebius.compute().DiskServiceClient(nebius.sdk())
|
|
209
|
-
disk =
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
236
|
+
disk = nebius.sync_call(
|
|
237
|
+
service.create(nebius.compute().CreateDiskRequest(
|
|
238
|
+
metadata=nebius.nebius_common().ResourceMetadata(
|
|
239
|
+
parent_id=project_id,
|
|
240
|
+
name=disk_name,
|
|
241
|
+
),
|
|
242
|
+
spec=nebius.compute().DiskSpec(
|
|
243
|
+
source_image_family=nebius.compute().SourceImageFamily(
|
|
244
|
+
image_family=image_family),
|
|
245
|
+
size_gibibytes=disk_size,
|
|
246
|
+
type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
|
|
247
|
+
))))
|
|
220
248
|
disk_id = disk.resource_id
|
|
221
249
|
retry_count = 0
|
|
222
250
|
while retry_count < nebius.MAX_RETRIES_TO_DISK_CREATE:
|
|
223
|
-
disk =
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
251
|
+
disk = nebius.sync_call(
|
|
252
|
+
service.get_by_name(nebius.nebius_common().GetByNameRequest(
|
|
253
|
+
parent_id=project_id,
|
|
254
|
+
name=disk_name,
|
|
255
|
+
)))
|
|
227
256
|
if disk.status.state.name == 'READY':
|
|
228
257
|
break
|
|
229
258
|
logger.debug(f'Waiting for disk {disk_name} to be ready.')
|
|
@@ -237,73 +266,144 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
|
|
|
237
266
|
f' seconds) while waiting for disk {disk_name}'
|
|
238
267
|
f' to be ready.')
|
|
239
268
|
|
|
269
|
+
filesystems_spec = []
|
|
270
|
+
if filesystems:
|
|
271
|
+
for fs in filesystems:
|
|
272
|
+
filesystems_spec.append(nebius.compute().AttachedFilesystemSpec(
|
|
273
|
+
mount_tag=fs['filesystem_mount_tag'],
|
|
274
|
+
attach_mode=nebius.compute().AttachedFilesystemSpec.AttachMode[
|
|
275
|
+
fs['filesystem_attach_mode']],
|
|
276
|
+
existing_filesystem=nebius.compute().ExistingFilesystem(
|
|
277
|
+
id=fs['filesystem_id'])))
|
|
278
|
+
|
|
240
279
|
service = nebius.vpc().SubnetServiceClient(nebius.sdk())
|
|
241
|
-
sub_net =
|
|
242
|
-
parent_id=project_id,))
|
|
280
|
+
sub_net = nebius.sync_call(
|
|
281
|
+
service.list(nebius.vpc().ListSubnetsRequest(parent_id=project_id,)))
|
|
243
282
|
|
|
244
283
|
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
284
|
+
logger.debug(f'Creating instance {instance_name} in project {project_id}.')
|
|
285
|
+
try:
|
|
286
|
+
nebius.sync_call(
|
|
287
|
+
service.create(nebius.compute().CreateInstanceRequest(
|
|
288
|
+
metadata=nebius.nebius_common().ResourceMetadata(
|
|
289
|
+
parent_id=project_id,
|
|
290
|
+
name=instance_name,
|
|
291
|
+
),
|
|
292
|
+
spec=nebius.compute().InstanceSpec(
|
|
293
|
+
gpu_cluster=nebius.compute().InstanceGpuClusterSpec(
|
|
294
|
+
id=cluster_id,) if cluster_id is not None else None,
|
|
295
|
+
boot_disk=nebius.compute().AttachedDiskSpec(
|
|
296
|
+
attach_mode=nebius.compute(
|
|
297
|
+
).AttachedDiskSpec.AttachMode.READ_WRITE,
|
|
298
|
+
existing_disk=nebius.compute().ExistingDisk(
|
|
299
|
+
id=disk_id)),
|
|
300
|
+
cloud_init_user_data=user_data,
|
|
301
|
+
resources=nebius.compute().ResourcesSpec(platform=platform,
|
|
302
|
+
preset=preset),
|
|
303
|
+
filesystems=filesystems_spec if filesystems_spec else None,
|
|
304
|
+
network_interfaces=[
|
|
305
|
+
nebius.compute().NetworkInterfaceSpec(
|
|
306
|
+
subnet_id=sub_net.items[0].metadata.id,
|
|
307
|
+
ip_address=nebius.compute().IPAddress(),
|
|
308
|
+
name='network-interface-0',
|
|
309
|
+
public_ip_address=nebius.compute().PublicIPAddress(
|
|
310
|
+
static=use_static_ip_address)
|
|
311
|
+
if associate_public_ip_address else None,
|
|
312
|
+
)
|
|
313
|
+
],
|
|
314
|
+
recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
|
|
315
|
+
if use_spot else None,
|
|
316
|
+
preemptible=nebius.compute().PreemptibleSpec(
|
|
317
|
+
priority=1,
|
|
318
|
+
on_preemption=nebius.compute().PreemptibleSpec.
|
|
319
|
+
PreemptionPolicy.STOP) if use_spot else None,
|
|
320
|
+
))))
|
|
321
|
+
instance_id = ''
|
|
322
|
+
retry_count = 0
|
|
323
|
+
while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
|
|
324
|
+
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
325
|
+
instance = nebius.sync_call(
|
|
326
|
+
service.get_by_name(nebius.nebius_common().GetByNameRequest(
|
|
327
|
+
parent_id=project_id,
|
|
328
|
+
name=instance_name,
|
|
329
|
+
)))
|
|
276
330
|
instance_id = instance.metadata.id
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
331
|
+
if instance.status.state.name == 'STARTING':
|
|
332
|
+
break
|
|
333
|
+
|
|
334
|
+
# All Instances initially have state=STOPPED and reconciling=True,
|
|
335
|
+
# so we need to wait until reconciling is False.
|
|
336
|
+
if instance.status.state.name == 'STOPPED' and \
|
|
337
|
+
not instance.status.reconciling:
|
|
338
|
+
next_token = ''
|
|
339
|
+
total_operations = 0
|
|
340
|
+
while True:
|
|
341
|
+
operations_response = nebius.sync_call(
|
|
342
|
+
service.list_operations_by_parent(
|
|
343
|
+
nebius.compute().ListOperationsByParentRequest(
|
|
344
|
+
parent_id=project_id,
|
|
345
|
+
page_size=100,
|
|
346
|
+
page_token=next_token,
|
|
347
|
+
)))
|
|
348
|
+
total_operations += len(operations_response.operations)
|
|
349
|
+
for operation in operations_response.operations:
|
|
350
|
+
# Find the most recent operation for the instance.
|
|
351
|
+
if operation.resource_id == instance_id:
|
|
352
|
+
error_msg = operation.description
|
|
353
|
+
if operation.status:
|
|
354
|
+
error_msg += f' {operation.status.message}'
|
|
355
|
+
raise RuntimeError(error_msg)
|
|
356
|
+
# If we've fetched too many operations, or there are no more
|
|
357
|
+
# operations to fetch, just raise a generic error.
|
|
358
|
+
if total_operations > _MAX_OPERATIONS_TO_FETCH or \
|
|
359
|
+
not operations_response.next_page_token:
|
|
360
|
+
raise RuntimeError(
|
|
361
|
+
f'Instance {instance_name} failed to start.')
|
|
362
|
+
next_token = operations_response.next_page_token
|
|
363
|
+
time.sleep(POLL_INTERVAL)
|
|
364
|
+
logger.debug(
|
|
365
|
+
f'Waiting for instance {instance_name} to start running. '
|
|
366
|
+
f'State: {instance.status.state.name}, '
|
|
367
|
+
f'Reconciling: {instance.status.reconciling}')
|
|
368
|
+
retry_count += 1
|
|
281
369
|
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
370
|
+
if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
|
|
371
|
+
raise TimeoutError(
|
|
372
|
+
f'Exceeded maximum retries '
|
|
373
|
+
f'({nebius.MAX_RETRIES_TO_INSTANCE_READY * POLL_INTERVAL}'
|
|
374
|
+
f' seconds) while waiting for instance {instance_name}'
|
|
375
|
+
f' to be ready.')
|
|
376
|
+
except nebius.request_error() as e:
|
|
377
|
+
# Handle ResourceExhausted quota limit error. In this case, we need to
|
|
378
|
+
# clean up the disk as VM creation failed and we can't proceed.
|
|
379
|
+
# It cannot be handled by the caller (provisioner)'s teardown logic,
|
|
380
|
+
# as we cannot retrieve the disk id, after the instance creation
|
|
381
|
+
# fails
|
|
382
|
+
logger.warning(f'Failed to launch instance {instance_name}: {e}')
|
|
383
|
+
service = nebius.compute().DiskServiceClient(nebius.sdk())
|
|
384
|
+
nebius.sync_call(
|
|
385
|
+
service.delete(nebius.compute().DeleteDiskRequest(id=disk_id)))
|
|
386
|
+
logger.debug(f'Disk {disk_id} deleted.')
|
|
387
|
+
raise e
|
|
288
388
|
return instance_id
|
|
289
389
|
|
|
290
390
|
|
|
291
391
|
def remove(instance_id: str) -> None:
|
|
292
392
|
"""Terminates the given instance."""
|
|
293
393
|
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
294
|
-
result =
|
|
295
|
-
nebius.compute().GetInstanceRequest(id=instance_id))
|
|
394
|
+
result = nebius.sync_call(
|
|
395
|
+
service.get(nebius.compute().GetInstanceRequest(id=instance_id)))
|
|
296
396
|
disk_id = result.spec.boot_disk.existing_disk.id
|
|
297
|
-
|
|
298
|
-
nebius.compute().DeleteInstanceRequest(id=instance_id))
|
|
397
|
+
nebius.sync_call(
|
|
398
|
+
service.delete(nebius.compute().DeleteInstanceRequest(id=instance_id)))
|
|
299
399
|
retry_count = 0
|
|
300
400
|
# The instance begins deleting and attempts to delete the disk.
|
|
301
401
|
# Must wait until the disk is unlocked and becomes deletable.
|
|
302
402
|
while retry_count < nebius.MAX_RETRIES_TO_DISK_DELETE:
|
|
303
403
|
try:
|
|
304
404
|
service = nebius.compute().DiskServiceClient(nebius.sdk())
|
|
305
|
-
|
|
306
|
-
nebius.compute().DeleteDiskRequest(id=disk_id))
|
|
405
|
+
nebius.sync_call(
|
|
406
|
+
service.delete(nebius.compute().DeleteDiskRequest(id=disk_id)))
|
|
307
407
|
break
|
|
308
408
|
except nebius.request_error():
|
|
309
409
|
logger.debug('Waiting for disk deletion.')
|
sky/provision/oci/instance.py
CHANGED
|
@@ -10,7 +10,7 @@ import copy
|
|
|
10
10
|
from datetime import datetime
|
|
11
11
|
import time
|
|
12
12
|
import typing
|
|
13
|
-
from typing import Any, Dict, List, Optional
|
|
13
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
14
14
|
|
|
15
15
|
from sky import exceptions
|
|
16
16
|
from sky import sky_logging
|
|
@@ -32,10 +32,12 @@ logger = sky_logging.init_logger(__name__)
|
|
|
32
32
|
@query_utils.debug_enabled(logger)
|
|
33
33
|
@common_utils.retry
|
|
34
34
|
def query_instances(
|
|
35
|
+
cluster_name: str,
|
|
35
36
|
cluster_name_on_cloud: str,
|
|
36
37
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
37
38
|
non_terminated_only: bool = True,
|
|
38
|
-
|
|
39
|
+
retry_if_missing: bool = False,
|
|
40
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
39
41
|
"""Query instances.
|
|
40
42
|
|
|
41
43
|
Returns a dictionary of instance IDs and status.
|
|
@@ -43,11 +45,13 @@ def query_instances(
|
|
|
43
45
|
A None status means the instance is marked as "terminated"
|
|
44
46
|
or "terminating".
|
|
45
47
|
"""
|
|
48
|
+
del cluster_name, retry_if_missing # unused
|
|
46
49
|
assert provider_config is not None, cluster_name_on_cloud
|
|
47
50
|
region = provider_config['region']
|
|
48
51
|
|
|
49
52
|
status_map = oci_utils.oci_config.STATE_MAPPING_OCI_TO_SKY
|
|
50
|
-
statuses: Dict[str, Optional['status_lib.ClusterStatus']
|
|
53
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
54
|
+
Optional[str]]] = {}
|
|
51
55
|
filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
|
|
52
56
|
|
|
53
57
|
instances = _get_filtered_nodes(region, filters)
|
|
@@ -56,15 +60,16 @@ def query_instances(
|
|
|
56
60
|
sky_status = status_map[vm_status]
|
|
57
61
|
if non_terminated_only and sky_status is None:
|
|
58
62
|
continue
|
|
59
|
-
statuses[node['inst_id']] = sky_status
|
|
63
|
+
statuses[node['inst_id']] = (sky_status, None)
|
|
60
64
|
|
|
61
65
|
return statuses
|
|
62
66
|
|
|
63
67
|
|
|
64
68
|
@query_utils.debug_enabled(logger)
|
|
65
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
69
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
66
70
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
67
71
|
"""Start instances with bootstrapped configuration."""
|
|
72
|
+
del cluster_name # unused
|
|
68
73
|
tags = dict(sorted(copy.deepcopy(config.tags).items()))
|
|
69
74
|
|
|
70
75
|
start_time = round(time.time() * 1000)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Paperspace instance provisioning."""
|
|
2
2
|
|
|
3
3
|
import time
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
|
|
6
6
|
from sky import sky_logging
|
|
7
7
|
from sky.provision import common
|
|
@@ -48,10 +48,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
48
48
|
return head_instance_id
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
51
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
52
52
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
53
53
|
"""Runs instances for the given cluster."""
|
|
54
|
-
|
|
54
|
+
del cluster_name # unused
|
|
55
55
|
pending_status = [
|
|
56
56
|
'starting', 'restarting', 'upgrading', 'provisioning', 'stopping'
|
|
57
57
|
]
|
|
@@ -277,12 +277,14 @@ def get_cluster_info(
|
|
|
277
277
|
|
|
278
278
|
|
|
279
279
|
def query_instances(
|
|
280
|
+
cluster_name: str,
|
|
280
281
|
cluster_name_on_cloud: str,
|
|
281
282
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
282
283
|
non_terminated_only: bool = True,
|
|
283
|
-
|
|
284
|
+
retry_if_missing: bool = False,
|
|
285
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
284
286
|
"""See sky/provision/__init__.py"""
|
|
285
|
-
del non_terminated_only
|
|
287
|
+
del cluster_name, non_terminated_only, retry_if_missing #unused
|
|
286
288
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
287
289
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
288
290
|
|
|
@@ -297,10 +299,11 @@ def query_instances(
|
|
|
297
299
|
'ready': status_lib.ClusterStatus.UP,
|
|
298
300
|
'off': status_lib.ClusterStatus.STOPPED,
|
|
299
301
|
}
|
|
300
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
302
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
303
|
+
Optional[str]]] = {}
|
|
301
304
|
for inst_id, inst in instances.items():
|
|
302
305
|
status = status_map[inst['state']]
|
|
303
|
-
statuses[inst_id] = status
|
|
306
|
+
statuses[inst_id] = (status, None)
|
|
304
307
|
return statuses
|
|
305
308
|
|
|
306
309
|
|
|
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Union
|
|
|
8
8
|
|
|
9
9
|
from sky import sky_logging
|
|
10
10
|
from sky.adaptors import common as adaptors_common
|
|
11
|
-
|
|
11
|
+
from sky.provision.paperspace import constants
|
|
12
12
|
from sky.utils import common_utils
|
|
13
13
|
|
|
14
14
|
if typing.TYPE_CHECKING:
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Prime Intellect provisioner for SkyPilot."""
|
|
2
|
+
|
|
3
|
+
from sky.provision.primeintellect.config import bootstrap_instances
|
|
4
|
+
from sky.provision.primeintellect.instance import cleanup_ports
|
|
5
|
+
from sky.provision.primeintellect.instance import get_cluster_info
|
|
6
|
+
from sky.provision.primeintellect.instance import query_instances
|
|
7
|
+
from sky.provision.primeintellect.instance import run_instances
|
|
8
|
+
from sky.provision.primeintellect.instance import stop_instances
|
|
9
|
+
from sky.provision.primeintellect.instance import terminate_instances
|
|
10
|
+
from sky.provision.primeintellect.instance import wait_instances
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Prime Intellect configuration bootstrapping."""
|
|
2
|
+
|
|
3
|
+
from sky.provision import common
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def bootstrap_instances(
|
|
7
|
+
region: str, cluster_name: str,
|
|
8
|
+
config: common.ProvisionConfig) -> common.ProvisionConfig:
|
|
9
|
+
"""Bootstraps instances for the given cluster."""
|
|
10
|
+
del region, cluster_name # unused
|
|
11
|
+
return config
|