skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +173 -30
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1323 -397
- sky/backends/cloud_vm_ray_backend.py +1749 -1029
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +55 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +90 -49
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +338 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +533 -185
- sky/cli.py +5 -5975
- sky/client/{cli.py → cli/command.py} +2591 -1956
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +322 -0
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +78 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1219 -319
- sky/client/sdk_async.py +827 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +564 -164
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +140 -40
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +570 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +210 -81
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -86
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +231 -167
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +375 -173
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1460 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +126 -2
- sky/execution.py +216 -63
- sky/global_user_state.py +2390 -586
- sky/jobs/__init__.py +7 -0
- sky/jobs/client/sdk.py +300 -58
- sky/jobs/client/sdk_async.py +161 -0
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +848 -275
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +402 -152
- sky/jobs/scheduler.py +314 -189
- sky/jobs/server/core.py +836 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2109 -706
- sky/jobs/utils.py +1306 -215
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +453 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +136 -50
- sky/provision/azure/instance.py +11 -6
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +140 -33
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +101 -20
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +919 -280
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +44 -43
- sky/provision/kubernetes/utils.py +1221 -534
- sky/provision/kubernetes/volume.py +343 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +237 -137
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +117 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +214 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +707 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +4 -4
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +17 -20
- sky/py.typed +0 -0
- sky/resources.py +845 -119
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +227 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +12 -9
- sky/serve/controller.py +68 -17
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +116 -2
- sky/serve/replica_managers.py +434 -249
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +569 -257
- sky/serve/serve_utils.py +775 -265
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +192 -89
- sky/serve/service_spec.py +144 -20
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +202 -0
- sky/server/common.py +478 -182
- sky/server/config.py +85 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +295 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +558 -138
- sky/server/requests/payloads.py +364 -24
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +822 -226
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +455 -0
- sky/server/server.py +1309 -285
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +327 -61
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +11 -1
- sky/setup_files/alembic.ini +160 -0
- sky/setup_files/dependencies.py +139 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +114 -7
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +29 -20
- sky/skylet/constants.py +216 -25
- sky/skylet/events.py +101 -21
- sky/skylet/job_lib.py +345 -164
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +568 -0
- sky/skylet/skylet.py +72 -4
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +685 -163
- sky/templates/aws-ray.yml.j2 +11 -3
- sky/templates/azure-ray.yml.j2 +2 -1
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +2 -1
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +62 -1
- sky/templates/hyperbolic-ray.yml.j2 +68 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +611 -50
- sky/templates/lambda-ray.yml.j2 +2 -1
- sky/templates/nebius-ray.yml.j2 +34 -12
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +2 -1
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +10 -1
- sky/templates/scp-ray.yml.j2 +4 -50
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +212 -37
- sky/usage/usage_lib.py +31 -15
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +397 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +35 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +38 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +314 -91
- sky/utils/config_utils.py +74 -5
- sky/utils/context.py +403 -0
- sky/utils/context_utils.py +242 -0
- sky/utils/controller_utils.py +383 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +485 -0
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +137 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +15 -29
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +18 -8
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +416 -0
- sky/utils/log_utils.py +82 -107
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +217 -39
- sky/utils/schemas.py +955 -160
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +29 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +26 -53
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +165 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +150 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +270 -0
- sky/volumes/server/server.py +124 -0
- sky/volumes/volume.py +215 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
- skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/clouds/nebius.py
CHANGED
|
@@ -1,25 +1,22 @@
|
|
|
1
1
|
""" Nebius Cloud. """
|
|
2
|
+
import json
|
|
2
3
|
import os
|
|
3
4
|
import typing
|
|
4
|
-
from typing import Dict, Iterator, List, Optional, Tuple, Union
|
|
5
|
+
from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
|
|
5
6
|
|
|
7
|
+
from sky import catalog
|
|
6
8
|
from sky import clouds
|
|
9
|
+
from sky import exceptions
|
|
10
|
+
from sky import skypilot_config
|
|
7
11
|
from sky.adaptors import nebius
|
|
8
|
-
from sky.
|
|
12
|
+
from sky.provision.nebius import constants as nebius_constants
|
|
9
13
|
from sky.utils import annotations
|
|
10
14
|
from sky.utils import registry
|
|
11
15
|
from sky.utils import resources_utils
|
|
12
16
|
|
|
13
17
|
if typing.TYPE_CHECKING:
|
|
14
18
|
from sky import resources as resources_lib
|
|
15
|
-
|
|
16
|
-
_CREDENTIAL_FILES = [
|
|
17
|
-
# credential files for Nebius
|
|
18
|
-
nebius.NEBIUS_TENANT_ID_FILENAME,
|
|
19
|
-
nebius.NEBIUS_IAM_TOKEN_FILENAME,
|
|
20
|
-
nebius.NEBIUS_PROJECT_ID_FILENAME,
|
|
21
|
-
nebius.NEBIUS_CREDENTIALS_FILENAME
|
|
22
|
-
]
|
|
19
|
+
from sky.utils import volume as volume_lib
|
|
23
20
|
|
|
24
21
|
_INDENT_PREFIX = ' '
|
|
25
22
|
|
|
@@ -55,14 +52,18 @@ class Nebius(clouds.Cloud):
|
|
|
55
52
|
_CLOUD_UNSUPPORTED_FEATURES = {
|
|
56
53
|
clouds.CloudImplementationFeatures.AUTODOWN:
|
|
57
54
|
('Autodown not supported. Can\'t delete OS disk.'),
|
|
58
|
-
clouds.CloudImplementationFeatures.SPOT_INSTANCE:
|
|
59
|
-
('Spot is not supported, as Nebius API does not implement spot.'),
|
|
60
55
|
clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
|
|
61
56
|
(f'Migrating disk is currently not supported on {_REPR}.'),
|
|
62
57
|
clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
|
|
63
58
|
(f'Custom disk tier is currently not supported on {_REPR}.'),
|
|
59
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
|
|
60
|
+
('Custom network tier is currently only supported for '
|
|
61
|
+
'H100:8 and H200:8 on Nebius.'),
|
|
64
62
|
clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
|
|
65
63
|
('High availability controllers are not supported on Nebius.'),
|
|
64
|
+
clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK:
|
|
65
|
+
('Customized multiple network interfaces are not supported on '
|
|
66
|
+
f'{_REPR}.'),
|
|
66
67
|
}
|
|
67
68
|
# Nebius maximum instance name length defined as <= 63 as a hostname length
|
|
68
69
|
# 63 - 8 - 5 = 50 characters since
|
|
@@ -77,25 +78,43 @@ class Nebius(clouds.Cloud):
|
|
|
77
78
|
|
|
78
79
|
@classmethod
|
|
79
80
|
def _unsupported_features_for_resources(
|
|
80
|
-
cls,
|
|
81
|
+
cls,
|
|
82
|
+
resources: 'resources_lib.Resources',
|
|
83
|
+
region: Optional[str] = None,
|
|
81
84
|
) -> Dict[clouds.CloudImplementationFeatures, str]:
|
|
82
|
-
|
|
83
|
-
|
|
85
|
+
unsupported = cls._CLOUD_UNSUPPORTED_FEATURES.copy()
|
|
86
|
+
|
|
87
|
+
# Check if the accelerators support InfiniBand (H100 or H200) and 8 GPUs
|
|
88
|
+
if resources.accelerators is not None:
|
|
89
|
+
for acc_name, acc_count in resources.accelerators.items():
|
|
90
|
+
if acc_name.lower() in ('h100', 'h200') and acc_count == 8:
|
|
91
|
+
# Remove CUSTOM_NETWORK_TIER from unsupported features for
|
|
92
|
+
# InfiniBand-capable accelerators. Refer to:
|
|
93
|
+
# https://docs.nebius.com/compute/clusters/gpu#fabrics
|
|
94
|
+
unsupported.pop(
|
|
95
|
+
clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER,
|
|
96
|
+
None)
|
|
97
|
+
break
|
|
98
|
+
|
|
99
|
+
return unsupported
|
|
84
100
|
|
|
85
101
|
@classmethod
|
|
86
102
|
def _max_cluster_name_length(cls) -> Optional[int]:
|
|
87
103
|
return cls._MAX_CLUSTER_NAME_LEN_LIMIT
|
|
88
104
|
|
|
89
105
|
@classmethod
|
|
90
|
-
def regions_with_offering(
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
106
|
+
def regions_with_offering(
|
|
107
|
+
cls,
|
|
108
|
+
instance_type: str,
|
|
109
|
+
accelerators: Optional[Dict[str, int]],
|
|
110
|
+
use_spot: bool,
|
|
111
|
+
region: Optional[str],
|
|
112
|
+
zone: Optional[str],
|
|
113
|
+
resources: Optional['resources_lib.Resources'] = None,
|
|
114
|
+
) -> List[clouds.Region]:
|
|
94
115
|
assert zone is None, 'Nebius does not support zones.'
|
|
95
116
|
del accelerators, zone # unused
|
|
96
|
-
|
|
97
|
-
return []
|
|
98
|
-
regions = service_catalog.get_region_zones_for_instance_type(
|
|
117
|
+
regions = catalog.get_region_zones_for_instance_type(
|
|
99
118
|
instance_type, use_spot, 'nebius')
|
|
100
119
|
|
|
101
120
|
if region is not None:
|
|
@@ -107,8 +126,8 @@ class Nebius(clouds.Cloud):
|
|
|
107
126
|
cls,
|
|
108
127
|
instance_type: str,
|
|
109
128
|
) -> Tuple[Optional[float], Optional[float]]:
|
|
110
|
-
return
|
|
111
|
-
|
|
129
|
+
return catalog.get_vcpus_mem_from_instance_type(instance_type,
|
|
130
|
+
clouds='nebius')
|
|
112
131
|
|
|
113
132
|
@classmethod
|
|
114
133
|
def zones_provision_loop(
|
|
@@ -135,11 +154,11 @@ class Nebius(clouds.Cloud):
|
|
|
135
154
|
use_spot: bool,
|
|
136
155
|
region: Optional[str] = None,
|
|
137
156
|
zone: Optional[str] = None) -> float:
|
|
138
|
-
return
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
157
|
+
return catalog.get_hourly_cost(instance_type,
|
|
158
|
+
use_spot=use_spot,
|
|
159
|
+
region=region,
|
|
160
|
+
zone=zone,
|
|
161
|
+
clouds='nebius')
|
|
143
162
|
|
|
144
163
|
def accelerators_to_hourly_cost(self,
|
|
145
164
|
accelerators: Dict[str, int],
|
|
@@ -161,69 +180,124 @@ class Nebius(clouds.Cloud):
|
|
|
161
180
|
return isinstance(other, Nebius)
|
|
162
181
|
|
|
163
182
|
@classmethod
|
|
164
|
-
def get_default_instance_type(
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
183
|
+
def get_default_instance_type(cls,
|
|
184
|
+
cpus: Optional[str] = None,
|
|
185
|
+
memory: Optional[str] = None,
|
|
186
|
+
disk_tier: Optional[
|
|
187
|
+
resources_utils.DiskTier] = None,
|
|
188
|
+
region: Optional[str] = None,
|
|
189
|
+
zone: Optional[str] = None) -> Optional[str]:
|
|
170
190
|
"""Returns the default instance type for Nebius."""
|
|
171
|
-
return
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
191
|
+
return catalog.get_default_instance_type(cpus=cpus,
|
|
192
|
+
memory=memory,
|
|
193
|
+
disk_tier=disk_tier,
|
|
194
|
+
region=region,
|
|
195
|
+
zone=zone,
|
|
196
|
+
clouds='nebius')
|
|
175
197
|
|
|
176
198
|
@classmethod
|
|
177
199
|
def get_accelerators_from_instance_type(
|
|
178
200
|
cls,
|
|
179
201
|
instance_type: str,
|
|
180
202
|
) -> Optional[Dict[str, Union[int, float]]]:
|
|
181
|
-
return
|
|
182
|
-
|
|
203
|
+
return catalog.get_accelerators_from_instance_type(instance_type,
|
|
204
|
+
clouds='nebius')
|
|
183
205
|
|
|
184
206
|
@classmethod
|
|
185
207
|
def get_zone_shell_cmd(cls) -> Optional[str]:
|
|
186
208
|
return None
|
|
187
209
|
|
|
188
210
|
def make_deploy_resources_variables(
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
211
|
+
self,
|
|
212
|
+
resources: 'resources_lib.Resources',
|
|
213
|
+
cluster_name: resources_utils.ClusterName,
|
|
214
|
+
region: 'clouds.Region',
|
|
215
|
+
zones: Optional[List['clouds.Zone']],
|
|
216
|
+
num_nodes: int,
|
|
217
|
+
dryrun: bool = False,
|
|
218
|
+
volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
|
|
219
|
+
) -> Dict[str, Any]:
|
|
196
220
|
del dryrun, cluster_name
|
|
197
221
|
assert zones is None, ('Nebius does not support zones', zones)
|
|
198
222
|
|
|
199
|
-
|
|
200
|
-
acc_dict = self.get_accelerators_from_instance_type(
|
|
223
|
+
resources = resources.assert_launchable()
|
|
224
|
+
acc_dict = self.get_accelerators_from_instance_type(
|
|
225
|
+
resources.instance_type)
|
|
201
226
|
custom_resources = resources_utils.make_ray_custom_resources_str(
|
|
202
227
|
acc_dict)
|
|
203
228
|
platform, _ = resources.instance_type.split('_')
|
|
204
229
|
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
image_family = '
|
|
230
|
+
# Selecting image_family by platform
|
|
231
|
+
# https://docs.nebius.com/compute/storage/boot-disk-images
|
|
232
|
+
if platform.startswith('cpu'):
|
|
233
|
+
image_family = 'ubuntu24.04-driverless'
|
|
234
|
+
elif platform.startswith('gpu'):
|
|
235
|
+
image_family = 'ubuntu24.04-cuda12'
|
|
209
236
|
else:
|
|
210
237
|
raise RuntimeError('Unsupported instance type for Nebius cloud:'
|
|
211
238
|
f' {resources.instance_type}')
|
|
212
239
|
|
|
213
|
-
|
|
240
|
+
config_fs = skypilot_config.get_effective_region_config(
|
|
241
|
+
cloud='nebius',
|
|
242
|
+
region=region.name,
|
|
243
|
+
keys=('filesystems',),
|
|
244
|
+
default_value=[])
|
|
245
|
+
resources_vars_fs = []
|
|
246
|
+
for i, fs in enumerate(config_fs):
|
|
247
|
+
resources_vars_fs.append({
|
|
248
|
+
'filesystem_id': fs['filesystem_id'],
|
|
249
|
+
'filesystem_attach_mode': fs.get('attach_mode', 'READ_WRITE'),
|
|
250
|
+
'filesystem_mount_path': fs.get(
|
|
251
|
+
'mount_path', f'/mnt/filesystem-skypilot-{i+1}'),
|
|
252
|
+
'filesystem_mount_tag': f'filesystem-skypilot-{i+1}'
|
|
253
|
+
})
|
|
254
|
+
|
|
255
|
+
use_static_ip_address = skypilot_config.get_nested(
|
|
256
|
+
('nebius', 'use_static_ip_address'), default_value=False)
|
|
257
|
+
resources_vars: Dict[str, Any] = {
|
|
214
258
|
'instance_type': resources.instance_type,
|
|
215
259
|
'custom_resources': custom_resources,
|
|
260
|
+
'use_static_ip_address': use_static_ip_address,
|
|
216
261
|
'region': region.name,
|
|
217
262
|
'image_id': image_family,
|
|
218
263
|
# Nebius does not support specific zones.
|
|
219
264
|
'zones': None,
|
|
265
|
+
'use_spot': resources.use_spot,
|
|
266
|
+
'filesystems': resources_vars_fs,
|
|
267
|
+
'network_tier': resources.network_tier
|
|
220
268
|
}
|
|
221
269
|
|
|
270
|
+
docker_run_options = []
|
|
271
|
+
|
|
222
272
|
if acc_dict is not None:
|
|
223
273
|
# Nebius cloud's docker runtime information does not contain
|
|
224
274
|
# 'nvidia-container-runtime', causing no GPU option to be added to
|
|
225
275
|
# the docker run command. We patch this by adding it here.
|
|
226
|
-
|
|
276
|
+
docker_run_options.append('--gpus all')
|
|
277
|
+
|
|
278
|
+
# Check for InfiniBand support with network_tier: best
|
|
279
|
+
is_infiniband_capable = (
|
|
280
|
+
platform in nebius_constants.INFINIBAND_INSTANCE_PLATFORMS)
|
|
281
|
+
if (is_infiniband_capable and
|
|
282
|
+
resources.network_tier == resources_utils.NetworkTier.BEST):
|
|
283
|
+
# For Docker containers, add InfiniBand device access and
|
|
284
|
+
# IPC_LOCK capability
|
|
285
|
+
if resources.extract_docker_image() is not None:
|
|
286
|
+
docker_run_options.extend(
|
|
287
|
+
nebius_constants.INFINIBAND_DOCKER_OPTIONS)
|
|
288
|
+
|
|
289
|
+
# Add InfiniBand environment variables to docker run options
|
|
290
|
+
for env_var, env_value in (
|
|
291
|
+
nebius_constants.INFINIBAND_ENV_VARS.items()):
|
|
292
|
+
docker_run_options.extend(
|
|
293
|
+
['-e', f'{env_var}={env_value}'])
|
|
294
|
+
|
|
295
|
+
# For all InfiniBand-capable instances, add env variables
|
|
296
|
+
resources_vars[
|
|
297
|
+
'env_vars'] = nebius_constants.INFINIBAND_ENV_VARS
|
|
298
|
+
|
|
299
|
+
if docker_run_options:
|
|
300
|
+
resources_vars['docker_run_options'] = docker_run_options
|
|
227
301
|
|
|
228
302
|
return resources_vars
|
|
229
303
|
|
|
@@ -255,7 +329,9 @@ class Nebius(clouds.Cloud):
|
|
|
255
329
|
default_instance_type = Nebius.get_default_instance_type(
|
|
256
330
|
cpus=resources.cpus,
|
|
257
331
|
memory=resources.memory,
|
|
258
|
-
disk_tier=resources.disk_tier
|
|
332
|
+
disk_tier=resources.disk_tier,
|
|
333
|
+
region=resources.region,
|
|
334
|
+
zone=resources.zone)
|
|
259
335
|
if default_instance_type is None:
|
|
260
336
|
# TODO: Add hints to all return values in this method to help
|
|
261
337
|
# users understand why the resources are not launchable.
|
|
@@ -266,15 +342,16 @@ class Nebius(clouds.Cloud):
|
|
|
266
342
|
|
|
267
343
|
assert len(accelerators) == 1, resources
|
|
268
344
|
acc, acc_count = list(accelerators.items())[0]
|
|
269
|
-
(instance_list,
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
345
|
+
(instance_list,
|
|
346
|
+
fuzzy_candidate_list) = catalog.get_instance_type_for_accelerator(
|
|
347
|
+
acc,
|
|
348
|
+
acc_count,
|
|
349
|
+
use_spot=resources.use_spot,
|
|
350
|
+
cpus=resources.cpus,
|
|
351
|
+
memory=resources.memory,
|
|
352
|
+
region=resources.region,
|
|
353
|
+
zone=resources.zone,
|
|
354
|
+
clouds='nebius')
|
|
278
355
|
if instance_list is None:
|
|
279
356
|
return resources_utils.FeasibleResources([], fuzzy_candidate_list,
|
|
280
357
|
None)
|
|
@@ -282,25 +359,25 @@ class Nebius(clouds.Cloud):
|
|
|
282
359
|
fuzzy_candidate_list, None)
|
|
283
360
|
|
|
284
361
|
@classmethod
|
|
285
|
-
|
|
286
|
-
|
|
362
|
+
def _check_compute_credentials(
|
|
363
|
+
cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
|
287
364
|
"""Checks if the user has access credentials to
|
|
288
365
|
Nebius's compute service."""
|
|
289
366
|
token_cred_msg = (
|
|
290
367
|
f'{_INDENT_PREFIX}Credentials can be set up by running: \n'
|
|
291
|
-
f'{_INDENT_PREFIX} $ nebius iam get-access-token > {nebius.
|
|
292
|
-
f'{_INDENT_PREFIX} or generate
|
|
368
|
+
f'{_INDENT_PREFIX} $ nebius iam get-access-token > {nebius.iam_token_path()} \n' # pylint: disable=line-too-long
|
|
369
|
+
f'{_INDENT_PREFIX} or generate {nebius.credentials_path()} \n')
|
|
293
370
|
|
|
294
|
-
tenant_msg = (f'{_INDENT_PREFIX} Copy your
|
|
295
|
-
f'{_INDENT_PREFIX} $ echo $NEBIUS_TENANT_ID_PATH > {nebius.
|
|
371
|
+
tenant_msg = (f'{_INDENT_PREFIX} Copy your tenant ID from the web console and save it to file \n' # pylint: disable=line-too-long
|
|
372
|
+
f'{_INDENT_PREFIX} $ echo $NEBIUS_TENANT_ID_PATH > {nebius.tenant_id_path()} \n' # pylint: disable=line-too-long
|
|
296
373
|
f'{_INDENT_PREFIX} Or if you have 1 tenant you can run:\n' # pylint: disable=line-too-long
|
|
297
|
-
f'{_INDENT_PREFIX} $ nebius --format json iam whoami|jq -r \'.user_profile.tenants[0].tenant_id\' > {nebius.
|
|
374
|
+
f'{_INDENT_PREFIX} $ nebius --format json iam whoami|jq -r \'.user_profile.tenants[0].tenant_id\' > {nebius.tenant_id_path()} \n') # pylint: disable=line-too-long
|
|
298
375
|
if not nebius.is_token_or_cred_file_exist():
|
|
299
376
|
return False, f'{token_cred_msg}'
|
|
300
|
-
sdk = nebius.sdk()
|
|
301
377
|
tenant_id = nebius.get_tenant_id()
|
|
302
378
|
if tenant_id is None:
|
|
303
379
|
return False, f'{tenant_msg}'
|
|
380
|
+
sdk = nebius.sdk()
|
|
304
381
|
try:
|
|
305
382
|
service = nebius.iam().ProjectServiceClient(sdk)
|
|
306
383
|
service.list(
|
|
@@ -314,7 +391,8 @@ class Nebius(clouds.Cloud):
|
|
|
314
391
|
|
|
315
392
|
@classmethod
|
|
316
393
|
@annotations.lru_cache(scope='request')
|
|
317
|
-
def _check_storage_credentials(
|
|
394
|
+
def _check_storage_credentials(
|
|
395
|
+
cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
|
|
318
396
|
"""Checks if the user has access credentials to Nebius Object Storage.
|
|
319
397
|
|
|
320
398
|
Returns:
|
|
@@ -341,8 +419,8 @@ class Nebius(clouds.Cloud):
|
|
|
341
419
|
|
|
342
420
|
def get_credential_file_mounts(self) -> Dict[str, str]:
|
|
343
421
|
credential_file_mounts = {
|
|
344
|
-
|
|
345
|
-
for
|
|
422
|
+
filepath: filepath
|
|
423
|
+
for filepath in nebius.get_credential_file_paths()
|
|
346
424
|
}
|
|
347
425
|
if nebius_profile_in_aws_cred_and_config():
|
|
348
426
|
credential_file_mounts['~/.aws/credentials'] = '~/.aws/credentials'
|
|
@@ -356,9 +434,60 @@ class Nebius(clouds.Cloud):
|
|
|
356
434
|
return None
|
|
357
435
|
|
|
358
436
|
def instance_type_exists(self, instance_type: str) -> bool:
|
|
359
|
-
return
|
|
437
|
+
return catalog.instance_type_exists(instance_type, 'nebius')
|
|
360
438
|
|
|
361
439
|
def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
|
|
362
|
-
return
|
|
363
|
-
|
|
364
|
-
|
|
440
|
+
return catalog.validate_region_zone(region, zone, clouds='nebius')
|
|
441
|
+
|
|
442
|
+
@classmethod
|
|
443
|
+
def get_user_identities(cls) -> Optional[List[List[str]]]:
|
|
444
|
+
"""Returns the email address + project id of the active user."""
|
|
445
|
+
nebius_workspace_config = json.dumps(
|
|
446
|
+
skypilot_config.get_workspace_cloud('nebius'), sort_keys=True)
|
|
447
|
+
return cls._get_user_identities(nebius_workspace_config)
|
|
448
|
+
|
|
449
|
+
@classmethod
|
|
450
|
+
@annotations.lru_cache(scope='request', maxsize=5)
|
|
451
|
+
def _get_user_identities(
|
|
452
|
+
cls, workspace_config: Optional[str]) -> Optional[List[List[str]]]:
|
|
453
|
+
# We add workspace_config in args to avoid caching the identity for when
|
|
454
|
+
# different workspace configs are used.
|
|
455
|
+
del workspace_config # Unused
|
|
456
|
+
sdk = nebius.sdk()
|
|
457
|
+
profile_client = nebius.iam().ProfileServiceClient(sdk)
|
|
458
|
+
try:
|
|
459
|
+
profile = nebius.sync_call(
|
|
460
|
+
profile_client.get(nebius.iam().GetProfileRequest(),
|
|
461
|
+
timeout=nebius.READ_TIMEOUT))
|
|
462
|
+
except Exception as e:
|
|
463
|
+
raise exceptions.CloudUserIdentityError(
|
|
464
|
+
f'Error getting Nebius profile: {e}')
|
|
465
|
+
if profile.user_profile is not None:
|
|
466
|
+
if profile.user_profile.attributes is None:
|
|
467
|
+
raise exceptions.CloudUserIdentityError(
|
|
468
|
+
'Nebius profile is a UserProfile, but has no attributes: '
|
|
469
|
+
f'{profile.user_profile}')
|
|
470
|
+
if profile.user_profile.attributes.email is None:
|
|
471
|
+
raise exceptions.CloudUserIdentityError(
|
|
472
|
+
'Nebius profile is a UserProfile, but has no email: '
|
|
473
|
+
f'{profile.user_profile}')
|
|
474
|
+
return [[profile.user_profile.attributes.email]]
|
|
475
|
+
if profile.service_account_profile is not None:
|
|
476
|
+
if profile.service_account_profile.info is None:
|
|
477
|
+
raise exceptions.CloudUserIdentityError(
|
|
478
|
+
'Nebius profile is a ServiceAccountProfile, but has no '
|
|
479
|
+
f'info: {profile.service_account_profile}')
|
|
480
|
+
if profile.service_account_profile.info.metadata is None:
|
|
481
|
+
raise exceptions.CloudUserIdentityError(
|
|
482
|
+
'Nebius profile is a ServiceAccountProfile, but has no '
|
|
483
|
+
f'metadata: {profile.service_account_profile}')
|
|
484
|
+
if profile.service_account_profile.info.metadata.name is None:
|
|
485
|
+
raise exceptions.CloudUserIdentityError(
|
|
486
|
+
'Nebius profile is a ServiceAccountProfile, but has no '
|
|
487
|
+
f'name: {profile.service_account_profile}')
|
|
488
|
+
return [[profile.service_account_profile.info.metadata.name]]
|
|
489
|
+
if profile.anonymous_profile is not None:
|
|
490
|
+
return None
|
|
491
|
+
unknown_profile_type = profile.which_field_in_oneof('profile')
|
|
492
|
+
raise exceptions.CloudUserIdentityError(
|
|
493
|
+
f'Nebius profile is of an unknown type - {unknown_profile_type}')
|