skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +22 -6
- sky/adaptors/aws.py +25 -7
- sky/adaptors/common.py +24 -1
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/hyperbolic.py +8 -0
- sky/adaptors/kubernetes.py +149 -18
- sky/adaptors/nebius.py +170 -17
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/runpod.py +68 -0
- sky/adaptors/seeweb.py +167 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +187 -4
- sky/authentication.py +179 -225
- sky/backends/__init__.py +4 -2
- sky/backends/backend.py +22 -9
- sky/backends/backend_utils.py +1299 -380
- sky/backends/cloud_vm_ray_backend.py +1715 -518
- sky/backends/docker_utils.py +1 -1
- sky/backends/local_docker_backend.py +11 -6
- sky/backends/wheel_utils.py +37 -9
- sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
- sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
- sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
- sky/{clouds/service_catalog → catalog}/common.py +89 -48
- sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
- sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
- sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
- sky/catalog/data_fetchers/fetch_nebius.py +335 -0
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
- sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
- sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
- sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
- sky/catalog/hyperbolic_catalog.py +136 -0
- sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
- sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
- sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
- sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
- sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/ssh_catalog.py +167 -0
- sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
- sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
- sky/check.py +491 -203
- sky/cli.py +5 -6005
- sky/client/{cli.py → cli/command.py} +2477 -1885
- sky/client/cli/deprecation_utils.py +99 -0
- sky/client/cli/flags.py +359 -0
- sky/client/cli/table_utils.py +320 -0
- sky/client/common.py +70 -32
- sky/client/oauth.py +82 -0
- sky/client/sdk.py +1203 -297
- sky/client/sdk_async.py +833 -0
- sky/client/service_account_auth.py +47 -0
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +13 -0
- sky/clouds/aws.py +358 -93
- sky/clouds/azure.py +105 -83
- sky/clouds/cloud.py +127 -36
- sky/clouds/cudo.py +68 -50
- sky/clouds/do.py +66 -48
- sky/clouds/fluidstack.py +63 -44
- sky/clouds/gcp.py +339 -110
- sky/clouds/hyperbolic.py +293 -0
- sky/clouds/ibm.py +70 -49
- sky/clouds/kubernetes.py +563 -162
- sky/clouds/lambda_cloud.py +74 -54
- sky/clouds/nebius.py +206 -80
- sky/clouds/oci.py +88 -66
- sky/clouds/paperspace.py +61 -44
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +164 -74
- sky/clouds/scp.py +89 -83
- sky/clouds/seeweb.py +466 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +263 -0
- sky/clouds/utils/aws_utils.py +10 -4
- sky/clouds/utils/gcp_utils.py +87 -11
- sky/clouds/utils/oci_utils.py +38 -14
- sky/clouds/utils/scp_utils.py +177 -124
- sky/clouds/vast.py +99 -77
- sky/clouds/vsphere.py +51 -40
- sky/core.py +349 -139
- sky/dag.py +15 -0
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
- sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
- sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
- sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
- sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
- sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
- sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
- sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -0
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -0
- sky/dashboard/out/infra.html +1 -0
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -0
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -0
- sky/dashboard/out/volumes.html +1 -0
- sky/dashboard/out/workspace/new.html +1 -0
- sky/dashboard/out/workspaces/[name].html +1 -0
- sky/dashboard/out/workspaces.html +1 -0
- sky/data/data_utils.py +137 -1
- sky/data/mounting_utils.py +269 -84
- sky/data/storage.py +1451 -1807
- sky/data/storage_utils.py +43 -57
- sky/exceptions.py +132 -2
- sky/execution.py +206 -63
- sky/global_user_state.py +2374 -586
- sky/jobs/__init__.py +5 -0
- sky/jobs/client/sdk.py +242 -65
- sky/jobs/client/sdk_async.py +143 -0
- sky/jobs/constants.py +9 -8
- sky/jobs/controller.py +839 -277
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +398 -152
- sky/jobs/scheduler.py +315 -189
- sky/jobs/server/core.py +829 -255
- sky/jobs/server/server.py +156 -115
- sky/jobs/server/utils.py +136 -0
- sky/jobs/state.py +2092 -701
- sky/jobs/utils.py +1242 -160
- sky/logs/__init__.py +21 -0
- sky/logs/agent.py +108 -0
- sky/logs/aws.py +243 -0
- sky/logs/gcp.py +91 -0
- sky/metrics/__init__.py +0 -0
- sky/metrics/utils.py +443 -0
- sky/models.py +78 -1
- sky/optimizer.py +164 -70
- sky/provision/__init__.py +90 -4
- sky/provision/aws/config.py +147 -26
- sky/provision/aws/instance.py +135 -50
- sky/provision/azure/instance.py +10 -5
- sky/provision/common.py +13 -1
- sky/provision/cudo/cudo_machine_type.py +1 -1
- sky/provision/cudo/cudo_utils.py +14 -8
- sky/provision/cudo/cudo_wrapper.py +72 -71
- sky/provision/cudo/instance.py +10 -6
- sky/provision/do/instance.py +10 -6
- sky/provision/do/utils.py +4 -3
- sky/provision/docker_utils.py +114 -23
- sky/provision/fluidstack/instance.py +13 -8
- sky/provision/gcp/__init__.py +1 -0
- sky/provision/gcp/config.py +301 -19
- sky/provision/gcp/constants.py +218 -0
- sky/provision/gcp/instance.py +36 -8
- sky/provision/gcp/instance_utils.py +18 -4
- sky/provision/gcp/volume_utils.py +247 -0
- sky/provision/hyperbolic/__init__.py +12 -0
- sky/provision/hyperbolic/config.py +10 -0
- sky/provision/hyperbolic/instance.py +437 -0
- sky/provision/hyperbolic/utils.py +373 -0
- sky/provision/instance_setup.py +93 -14
- sky/provision/kubernetes/__init__.py +5 -0
- sky/provision/kubernetes/config.py +9 -52
- sky/provision/kubernetes/constants.py +17 -0
- sky/provision/kubernetes/instance.py +789 -247
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +27 -17
- sky/provision/kubernetes/network_utils.py +40 -43
- sky/provision/kubernetes/utils.py +1192 -531
- sky/provision/kubernetes/volume.py +282 -0
- sky/provision/lambda_cloud/instance.py +22 -16
- sky/provision/nebius/constants.py +50 -0
- sky/provision/nebius/instance.py +19 -6
- sky/provision/nebius/utils.py +196 -91
- sky/provision/oci/instance.py +10 -5
- sky/provision/paperspace/instance.py +10 -7
- sky/provision/paperspace/utils.py +1 -1
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +110 -36
- sky/provision/runpod/__init__.py +5 -0
- sky/provision/runpod/instance.py +27 -6
- sky/provision/runpod/utils.py +51 -18
- sky/provision/runpod/volume.py +180 -0
- sky/provision/scp/__init__.py +15 -0
- sky/provision/scp/config.py +93 -0
- sky/provision/scp/instance.py +531 -0
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +807 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/ssh/__init__.py +18 -0
- sky/provision/vast/instance.py +13 -8
- sky/provision/vast/utils.py +10 -7
- sky/provision/vsphere/common/vim_utils.py +1 -2
- sky/provision/vsphere/instance.py +15 -10
- sky/provision/vsphere/vsphere_utils.py +9 -19
- sky/py.typed +0 -0
- sky/resources.py +844 -118
- sky/schemas/__init__.py +0 -0
- sky/schemas/api/__init__.py +0 -0
- sky/schemas/api/responses.py +225 -0
- sky/schemas/db/README +4 -0
- sky/schemas/db/env.py +90 -0
- sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
- sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
- sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
- sky/schemas/db/global_user_state/004_is_managed.py +34 -0
- sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
- sky/schemas/db/global_user_state/006_provision_log.py +41 -0
- sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/script.py.mako +28 -0
- sky/schemas/db/serve_state/001_initial_schema.py +67 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
- sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/__init__.py +0 -0
- sky/schemas/generated/autostopv1_pb2.py +36 -0
- sky/schemas/generated/autostopv1_pb2.pyi +43 -0
- sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +357 -5
- sky/serve/client/impl.py +310 -0
- sky/serve/client/sdk.py +47 -139
- sky/serve/client/sdk_async.py +130 -0
- sky/serve/constants.py +10 -8
- sky/serve/controller.py +64 -19
- sky/serve/load_balancer.py +106 -60
- sky/serve/load_balancing_policies.py +115 -1
- sky/serve/replica_managers.py +273 -162
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +554 -251
- sky/serve/serve_utils.py +733 -220
- sky/serve/server/core.py +66 -711
- sky/serve/server/impl.py +1093 -0
- sky/serve/server/server.py +21 -18
- sky/serve/service.py +133 -48
- sky/serve/service_spec.py +135 -16
- sky/serve/spot_placer.py +3 -0
- sky/server/auth/__init__.py +0 -0
- sky/server/auth/authn.py +50 -0
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +200 -0
- sky/server/common.py +475 -181
- sky/server/config.py +81 -23
- sky/server/constants.py +44 -6
- sky/server/daemons.py +229 -0
- sky/server/html/token_page.html +185 -0
- sky/server/metrics.py +160 -0
- sky/server/requests/executor.py +528 -138
- sky/server/requests/payloads.py +351 -17
- sky/server/requests/preconditions.py +21 -17
- sky/server/requests/process.py +112 -29
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +817 -224
- sky/server/requests/serializers/decoders.py +82 -31
- sky/server/requests/serializers/encoders.py +140 -22
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +417 -0
- sky/server/server.py +1290 -284
- sky/server/state.py +20 -0
- sky/server/stream_utils.py +345 -57
- sky/server/uvicorn.py +217 -3
- sky/server/versions.py +270 -0
- sky/setup_files/MANIFEST.in +5 -0
- sky/setup_files/alembic.ini +156 -0
- sky/setup_files/dependencies.py +136 -31
- sky/setup_files/setup.py +44 -42
- sky/sky_logging.py +102 -5
- sky/skylet/attempt_skylet.py +1 -0
- sky/skylet/autostop_lib.py +129 -8
- sky/skylet/configs.py +27 -20
- sky/skylet/constants.py +171 -19
- sky/skylet/events.py +105 -21
- sky/skylet/job_lib.py +335 -104
- sky/skylet/log_lib.py +297 -18
- sky/skylet/log_lib.pyi +44 -1
- sky/skylet/ray_patches/__init__.py +17 -3
- sky/skylet/ray_patches/autoscaler.py.diff +18 -0
- sky/skylet/ray_patches/cli.py.diff +19 -0
- sky/skylet/ray_patches/command_runner.py.diff +17 -0
- sky/skylet/ray_patches/log_monitor.py.diff +20 -0
- sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
- sky/skylet/ray_patches/updater.py.diff +18 -0
- sky/skylet/ray_patches/worker.py.diff +41 -0
- sky/skylet/services.py +564 -0
- sky/skylet/skylet.py +63 -4
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +506 -99
- sky/ssh_node_pools/__init__.py +1 -0
- sky/ssh_node_pools/core.py +135 -0
- sky/ssh_node_pools/server.py +233 -0
- sky/task.py +621 -137
- sky/templates/aws-ray.yml.j2 +10 -3
- sky/templates/azure-ray.yml.j2 +1 -1
- sky/templates/do-ray.yml.j2 +1 -1
- sky/templates/gcp-ray.yml.j2 +57 -0
- sky/templates/hyperbolic-ray.yml.j2 +67 -0
- sky/templates/jobs-controller.yaml.j2 +27 -24
- sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
- sky/templates/kubernetes-ray.yml.j2 +607 -51
- sky/templates/lambda-ray.yml.j2 +1 -1
- sky/templates/nebius-ray.yml.j2 +33 -12
- sky/templates/paperspace-ray.yml.j2 +1 -1
- sky/templates/primeintellect-ray.yml.j2 +71 -0
- sky/templates/runpod-ray.yml.j2 +9 -1
- sky/templates/scp-ray.yml.j2 +3 -50
- sky/templates/seeweb-ray.yml.j2 +108 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/sky-serve-controller.yaml.j2 +22 -2
- sky/templates/websocket_proxy.py +178 -18
- sky/usage/usage_lib.py +18 -11
- sky/users/__init__.py +0 -0
- sky/users/model.conf +15 -0
- sky/users/permission.py +387 -0
- sky/users/rbac.py +121 -0
- sky/users/server.py +720 -0
- sky/users/token_service.py +218 -0
- sky/utils/accelerator_registry.py +34 -5
- sky/utils/admin_policy_utils.py +84 -38
- sky/utils/annotations.py +16 -5
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/benchmark_utils.py +60 -0
- sky/utils/cli_utils/status_utils.py +159 -86
- sky/utils/cluster_utils.py +31 -9
- sky/utils/command_runner.py +354 -68
- sky/utils/command_runner.pyi +93 -3
- sky/utils/common.py +35 -8
- sky/utils/common_utils.py +310 -87
- sky/utils/config_utils.py +87 -5
- sky/utils/context.py +402 -0
- sky/utils/context_utils.py +222 -0
- sky/utils/controller_utils.py +264 -89
- sky/utils/dag_utils.py +31 -12
- sky/utils/db/__init__.py +0 -0
- sky/utils/db/db_utils.py +470 -0
- sky/utils/db/migration_utils.py +133 -0
- sky/utils/directory_utils.py +12 -0
- sky/utils/env_options.py +13 -0
- sky/utils/git.py +567 -0
- sky/utils/git_clone.sh +460 -0
- sky/utils/infra_utils.py +195 -0
- sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
- sky/utils/kubernetes/config_map_utils.py +133 -0
- sky/utils/kubernetes/create_cluster.sh +13 -27
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
- sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
- sky/utils/kubernetes/gpu_labeler.py +5 -5
- sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
- sky/utils/kubernetes/ssh-tunnel.sh +379 -0
- sky/utils/kubernetes/ssh_utils.py +221 -0
- sky/utils/kubernetes_enums.py +8 -15
- sky/utils/lock_events.py +94 -0
- sky/utils/locks.py +368 -0
- sky/utils/log_utils.py +300 -6
- sky/utils/perf_utils.py +22 -0
- sky/utils/resource_checker.py +298 -0
- sky/utils/resources_utils.py +249 -32
- sky/utils/rich_utils.py +213 -37
- sky/utils/schemas.py +905 -147
- sky/utils/serialize_utils.py +16 -0
- sky/utils/status_lib.py +10 -0
- sky/utils/subprocess_utils.py +38 -15
- sky/utils/tempstore.py +70 -0
- sky/utils/timeline.py +24 -52
- sky/utils/ux_utils.py +84 -15
- sky/utils/validator.py +11 -1
- sky/utils/volume.py +86 -0
- sky/utils/yaml_utils.py +111 -0
- sky/volumes/__init__.py +13 -0
- sky/volumes/client/__init__.py +0 -0
- sky/volumes/client/sdk.py +149 -0
- sky/volumes/server/__init__.py +0 -0
- sky/volumes/server/core.py +258 -0
- sky/volumes/server/server.py +122 -0
- sky/volumes/volume.py +212 -0
- sky/workspaces/__init__.py +0 -0
- sky/workspaces/core.py +655 -0
- sky/workspaces/server.py +101 -0
- sky/workspaces/utils.py +56 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
- skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
- sky/benchmark/benchmark_state.py +0 -256
- sky/benchmark/benchmark_utils.py +0 -641
- sky/clouds/service_catalog/constants.py +0 -7
- sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
- sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
- sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
- sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
- sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
- sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
- sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
- sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
- sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
- sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
- sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
- sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
- sky/jobs/dashboard/dashboard.py +0 -223
- sky/jobs/dashboard/static/favicon.ico +0 -0
- sky/jobs/dashboard/templates/index.html +0 -831
- sky/jobs/server/dashboard_utils.py +0 -69
- sky/skylet/providers/scp/__init__.py +0 -2
- sky/skylet/providers/scp/config.py +0 -149
- sky/skylet/providers/scp/node_provider.py +0 -578
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/db_utils.py +0 -100
- sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
- skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
- /sky/{clouds/service_catalog → catalog}/config.py +0 -0
- /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
- /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
- /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
- /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/provision/nebius/utils.py
CHANGED
|
@@ -1,17 +1,21 @@
|
|
|
1
1
|
"""Nebius library wrapper for SkyPilot."""
|
|
2
2
|
import time
|
|
3
|
-
from typing import Any, Dict
|
|
3
|
+
from typing import Any, Dict, List, Optional
|
|
4
4
|
import uuid
|
|
5
5
|
|
|
6
6
|
from sky import sky_logging
|
|
7
7
|
from sky import skypilot_config
|
|
8
8
|
from sky.adaptors import nebius
|
|
9
|
+
from sky.provision.nebius import constants as nebius_constants
|
|
9
10
|
from sky.utils import common_utils
|
|
11
|
+
from sky.utils import resources_utils
|
|
10
12
|
|
|
11
13
|
logger = sky_logging.init_logger(__name__)
|
|
12
14
|
|
|
13
15
|
POLL_INTERVAL = 5
|
|
14
16
|
|
|
17
|
+
_MAX_OPERATIONS_TO_FETCH = 1000
|
|
18
|
+
|
|
15
19
|
|
|
16
20
|
def retry(func):
|
|
17
21
|
"""Decorator to retry a function."""
|
|
@@ -34,12 +38,14 @@ def retry(func):
|
|
|
34
38
|
|
|
35
39
|
def get_project_by_region(region: str) -> str:
|
|
36
40
|
service = nebius.iam().ProjectServiceClient(nebius.sdk())
|
|
37
|
-
projects =
|
|
38
|
-
|
|
41
|
+
projects = nebius.sync_call(
|
|
42
|
+
service.list(
|
|
43
|
+
nebius.iam().ListProjectsRequest(parent_id=nebius.get_tenant_id()),
|
|
44
|
+
timeout=nebius.READ_TIMEOUT))
|
|
39
45
|
|
|
40
46
|
# Check is there project if in config
|
|
41
|
-
project_id = skypilot_config.
|
|
42
|
-
|
|
47
|
+
project_id = skypilot_config.get_effective_region_config(
|
|
48
|
+
cloud='nebius', region=region, keys=('project_id',), default_value=None)
|
|
43
49
|
if project_id is not None:
|
|
44
50
|
return project_id
|
|
45
51
|
for project in projects.items:
|
|
@@ -54,19 +60,21 @@ def get_or_create_gpu_cluster(name: str, project_id: str, fabric: str) -> str:
|
|
|
54
60
|
"""
|
|
55
61
|
service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
|
|
56
62
|
try:
|
|
57
|
-
cluster =
|
|
58
|
-
|
|
59
|
-
name=name,
|
|
60
|
-
)).wait()
|
|
61
|
-
cluster_id = cluster.metadata.id
|
|
62
|
-
except nebius.request_error():
|
|
63
|
-
cluster = service.create(nebius.compute().CreateGpuClusterRequest(
|
|
64
|
-
metadata=nebius.nebius_common().ResourceMetadata(
|
|
63
|
+
cluster = nebius.sync_call(
|
|
64
|
+
service.get_by_name(nebius.nebius_common().GetByNameRequest(
|
|
65
65
|
parent_id=project_id,
|
|
66
66
|
name=name,
|
|
67
|
-
)
|
|
68
|
-
|
|
69
|
-
|
|
67
|
+
)))
|
|
68
|
+
cluster_id = cluster.metadata.id
|
|
69
|
+
except nebius.request_error():
|
|
70
|
+
cluster = nebius.sync_call(
|
|
71
|
+
service.create(nebius.compute().CreateGpuClusterRequest(
|
|
72
|
+
metadata=nebius.nebius_common().ResourceMetadata(
|
|
73
|
+
parent_id=project_id,
|
|
74
|
+
name=name,
|
|
75
|
+
),
|
|
76
|
+
spec=nebius.compute().GpuClusterSpec(
|
|
77
|
+
infiniband_fabric=fabric))))
|
|
70
78
|
cluster_id = cluster.resource_id
|
|
71
79
|
return cluster_id
|
|
72
80
|
|
|
@@ -76,14 +84,16 @@ def delete_cluster(name: str, region: str) -> None:
|
|
|
76
84
|
project_id = get_project_by_region(region)
|
|
77
85
|
service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
|
|
78
86
|
try:
|
|
79
|
-
cluster =
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
87
|
+
cluster = nebius.sync_call(
|
|
88
|
+
service.get_by_name(nebius.nebius_common().GetByNameRequest(
|
|
89
|
+
parent_id=project_id,
|
|
90
|
+
name=name,
|
|
91
|
+
)))
|
|
83
92
|
cluster_id = cluster.metadata.id
|
|
84
93
|
logger.debug(f'Found GPU Cluster : {cluster_id}.')
|
|
85
|
-
|
|
86
|
-
|
|
94
|
+
nebius.sync_call(
|
|
95
|
+
service.delete(
|
|
96
|
+
nebius.compute().DeleteGpuClusterRequest(id=cluster_id)))
|
|
87
97
|
logger.debug(f'Deleted GPU Cluster : {cluster_id}.')
|
|
88
98
|
except nebius.request_error():
|
|
89
99
|
logger.debug('GPU Cluster does not exist.')
|
|
@@ -92,13 +102,23 @@ def delete_cluster(name: str, region: str) -> None:
|
|
|
92
102
|
def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
|
|
93
103
|
"""Lists instances associated with API key."""
|
|
94
104
|
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
105
|
+
page_token = ''
|
|
106
|
+
instances = []
|
|
107
|
+
while True:
|
|
108
|
+
result = nebius.sync_call(
|
|
109
|
+
service.list(nebius.compute().ListInstancesRequest(
|
|
110
|
+
parent_id=project_id,
|
|
111
|
+
page_size=100,
|
|
112
|
+
page_token=page_token,
|
|
113
|
+
),
|
|
114
|
+
timeout=nebius.READ_TIMEOUT))
|
|
115
|
+
instances.extend(result.items)
|
|
116
|
+
if not result.next_page_token: # "" means no more pages
|
|
117
|
+
break
|
|
118
|
+
page_token = result.next_page_token
|
|
99
119
|
|
|
100
120
|
instance_dict: Dict[str, Dict[str, Any]] = {}
|
|
101
|
-
for instance in instances
|
|
121
|
+
for instance in instances:
|
|
102
122
|
info = {}
|
|
103
123
|
info['status'] = instance.status.state.name
|
|
104
124
|
info['name'] = instance.metadata.name
|
|
@@ -114,12 +134,13 @@ def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
|
|
|
114
134
|
|
|
115
135
|
def stop(instance_id: str) -> None:
|
|
116
136
|
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
117
|
-
|
|
137
|
+
nebius.sync_call(
|
|
138
|
+
service.stop(nebius.compute().StopInstanceRequest(id=instance_id)))
|
|
118
139
|
retry_count = 0
|
|
119
140
|
while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_STOP:
|
|
120
141
|
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
121
|
-
instance =
|
|
122
|
-
id=instance_id,))
|
|
142
|
+
instance = nebius.sync_call(
|
|
143
|
+
service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
|
|
123
144
|
if instance.status.state.name == 'STOPPED':
|
|
124
145
|
break
|
|
125
146
|
time.sleep(POLL_INTERVAL)
|
|
@@ -136,12 +157,13 @@ def stop(instance_id: str) -> None:
|
|
|
136
157
|
|
|
137
158
|
def start(instance_id: str) -> None:
|
|
138
159
|
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
139
|
-
|
|
160
|
+
nebius.sync_call(
|
|
161
|
+
service.start(nebius.compute().StartInstanceRequest(id=instance_id)))
|
|
140
162
|
retry_count = 0
|
|
141
163
|
while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_START:
|
|
142
164
|
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
143
|
-
instance =
|
|
144
|
-
id=instance_id,))
|
|
165
|
+
instance = nebius.sync_call(
|
|
166
|
+
service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
|
|
145
167
|
if instance.status.state.name == 'RUNNING':
|
|
146
168
|
break
|
|
147
169
|
time.sleep(POLL_INTERVAL)
|
|
@@ -156,9 +178,19 @@ def start(instance_id: str) -> None:
|
|
|
156
178
|
f' to be ready.')
|
|
157
179
|
|
|
158
180
|
|
|
159
|
-
def launch(cluster_name_on_cloud: str,
|
|
160
|
-
|
|
161
|
-
|
|
181
|
+
def launch(cluster_name_on_cloud: str,
|
|
182
|
+
node_type: str,
|
|
183
|
+
platform: str,
|
|
184
|
+
preset: str,
|
|
185
|
+
region: str,
|
|
186
|
+
image_family: str,
|
|
187
|
+
disk_size: int,
|
|
188
|
+
user_data: str,
|
|
189
|
+
associate_public_ip_address: bool,
|
|
190
|
+
filesystems: List[Dict[str, Any]],
|
|
191
|
+
use_static_ip_address: bool = False,
|
|
192
|
+
use_spot: bool = False,
|
|
193
|
+
network_tier: Optional[resources_utils.NetworkTier] = None) -> str:
|
|
162
194
|
# Each node must have a unique name to avoid conflicts between
|
|
163
195
|
# multiple worker VMs. To ensure uniqueness,a UUID is appended
|
|
164
196
|
# to the node name.
|
|
@@ -172,11 +204,26 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
|
|
|
172
204
|
# 8 GPU virtual machines can be grouped into a GPU cluster.
|
|
173
205
|
# The GPU clusters are built with InfiniBand secure high-speed networking.
|
|
174
206
|
# https://docs.nebius.com/compute/clusters/gpu
|
|
175
|
-
if platform in
|
|
207
|
+
if platform in nebius_constants.INFINIBAND_INSTANCE_PLATFORMS:
|
|
176
208
|
if preset == '8gpu-128vcpu-1600gb':
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
209
|
+
fabric = skypilot_config.get_effective_region_config(
|
|
210
|
+
cloud='nebius',
|
|
211
|
+
region=region,
|
|
212
|
+
keys=('fabric',),
|
|
213
|
+
default_value=None)
|
|
214
|
+
|
|
215
|
+
# Auto-select fabric if network_tier=best and no fabric configured
|
|
216
|
+
if (fabric is None and
|
|
217
|
+
str(network_tier) == str(resources_utils.NetworkTier.BEST)):
|
|
218
|
+
try:
|
|
219
|
+
fabric = nebius_constants.get_default_fabric(
|
|
220
|
+
platform, region)
|
|
221
|
+
logger.info(f'Auto-selected InfiniBand fabric {fabric} '
|
|
222
|
+
f'for {platform} in {region}')
|
|
223
|
+
except ValueError as e:
|
|
224
|
+
logger.warning(
|
|
225
|
+
f'InfiniBand fabric auto-selection failed: {e}')
|
|
226
|
+
|
|
180
227
|
if fabric is None:
|
|
181
228
|
logger.warning(
|
|
182
229
|
f'Set up fabric for region {region} in ~/.sky/config.yaml '
|
|
@@ -186,24 +233,26 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
|
|
|
186
233
|
project_id, fabric)
|
|
187
234
|
|
|
188
235
|
service = nebius.compute().DiskServiceClient(nebius.sdk())
|
|
189
|
-
disk =
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
236
|
+
disk = nebius.sync_call(
|
|
237
|
+
service.create(nebius.compute().CreateDiskRequest(
|
|
238
|
+
metadata=nebius.nebius_common().ResourceMetadata(
|
|
239
|
+
parent_id=project_id,
|
|
240
|
+
name=disk_name,
|
|
241
|
+
),
|
|
242
|
+
spec=nebius.compute().DiskSpec(
|
|
243
|
+
source_image_family=nebius.compute().SourceImageFamily(
|
|
244
|
+
image_family=image_family),
|
|
245
|
+
size_gibibytes=disk_size,
|
|
246
|
+
type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
|
|
247
|
+
))))
|
|
200
248
|
disk_id = disk.resource_id
|
|
201
249
|
retry_count = 0
|
|
202
250
|
while retry_count < nebius.MAX_RETRIES_TO_DISK_CREATE:
|
|
203
|
-
disk =
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
251
|
+
disk = nebius.sync_call(
|
|
252
|
+
service.get_by_name(nebius.nebius_common().GetByNameRequest(
|
|
253
|
+
parent_id=project_id,
|
|
254
|
+
name=disk_name,
|
|
255
|
+
)))
|
|
207
256
|
if disk.status.state.name == 'READY':
|
|
208
257
|
break
|
|
209
258
|
logger.debug(f'Waiting for disk {disk_name} to be ready.')
|
|
@@ -217,46 +266,102 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
|
|
|
217
266
|
f' seconds) while waiting for disk {disk_name}'
|
|
218
267
|
f' to be ready.')
|
|
219
268
|
|
|
269
|
+
filesystems_spec = []
|
|
270
|
+
if filesystems:
|
|
271
|
+
for fs in filesystems:
|
|
272
|
+
filesystems_spec.append(nebius.compute().AttachedFilesystemSpec(
|
|
273
|
+
mount_tag=fs['filesystem_mount_tag'],
|
|
274
|
+
attach_mode=nebius.compute().AttachedFilesystemSpec.AttachMode[
|
|
275
|
+
fs['filesystem_attach_mode']],
|
|
276
|
+
existing_filesystem=nebius.compute().ExistingFilesystem(
|
|
277
|
+
id=fs['filesystem_id'])))
|
|
278
|
+
|
|
220
279
|
service = nebius.vpc().SubnetServiceClient(nebius.sdk())
|
|
221
|
-
sub_net =
|
|
222
|
-
parent_id=project_id,))
|
|
280
|
+
sub_net = nebius.sync_call(
|
|
281
|
+
service.list(nebius.vpc().ListSubnetsRequest(parent_id=project_id,)))
|
|
223
282
|
|
|
224
283
|
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
).AttachedDiskSpec
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
284
|
+
logger.debug(f'Creating instance {instance_name} in project {project_id}.')
|
|
285
|
+
nebius.sync_call(
|
|
286
|
+
service.create(nebius.compute().CreateInstanceRequest(
|
|
287
|
+
metadata=nebius.nebius_common().ResourceMetadata(
|
|
288
|
+
parent_id=project_id,
|
|
289
|
+
name=instance_name,
|
|
290
|
+
),
|
|
291
|
+
spec=nebius.compute().InstanceSpec(
|
|
292
|
+
gpu_cluster=nebius.compute().InstanceGpuClusterSpec(
|
|
293
|
+
id=cluster_id,) if cluster_id is not None else None,
|
|
294
|
+
boot_disk=nebius.compute().AttachedDiskSpec(
|
|
295
|
+
attach_mode=nebius.compute(
|
|
296
|
+
).AttachedDiskSpec.AttachMode.READ_WRITE,
|
|
297
|
+
existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
|
|
298
|
+
cloud_init_user_data=user_data,
|
|
299
|
+
resources=nebius.compute().ResourcesSpec(platform=platform,
|
|
300
|
+
preset=preset),
|
|
301
|
+
filesystems=filesystems_spec if filesystems_spec else None,
|
|
302
|
+
network_interfaces=[
|
|
303
|
+
nebius.compute().NetworkInterfaceSpec(
|
|
304
|
+
subnet_id=sub_net.items[0].metadata.id,
|
|
305
|
+
ip_address=nebius.compute().IPAddress(),
|
|
306
|
+
name='network-interface-0',
|
|
307
|
+
public_ip_address=nebius.compute().PublicIPAddress(
|
|
308
|
+
static=use_static_ip_address)
|
|
309
|
+
if associate_public_ip_address else None,
|
|
310
|
+
)
|
|
311
|
+
],
|
|
312
|
+
recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
|
|
313
|
+
if use_spot else None,
|
|
314
|
+
preemptible=nebius.compute().PreemptibleSpec(
|
|
315
|
+
priority=1,
|
|
316
|
+
on_preemption=nebius.compute().PreemptibleSpec.
|
|
317
|
+
PreemptionPolicy.STOP) if use_spot else None,
|
|
318
|
+
))))
|
|
247
319
|
instance_id = ''
|
|
248
320
|
retry_count = 0
|
|
249
321
|
while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
|
|
250
322
|
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
251
|
-
instance =
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
323
|
+
instance = nebius.sync_call(
|
|
324
|
+
service.get_by_name(nebius.nebius_common().GetByNameRequest(
|
|
325
|
+
parent_id=project_id,
|
|
326
|
+
name=instance_name,
|
|
327
|
+
)))
|
|
328
|
+
instance_id = instance.metadata.id
|
|
255
329
|
if instance.status.state.name == 'STARTING':
|
|
256
|
-
instance_id = instance.metadata.id
|
|
257
330
|
break
|
|
331
|
+
|
|
332
|
+
# All Instances initially have state=STOPPED and reconciling=True,
|
|
333
|
+
# so we need to wait until reconciling is False.
|
|
334
|
+
if instance.status.state.name == 'STOPPED' and \
|
|
335
|
+
not instance.status.reconciling:
|
|
336
|
+
next_token = ''
|
|
337
|
+
total_operations = 0
|
|
338
|
+
while True:
|
|
339
|
+
operations_response = nebius.sync_call(
|
|
340
|
+
service.list_operations_by_parent(
|
|
341
|
+
nebius.compute().ListOperationsByParentRequest(
|
|
342
|
+
parent_id=project_id,
|
|
343
|
+
page_size=100,
|
|
344
|
+
page_token=next_token,
|
|
345
|
+
)))
|
|
346
|
+
total_operations += len(operations_response.operations)
|
|
347
|
+
for operation in operations_response.operations:
|
|
348
|
+
# Find the most recent operation for the instance.
|
|
349
|
+
if operation.resource_id == instance_id:
|
|
350
|
+
error_msg = operation.description
|
|
351
|
+
if operation.status:
|
|
352
|
+
error_msg += f' {operation.status.message}'
|
|
353
|
+
raise RuntimeError(error_msg)
|
|
354
|
+
# If we've fetched too many operations, or there are no more
|
|
355
|
+
# operations to fetch, just raise a generic error.
|
|
356
|
+
if total_operations > _MAX_OPERATIONS_TO_FETCH or \
|
|
357
|
+
not operations_response.next_page_token:
|
|
358
|
+
raise RuntimeError(
|
|
359
|
+
f'Instance {instance_name} failed to start.')
|
|
360
|
+
next_token = operations_response.next_page_token
|
|
258
361
|
time.sleep(POLL_INTERVAL)
|
|
259
|
-
logger.debug(f'Waiting for instance {instance_name} start running.'
|
|
362
|
+
logger.debug(f'Waiting for instance {instance_name} to start running. '
|
|
363
|
+
f'State: {instance.status.state.name}, '
|
|
364
|
+
f'Reconciling: {instance.status.reconciling}')
|
|
260
365
|
retry_count += 1
|
|
261
366
|
|
|
262
367
|
if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
|
|
@@ -271,19 +376,19 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
|
|
|
271
376
|
def remove(instance_id: str) -> None:
|
|
272
377
|
"""Terminates the given instance."""
|
|
273
378
|
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
274
|
-
result =
|
|
275
|
-
nebius.compute().GetInstanceRequest(id=instance_id))
|
|
379
|
+
result = nebius.sync_call(
|
|
380
|
+
service.get(nebius.compute().GetInstanceRequest(id=instance_id)))
|
|
276
381
|
disk_id = result.spec.boot_disk.existing_disk.id
|
|
277
|
-
|
|
278
|
-
nebius.compute().DeleteInstanceRequest(id=instance_id))
|
|
382
|
+
nebius.sync_call(
|
|
383
|
+
service.delete(nebius.compute().DeleteInstanceRequest(id=instance_id)))
|
|
279
384
|
retry_count = 0
|
|
280
385
|
# The instance begins deleting and attempts to delete the disk.
|
|
281
386
|
# Must wait until the disk is unlocked and becomes deletable.
|
|
282
387
|
while retry_count < nebius.MAX_RETRIES_TO_DISK_DELETE:
|
|
283
388
|
try:
|
|
284
389
|
service = nebius.compute().DiskServiceClient(nebius.sdk())
|
|
285
|
-
|
|
286
|
-
nebius.compute().DeleteDiskRequest(id=disk_id))
|
|
390
|
+
nebius.sync_call(
|
|
391
|
+
service.delete(nebius.compute().DeleteDiskRequest(id=disk_id)))
|
|
287
392
|
break
|
|
288
393
|
except nebius.request_error():
|
|
289
394
|
logger.debug('Waiting for disk deletion.')
|
sky/provision/oci/instance.py
CHANGED
|
@@ -10,7 +10,7 @@ import copy
|
|
|
10
10
|
from datetime import datetime
|
|
11
11
|
import time
|
|
12
12
|
import typing
|
|
13
|
-
from typing import Any, Dict, List, Optional
|
|
13
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
14
14
|
|
|
15
15
|
from sky import exceptions
|
|
16
16
|
from sky import sky_logging
|
|
@@ -32,10 +32,12 @@ logger = sky_logging.init_logger(__name__)
|
|
|
32
32
|
@query_utils.debug_enabled(logger)
|
|
33
33
|
@common_utils.retry
|
|
34
34
|
def query_instances(
|
|
35
|
+
cluster_name: str,
|
|
35
36
|
cluster_name_on_cloud: str,
|
|
36
37
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
37
38
|
non_terminated_only: bool = True,
|
|
38
|
-
|
|
39
|
+
retry_if_missing: bool = False,
|
|
40
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
39
41
|
"""Query instances.
|
|
40
42
|
|
|
41
43
|
Returns a dictionary of instance IDs and status.
|
|
@@ -43,11 +45,13 @@ def query_instances(
|
|
|
43
45
|
A None status means the instance is marked as "terminated"
|
|
44
46
|
or "terminating".
|
|
45
47
|
"""
|
|
48
|
+
del cluster_name, retry_if_missing # unused
|
|
46
49
|
assert provider_config is not None, cluster_name_on_cloud
|
|
47
50
|
region = provider_config['region']
|
|
48
51
|
|
|
49
52
|
status_map = oci_utils.oci_config.STATE_MAPPING_OCI_TO_SKY
|
|
50
|
-
statuses: Dict[str, Optional['status_lib.ClusterStatus']
|
|
53
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
54
|
+
Optional[str]]] = {}
|
|
51
55
|
filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
|
|
52
56
|
|
|
53
57
|
instances = _get_filtered_nodes(region, filters)
|
|
@@ -56,15 +60,16 @@ def query_instances(
|
|
|
56
60
|
sky_status = status_map[vm_status]
|
|
57
61
|
if non_terminated_only and sky_status is None:
|
|
58
62
|
continue
|
|
59
|
-
statuses[node['inst_id']] = sky_status
|
|
63
|
+
statuses[node['inst_id']] = (sky_status, None)
|
|
60
64
|
|
|
61
65
|
return statuses
|
|
62
66
|
|
|
63
67
|
|
|
64
68
|
@query_utils.debug_enabled(logger)
|
|
65
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
69
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
66
70
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
67
71
|
"""Start instances with bootstrapped configuration."""
|
|
72
|
+
del cluster_name # unused
|
|
68
73
|
tags = dict(sorted(copy.deepcopy(config.tags).items()))
|
|
69
74
|
|
|
70
75
|
start_time = round(time.time() * 1000)
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
"""Paperspace instance provisioning."""
|
|
2
2
|
|
|
3
3
|
import time
|
|
4
|
-
from typing import Any, Dict, List, Optional
|
|
4
|
+
from typing import Any, Dict, List, Optional, Tuple
|
|
5
5
|
|
|
6
6
|
from sky import sky_logging
|
|
7
7
|
from sky.provision import common
|
|
@@ -48,10 +48,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
48
48
|
return head_instance_id
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
51
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
52
52
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
53
53
|
"""Runs instances for the given cluster."""
|
|
54
|
-
|
|
54
|
+
del cluster_name # unused
|
|
55
55
|
pending_status = [
|
|
56
56
|
'starting', 'restarting', 'upgrading', 'provisioning', 'stopping'
|
|
57
57
|
]
|
|
@@ -277,12 +277,14 @@ def get_cluster_info(
|
|
|
277
277
|
|
|
278
278
|
|
|
279
279
|
def query_instances(
|
|
280
|
+
cluster_name: str,
|
|
280
281
|
cluster_name_on_cloud: str,
|
|
281
282
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
282
283
|
non_terminated_only: bool = True,
|
|
283
|
-
|
|
284
|
+
retry_if_missing: bool = False,
|
|
285
|
+
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
284
286
|
"""See sky/provision/__init__.py"""
|
|
285
|
-
del non_terminated_only
|
|
287
|
+
del cluster_name, non_terminated_only, retry_if_missing #unused
|
|
286
288
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
287
289
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
288
290
|
|
|
@@ -297,10 +299,11 @@ def query_instances(
|
|
|
297
299
|
'ready': status_lib.ClusterStatus.UP,
|
|
298
300
|
'off': status_lib.ClusterStatus.STOPPED,
|
|
299
301
|
}
|
|
300
|
-
statuses: Dict[str, Optional[status_lib.ClusterStatus]
|
|
302
|
+
statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
|
|
303
|
+
Optional[str]]] = {}
|
|
301
304
|
for inst_id, inst in instances.items():
|
|
302
305
|
status = status_map[inst['state']]
|
|
303
|
-
statuses[inst_id] = status
|
|
306
|
+
statuses[inst_id] = (status, None)
|
|
304
307
|
return statuses
|
|
305
308
|
|
|
306
309
|
|
|
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Union
|
|
|
8
8
|
|
|
9
9
|
from sky import sky_logging
|
|
10
10
|
from sky.adaptors import common as adaptors_common
|
|
11
|
-
|
|
11
|
+
from sky.provision.paperspace import constants
|
|
12
12
|
from sky.utils import common_utils
|
|
13
13
|
|
|
14
14
|
if typing.TYPE_CHECKING:
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Prime Intellect provisioner for SkyPilot."""
|
|
2
|
+
|
|
3
|
+
from sky.provision.primeintellect.config import bootstrap_instances
|
|
4
|
+
from sky.provision.primeintellect.instance import cleanup_ports
|
|
5
|
+
from sky.provision.primeintellect.instance import get_cluster_info
|
|
6
|
+
from sky.provision.primeintellect.instance import query_instances
|
|
7
|
+
from sky.provision.primeintellect.instance import run_instances
|
|
8
|
+
from sky.provision.primeintellect.instance import stop_instances
|
|
9
|
+
from sky.provision.primeintellect.instance import terminate_instances
|
|
10
|
+
from sky.provision.primeintellect.instance import wait_instances
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Prime Intellect configuration bootstrapping."""
|
|
2
|
+
|
|
3
|
+
from sky.provision import common
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def bootstrap_instances(
|
|
7
|
+
region: str, cluster_name: str,
|
|
8
|
+
config: common.ProvisionConfig) -> common.ProvisionConfig:
|
|
9
|
+
"""Bootstraps instances for the given cluster."""
|
|
10
|
+
del region, cluster_name # unused
|
|
11
|
+
return config
|