skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +10 -2
- sky/adaptors/aws.py +81 -16
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +588 -184
- sky/backends/cloud_vm_ray_backend.py +1088 -904
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +633 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -0
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +73 -43
- sky/client/cli/command.py +675 -412
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +132 -63
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +6 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +40 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +33 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +174 -165
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +162 -29
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +37 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +97 -26
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +382 -418
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +30 -9
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +5 -3
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/requests/executor.py +381 -145
- sky/server/requests/payloads.py +71 -18
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +121 -0
- sky/server/requests/requests.py +507 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +85 -20
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +420 -172
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +62 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +106 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +111 -26
- sky/skylet/events.py +64 -10
- sky/skylet/job_lib.py +141 -104
- sky/skylet/log_lib.py +233 -5
- sky/skylet/log_lib.pyi +40 -2
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +22 -1
- sky/skylet/subprocess_daemon.py +104 -29
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +196 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/vast-ray.yml.j2 +1 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/permission.py +60 -43
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +22 -0
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +88 -27
- sky/utils/command_runner.pyi +36 -3
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +37 -4
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +107 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
- skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
- skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,13 +1,15 @@
|
|
|
1
1
|
"""Kubernetes pvc provisioning."""
|
|
2
|
-
from typing import Any, Dict, List, Optional, Tuple
|
|
2
|
+
from typing import Any, Dict, List, Optional, Set, Tuple
|
|
3
3
|
|
|
4
4
|
from sky import global_user_state
|
|
5
5
|
from sky import models
|
|
6
6
|
from sky import sky_logging
|
|
7
7
|
from sky.adaptors import kubernetes
|
|
8
|
+
from sky.provision import constants
|
|
8
9
|
from sky.provision.kubernetes import config as config_lib
|
|
9
10
|
from sky.provision.kubernetes import constants as k8s_constants
|
|
10
11
|
from sky.provision.kubernetes import utils as kubernetes_utils
|
|
12
|
+
from sky.utils import resources_utils
|
|
11
13
|
from sky.utils import volume as volume_lib
|
|
12
14
|
|
|
13
15
|
logger = sky_logging.init_logger(__name__)
|
|
@@ -67,7 +69,7 @@ def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
|
|
|
67
69
|
except kubernetes.api_exception() as e:
|
|
68
70
|
raise config_lib.KubernetesError(
|
|
69
71
|
f'Check storage class {storage_class_name} error: {e}')
|
|
70
|
-
create_persistent_volume_claim(namespace, context, pvc_spec)
|
|
72
|
+
create_persistent_volume_claim(namespace, context, pvc_spec, config)
|
|
71
73
|
return config
|
|
72
74
|
|
|
73
75
|
|
|
@@ -75,7 +77,6 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
|
|
|
75
77
|
"""Deletes a volume."""
|
|
76
78
|
context, namespace = _get_context_namespace(config)
|
|
77
79
|
pvc_name = config.name_on_cloud
|
|
78
|
-
logger.info(f'Deleting PVC {pvc_name}')
|
|
79
80
|
kubernetes_utils.delete_k8s_resource_with_retry(
|
|
80
81
|
delete_func=lambda pvc_name=pvc_name: kubernetes.core_api(
|
|
81
82
|
context).delete_namespaced_persistent_volume_claim(
|
|
@@ -84,6 +85,7 @@ def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
|
|
|
84
85
|
_request_timeout=config_lib.DELETION_TIMEOUT),
|
|
85
86
|
resource_type='pvc',
|
|
86
87
|
resource_name=pvc_name)
|
|
88
|
+
logger.info(f'Deleted PVC {pvc_name} in namespace {namespace}')
|
|
87
89
|
return config
|
|
88
90
|
|
|
89
91
|
|
|
@@ -128,7 +130,7 @@ def _get_volume_usedby(
|
|
|
128
130
|
usedby_pods.append(pod.metadata.name)
|
|
129
131
|
# Get the real cluster name
|
|
130
132
|
cluster_name_on_cloud = pod.metadata.labels.get(
|
|
131
|
-
|
|
133
|
+
constants.TAG_SKYPILOT_CLUSTER_NAME)
|
|
132
134
|
if cluster_name_on_cloud is None:
|
|
133
135
|
continue
|
|
134
136
|
cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
|
|
@@ -160,21 +162,154 @@ def get_volume_usedby(
|
|
|
160
162
|
return _get_volume_usedby(context, namespace, pvc_name)
|
|
161
163
|
|
|
162
164
|
|
|
163
|
-
def
|
|
164
|
-
|
|
165
|
+
def get_all_volumes_usedby(
|
|
166
|
+
configs: List[models.VolumeConfig],
|
|
167
|
+
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
|
|
168
|
+
"""Gets the usedby resources of all volumes."""
|
|
169
|
+
field_selector = ','.join([
|
|
170
|
+
f'status.phase!={phase}'
|
|
171
|
+
for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES
|
|
172
|
+
])
|
|
173
|
+
label_selector = 'parent=skypilot'
|
|
174
|
+
context_to_namespaces: Dict[str, Set[str]] = {}
|
|
175
|
+
pvc_names = set()
|
|
176
|
+
for config in configs:
|
|
177
|
+
context, namespace = _get_context_namespace(config)
|
|
178
|
+
if context not in context_to_namespaces:
|
|
179
|
+
context_to_namespaces[context] = set()
|
|
180
|
+
context_to_namespaces[context].add(namespace)
|
|
181
|
+
pvc_names.add(config.name_on_cloud)
|
|
182
|
+
cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
|
|
183
|
+
# Get all pods in the namespace
|
|
184
|
+
used_by_pods: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
|
|
185
|
+
used_by_clusters: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
|
|
186
|
+
for context, namespaces in context_to_namespaces.items():
|
|
187
|
+
used_by_pods[context] = {}
|
|
188
|
+
used_by_clusters[context] = {}
|
|
189
|
+
for namespace in namespaces:
|
|
190
|
+
used_by_pods[context][namespace] = {}
|
|
191
|
+
used_by_clusters[context][namespace] = {}
|
|
192
|
+
pods = kubernetes.core_api(context).list_namespaced_pod(
|
|
193
|
+
namespace=namespace,
|
|
194
|
+
field_selector=field_selector,
|
|
195
|
+
label_selector=label_selector)
|
|
196
|
+
for pod in pods.items:
|
|
197
|
+
if pod.spec.volumes is None:
|
|
198
|
+
continue
|
|
199
|
+
for volume in pod.spec.volumes:
|
|
200
|
+
if volume.persistent_volume_claim is None:
|
|
201
|
+
continue
|
|
202
|
+
volume_name = volume.persistent_volume_claim.claim_name
|
|
203
|
+
if volume_name not in pvc_names:
|
|
204
|
+
continue
|
|
205
|
+
if volume_name not in used_by_pods[context][namespace]:
|
|
206
|
+
used_by_pods[context][namespace][volume_name] = []
|
|
207
|
+
used_by_pods[context][namespace][volume_name].append(
|
|
208
|
+
pod.metadata.name)
|
|
209
|
+
cluster_name_on_cloud = pod.metadata.labels.get(
|
|
210
|
+
constants.TAG_SKYPILOT_CLUSTER_NAME)
|
|
211
|
+
if cluster_name_on_cloud is None:
|
|
212
|
+
continue
|
|
213
|
+
cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
|
|
214
|
+
if cluster_name is None:
|
|
215
|
+
continue
|
|
216
|
+
if cluster_name not in used_by_clusters[context][namespace]:
|
|
217
|
+
used_by_clusters[context][namespace][cluster_name] = []
|
|
218
|
+
used_by_clusters[context][namespace][cluster_name].append(
|
|
219
|
+
cluster_name)
|
|
220
|
+
return used_by_pods, used_by_clusters
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def map_all_volumes_usedby(
|
|
224
|
+
used_by_pods: Dict[str, Any], used_by_clusters: Dict[str, Any],
|
|
225
|
+
config: models.VolumeConfig) -> Tuple[List[str], List[str]]:
|
|
226
|
+
"""Maps the usedby resources of a volume."""
|
|
227
|
+
context, namespace = _get_context_namespace(config)
|
|
228
|
+
pvc_name = config.name_on_cloud
|
|
229
|
+
|
|
230
|
+
return (used_by_pods.get(context, {}).get(namespace, {}).get(pvc_name, []),
|
|
231
|
+
used_by_clusters.get(context, {}).get(namespace,
|
|
232
|
+
{}).get(pvc_name, []))
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def _populate_config_from_pvc(config: models.VolumeConfig,
|
|
236
|
+
pvc_obj: Any) -> None:
|
|
237
|
+
"""Populate missing fields in config from a PVC object.
|
|
238
|
+
|
|
239
|
+
Args:
|
|
240
|
+
config: VolumeConfig to populate
|
|
241
|
+
pvc_obj: V1PersistentVolumeClaim object from kubernetes client
|
|
242
|
+
"""
|
|
243
|
+
if pvc_obj is None:
|
|
244
|
+
return
|
|
245
|
+
pvc_name = pvc_obj.metadata.name
|
|
246
|
+
|
|
247
|
+
# Populate storageClassName if not set
|
|
248
|
+
if config.config.get('storage_class_name') is None:
|
|
249
|
+
pvc_storage_class = getattr(pvc_obj.spec, 'storage_class_name', None)
|
|
250
|
+
if pvc_storage_class:
|
|
251
|
+
config.config['storage_class_name'] = pvc_storage_class
|
|
252
|
+
|
|
253
|
+
# Populate size if not set (prefer bound capacity, fallback to requested)
|
|
254
|
+
pvc_size = None
|
|
255
|
+
size_quantity = None
|
|
256
|
+
# Try status.capacity (dict) - actual bound size
|
|
257
|
+
capacity = getattr(getattr(pvc_obj, 'status', None), 'capacity', None)
|
|
258
|
+
if isinstance(capacity, dict) and 'storage' in capacity:
|
|
259
|
+
size_quantity = capacity['storage']
|
|
260
|
+
# Fallback to spec.resources.requests (dict) - requested size
|
|
261
|
+
if size_quantity is None:
|
|
262
|
+
requests = getattr(getattr(pvc_obj.spec, 'resources', None), 'requests',
|
|
263
|
+
None)
|
|
264
|
+
if isinstance(requests, dict):
|
|
265
|
+
size_quantity = requests.get('storage')
|
|
266
|
+
# Parse and normalize the size if found
|
|
267
|
+
if size_quantity:
|
|
268
|
+
try:
|
|
269
|
+
# Normalize to GB string (e.g., '20')
|
|
270
|
+
pvc_size = resources_utils.parse_memory_resource(
|
|
271
|
+
size_quantity, 'size', allow_rounding=True)
|
|
272
|
+
except ValueError as e:
|
|
273
|
+
# Just log the error since it is not critical.
|
|
274
|
+
logger.warning(f'Failed to parse PVC size {size_quantity!r} '
|
|
275
|
+
f'for PVC {pvc_name}: {e}')
|
|
276
|
+
if pvc_size is not None:
|
|
277
|
+
if config.size is not None and config.size != pvc_size:
|
|
278
|
+
logger.warning(f'PVC {pvc_name} has size {pvc_size} but config '
|
|
279
|
+
f'size is {config.size}, overriding the config size'
|
|
280
|
+
f' with the PVC size.')
|
|
281
|
+
config.size = pvc_size
|
|
282
|
+
|
|
283
|
+
|
|
284
|
+
def create_persistent_volume_claim(
|
|
285
|
+
namespace: str,
|
|
286
|
+
context: Optional[str],
|
|
287
|
+
pvc_spec: Dict[str, Any],
|
|
288
|
+
config: Optional[models.VolumeConfig] = None,
|
|
289
|
+
) -> None:
|
|
165
290
|
"""Creates a persistent volume claim for SkyServe controller."""
|
|
166
291
|
pvc_name = pvc_spec['metadata']['name']
|
|
167
292
|
try:
|
|
168
|
-
kubernetes.core_api(
|
|
169
|
-
|
|
293
|
+
pvc = kubernetes.core_api(
|
|
294
|
+
context).read_namespaced_persistent_volume_claim(
|
|
295
|
+
name=pvc_name, namespace=namespace)
|
|
296
|
+
if config is not None:
|
|
297
|
+
_populate_config_from_pvc(config, pvc)
|
|
170
298
|
logger.debug(f'PVC {pvc_name} already exists')
|
|
171
299
|
return
|
|
172
300
|
except kubernetes.api_exception() as e:
|
|
173
301
|
if e.status != 404: # Not found
|
|
174
302
|
raise
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
303
|
+
use_existing = config is not None and config.config.get('use_existing')
|
|
304
|
+
if use_existing:
|
|
305
|
+
raise ValueError(
|
|
306
|
+
f'PVC {pvc_name} does not exist while use_existing is True.')
|
|
307
|
+
pvc = kubernetes.core_api(
|
|
308
|
+
context).create_namespaced_persistent_volume_claim(namespace=namespace,
|
|
309
|
+
body=pvc_spec)
|
|
310
|
+
logger.info(f'Created PVC {pvc_name} in namespace {namespace}')
|
|
311
|
+
if config is not None:
|
|
312
|
+
_populate_config_from_pvc(config, pvc)
|
|
178
313
|
|
|
179
314
|
|
|
180
315
|
def _get_pvc_spec(namespace: str,
|
|
@@ -183,8 +318,8 @@ def _get_pvc_spec(namespace: str,
|
|
|
183
318
|
access_mode = config.config.get('access_mode')
|
|
184
319
|
size = config.size
|
|
185
320
|
# The previous code assumes that the access_mode and size are always set.
|
|
186
|
-
assert access_mode is not None
|
|
187
|
-
|
|
321
|
+
assert access_mode is not None, f'access_mode is None for volume ' \
|
|
322
|
+
f'{config.name_on_cloud}'
|
|
188
323
|
pvc_spec: Dict[str, Any] = {
|
|
189
324
|
'metadata': {
|
|
190
325
|
'name': config.name_on_cloud,
|
|
@@ -196,13 +331,10 @@ def _get_pvc_spec(namespace: str,
|
|
|
196
331
|
},
|
|
197
332
|
'spec': {
|
|
198
333
|
'accessModes': [access_mode],
|
|
199
|
-
'resources': {
|
|
200
|
-
'requests': {
|
|
201
|
-
'storage': f'{size}Gi'
|
|
202
|
-
}
|
|
203
|
-
},
|
|
204
334
|
}
|
|
205
335
|
}
|
|
336
|
+
if size is not None:
|
|
337
|
+
pvc_spec['spec']['resources'] = {'requests': {'storage': f'{size}Gi'}}
|
|
206
338
|
if config.labels:
|
|
207
339
|
pvc_spec['metadata']['labels'].update(config.labels)
|
|
208
340
|
storage_class = config.config.get('storage_class_name')
|
|
@@ -68,9 +68,10 @@ def _get_private_ip(instance_info: Dict[str, Any], single_node: bool) -> str:
|
|
|
68
68
|
return private_ip
|
|
69
69
|
|
|
70
70
|
|
|
71
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
71
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
72
72
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
73
73
|
"""Runs instances for the given cluster"""
|
|
74
|
+
del cluster_name # unused
|
|
74
75
|
lambda_client = _get_lambda_client()
|
|
75
76
|
pending_status = ['booting']
|
|
76
77
|
while True:
|
|
@@ -106,34 +107,35 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
106
107
|
created_instance_ids = []
|
|
107
108
|
remote_ssh_key_name = config.authentication_config['remote_key_name']
|
|
108
109
|
|
|
109
|
-
def
|
|
110
|
+
def launch_node(node_type: str) -> str:
|
|
110
111
|
try:
|
|
111
112
|
instance_ids = lambda_client.create_instances(
|
|
112
113
|
instance_type=config.node_config['InstanceType'],
|
|
113
114
|
region=region,
|
|
114
115
|
name=f'{cluster_name_on_cloud}-{node_type}',
|
|
115
|
-
|
|
116
|
+
# Quantity cannot actually be greater than 1; see:
|
|
117
|
+
# https://github.com/skypilot-org/skypilot/issues/7084
|
|
118
|
+
quantity=1,
|
|
116
119
|
ssh_key_name=remote_ssh_key_name,
|
|
117
120
|
)
|
|
118
|
-
logger.info(f'Launched {
|
|
119
|
-
f'
|
|
120
|
-
return instance_ids
|
|
121
|
+
logger.info(f'Launched {node_type} node, '
|
|
122
|
+
f'instance_id: {instance_ids[0]}')
|
|
123
|
+
return instance_ids[0]
|
|
121
124
|
except Exception as e:
|
|
122
125
|
logger.warning(f'run_instances error: {e}')
|
|
123
126
|
raise
|
|
124
127
|
|
|
125
128
|
if head_instance_id is None:
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
created_instance_ids.append(instance_ids[0])
|
|
129
|
-
head_instance_id = instance_ids[0]
|
|
129
|
+
head_instance_id = launch_node('head')
|
|
130
|
+
created_instance_ids.append(head_instance_id)
|
|
130
131
|
|
|
131
132
|
assert head_instance_id is not None, 'head_instance_id should not be None'
|
|
132
133
|
|
|
133
134
|
worker_node_count = to_start_count - 1
|
|
134
135
|
if worker_node_count > 0:
|
|
135
|
-
|
|
136
|
-
|
|
136
|
+
for _ in range(worker_node_count):
|
|
137
|
+
worker_instance_id = launch_node('worker')
|
|
138
|
+
created_instance_ids.append(worker_instance_id)
|
|
137
139
|
|
|
138
140
|
while True:
|
|
139
141
|
instances = _filter_instances(cluster_name_on_cloud, ['active'])
|
|
@@ -230,9 +232,10 @@ def query_instances(
|
|
|
230
232
|
cluster_name_on_cloud: str,
|
|
231
233
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
232
234
|
non_terminated_only: bool = True,
|
|
235
|
+
retry_if_missing: bool = False,
|
|
233
236
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
234
237
|
"""See sky/provision/__init__.py"""
|
|
235
|
-
del cluster_name # unused
|
|
238
|
+
del cluster_name, retry_if_missing # unused
|
|
236
239
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
237
240
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
238
241
|
|
sky/provision/nebius/instance.py
CHANGED
|
@@ -65,9 +65,10 @@ def _wait_until_no_pending(region: str, cluster_name_on_cloud: str) -> None:
|
|
|
65
65
|
f' to be ready.')
|
|
66
66
|
|
|
67
67
|
|
|
68
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
68
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
69
69
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
70
70
|
"""Runs instances for the given cluster."""
|
|
71
|
+
del cluster_name # unused
|
|
71
72
|
_wait_until_no_pending(region, cluster_name_on_cloud)
|
|
72
73
|
running_instances = _filter_instances(region, cluster_name_on_cloud,
|
|
73
74
|
['RUNNING'])
|
|
@@ -137,6 +138,8 @@ def run_instances(region: str, cluster_name_on_cloud: str,
|
|
|
137
138
|
use_spot=config.node_config['use_spot'],
|
|
138
139
|
associate_public_ip_address=(
|
|
139
140
|
not config.provider_config['use_internal_ips']),
|
|
141
|
+
use_static_ip_address=config.provider_config.get(
|
|
142
|
+
'use_static_ip_address', False),
|
|
140
143
|
filesystems=config.node_config.get('filesystems', []),
|
|
141
144
|
network_tier=config.node_config.get('network_tier'))
|
|
142
145
|
except Exception as e: # pylint: disable=broad-except
|
|
@@ -251,9 +254,10 @@ def query_instances(
|
|
|
251
254
|
cluster_name_on_cloud: str,
|
|
252
255
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
253
256
|
non_terminated_only: bool = True,
|
|
257
|
+
retry_if_missing: bool = False,
|
|
254
258
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
255
259
|
"""See sky/provision/__init__.py"""
|
|
256
|
-
del cluster_name # unused
|
|
260
|
+
del cluster_name, retry_if_missing # unused
|
|
257
261
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
258
262
|
instances = _filter_instances(provider_config['region'],
|
|
259
263
|
cluster_name_on_cloud, None)
|
sky/provision/nebius/utils.py
CHANGED
|
@@ -188,6 +188,7 @@ def launch(cluster_name_on_cloud: str,
|
|
|
188
188
|
user_data: str,
|
|
189
189
|
associate_public_ip_address: bool,
|
|
190
190
|
filesystems: List[Dict[str, Any]],
|
|
191
|
+
use_static_ip_address: bool = False,
|
|
191
192
|
use_spot: bool = False,
|
|
192
193
|
network_tier: Optional[resources_utils.NetworkTier] = None) -> str:
|
|
193
194
|
# Each node must have a unique name to avoid conflicts between
|
|
@@ -281,93 +282,109 @@ def launch(cluster_name_on_cloud: str,
|
|
|
281
282
|
|
|
282
283
|
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
283
284
|
logger.debug(f'Creating instance {instance_name} in project {project_id}.')
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
raise
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
285
|
+
try:
|
|
286
|
+
nebius.sync_call(
|
|
287
|
+
service.create(nebius.compute().CreateInstanceRequest(
|
|
288
|
+
metadata=nebius.nebius_common().ResourceMetadata(
|
|
289
|
+
parent_id=project_id,
|
|
290
|
+
name=instance_name,
|
|
291
|
+
),
|
|
292
|
+
spec=nebius.compute().InstanceSpec(
|
|
293
|
+
gpu_cluster=nebius.compute().InstanceGpuClusterSpec(
|
|
294
|
+
id=cluster_id,) if cluster_id is not None else None,
|
|
295
|
+
boot_disk=nebius.compute().AttachedDiskSpec(
|
|
296
|
+
attach_mode=nebius.compute(
|
|
297
|
+
).AttachedDiskSpec.AttachMode.READ_WRITE,
|
|
298
|
+
existing_disk=nebius.compute().ExistingDisk(
|
|
299
|
+
id=disk_id)),
|
|
300
|
+
cloud_init_user_data=user_data,
|
|
301
|
+
resources=nebius.compute().ResourcesSpec(platform=platform,
|
|
302
|
+
preset=preset),
|
|
303
|
+
filesystems=filesystems_spec if filesystems_spec else None,
|
|
304
|
+
network_interfaces=[
|
|
305
|
+
nebius.compute().NetworkInterfaceSpec(
|
|
306
|
+
subnet_id=sub_net.items[0].metadata.id,
|
|
307
|
+
ip_address=nebius.compute().IPAddress(),
|
|
308
|
+
name='network-interface-0',
|
|
309
|
+
public_ip_address=nebius.compute().PublicIPAddress(
|
|
310
|
+
static=use_static_ip_address)
|
|
311
|
+
if associate_public_ip_address else None,
|
|
312
|
+
)
|
|
313
|
+
],
|
|
314
|
+
recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
|
|
315
|
+
if use_spot else None,
|
|
316
|
+
preemptible=nebius.compute().PreemptibleSpec(
|
|
317
|
+
priority=1,
|
|
318
|
+
on_preemption=nebius.compute().PreemptibleSpec.
|
|
319
|
+
PreemptionPolicy.STOP) if use_spot else None,
|
|
320
|
+
))))
|
|
321
|
+
instance_id = ''
|
|
322
|
+
retry_count = 0
|
|
323
|
+
while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
|
|
324
|
+
service = nebius.compute().InstanceServiceClient(nebius.sdk())
|
|
325
|
+
instance = nebius.sync_call(
|
|
326
|
+
service.get_by_name(nebius.nebius_common().GetByNameRequest(
|
|
327
|
+
parent_id=project_id,
|
|
328
|
+
name=instance_name,
|
|
329
|
+
)))
|
|
330
|
+
instance_id = instance.metadata.id
|
|
331
|
+
if instance.status.state.name == 'STARTING':
|
|
332
|
+
break
|
|
333
|
+
|
|
334
|
+
# All Instances initially have state=STOPPED and reconciling=True,
|
|
335
|
+
# so we need to wait until reconciling is False.
|
|
336
|
+
if instance.status.state.name == 'STOPPED' and \
|
|
337
|
+
not instance.status.reconciling:
|
|
338
|
+
next_token = ''
|
|
339
|
+
total_operations = 0
|
|
340
|
+
while True:
|
|
341
|
+
operations_response = nebius.sync_call(
|
|
342
|
+
service.list_operations_by_parent(
|
|
343
|
+
nebius.compute().ListOperationsByParentRequest(
|
|
344
|
+
parent_id=project_id,
|
|
345
|
+
page_size=100,
|
|
346
|
+
page_token=next_token,
|
|
347
|
+
)))
|
|
348
|
+
total_operations += len(operations_response.operations)
|
|
349
|
+
for operation in operations_response.operations:
|
|
350
|
+
# Find the most recent operation for the instance.
|
|
351
|
+
if operation.resource_id == instance_id:
|
|
352
|
+
error_msg = operation.description
|
|
353
|
+
if operation.status:
|
|
354
|
+
error_msg += f' {operation.status.message}'
|
|
355
|
+
raise RuntimeError(error_msg)
|
|
356
|
+
# If we've fetched too many operations, or there are no more
|
|
357
|
+
# operations to fetch, just raise a generic error.
|
|
358
|
+
if total_operations > _MAX_OPERATIONS_TO_FETCH or \
|
|
359
|
+
not operations_response.next_page_token:
|
|
360
|
+
raise RuntimeError(
|
|
361
|
+
f'Instance {instance_name} failed to start.')
|
|
362
|
+
next_token = operations_response.next_page_token
|
|
363
|
+
time.sleep(POLL_INTERVAL)
|
|
364
|
+
logger.debug(
|
|
365
|
+
f'Waiting for instance {instance_name} to start running. '
|
|
366
|
+
f'State: {instance.status.state.name}, '
|
|
367
|
+
f'Reconciling: {instance.status.reconciling}')
|
|
368
|
+
retry_count += 1
|
|
364
369
|
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
370
|
+
if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
|
|
371
|
+
raise TimeoutError(
|
|
372
|
+
f'Exceeded maximum retries '
|
|
373
|
+
f'({nebius.MAX_RETRIES_TO_INSTANCE_READY * POLL_INTERVAL}'
|
|
374
|
+
f' seconds) while waiting for instance {instance_name}'
|
|
375
|
+
f' to be ready.')
|
|
376
|
+
except nebius.request_error() as e:
|
|
377
|
+
# Handle ResourceExhausted quota limit error. In this case, we need to
|
|
378
|
+
# clean up the disk as VM creation failed and we can't proceed.
|
|
379
|
+
# It cannot be handled by the caller (provisioner)'s teardown logic,
|
|
380
|
+
# as we cannot retrieve the disk id, after the instance creation
|
|
381
|
+
# fails
|
|
382
|
+
logger.warning(f'Failed to launch instance {instance_name}: {e}')
|
|
383
|
+
service = nebius.compute().DiskServiceClient(nebius.sdk())
|
|
384
|
+
nebius.sync_call(
|
|
385
|
+
service.delete(nebius.compute().DeleteDiskRequest(id=disk_id)))
|
|
386
|
+
logger.debug(f'Disk {disk_id} deleted.')
|
|
387
|
+
raise e
|
|
371
388
|
return instance_id
|
|
372
389
|
|
|
373
390
|
|
sky/provision/oci/instance.py
CHANGED
|
@@ -36,6 +36,7 @@ def query_instances(
|
|
|
36
36
|
cluster_name_on_cloud: str,
|
|
37
37
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
38
38
|
non_terminated_only: bool = True,
|
|
39
|
+
retry_if_missing: bool = False,
|
|
39
40
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
40
41
|
"""Query instances.
|
|
41
42
|
|
|
@@ -44,7 +45,7 @@ def query_instances(
|
|
|
44
45
|
A None status means the instance is marked as "terminated"
|
|
45
46
|
or "terminating".
|
|
46
47
|
"""
|
|
47
|
-
del cluster_name #
|
|
48
|
+
del cluster_name, retry_if_missing # unused
|
|
48
49
|
assert provider_config is not None, cluster_name_on_cloud
|
|
49
50
|
region = provider_config['region']
|
|
50
51
|
|
|
@@ -65,9 +66,10 @@ def query_instances(
|
|
|
65
66
|
|
|
66
67
|
|
|
67
68
|
@query_utils.debug_enabled(logger)
|
|
68
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
69
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
69
70
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
70
71
|
"""Start instances with bootstrapped configuration."""
|
|
72
|
+
del cluster_name # unused
|
|
71
73
|
tags = dict(sorted(copy.deepcopy(config.tags).items()))
|
|
72
74
|
|
|
73
75
|
start_time = round(time.time() * 1000)
|
|
@@ -48,10 +48,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
|
|
|
48
48
|
return head_instance_id
|
|
49
49
|
|
|
50
50
|
|
|
51
|
-
def run_instances(region: str, cluster_name_on_cloud: str,
|
|
51
|
+
def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
|
|
52
52
|
config: common.ProvisionConfig) -> common.ProvisionRecord:
|
|
53
53
|
"""Runs instances for the given cluster."""
|
|
54
|
-
|
|
54
|
+
del cluster_name # unused
|
|
55
55
|
pending_status = [
|
|
56
56
|
'starting', 'restarting', 'upgrading', 'provisioning', 'stopping'
|
|
57
57
|
]
|
|
@@ -281,9 +281,10 @@ def query_instances(
|
|
|
281
281
|
cluster_name_on_cloud: str,
|
|
282
282
|
provider_config: Optional[Dict[str, Any]] = None,
|
|
283
283
|
non_terminated_only: bool = True,
|
|
284
|
+
retry_if_missing: bool = False,
|
|
284
285
|
) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
|
|
285
286
|
"""See sky/provision/__init__.py"""
|
|
286
|
-
del cluster_name, non_terminated_only #unused
|
|
287
|
+
del cluster_name, non_terminated_only, retry_if_missing #unused
|
|
287
288
|
assert provider_config is not None, (cluster_name_on_cloud, provider_config)
|
|
288
289
|
instances = _filter_instances(cluster_name_on_cloud, None)
|
|
289
290
|
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
"""Prime Intellect provisioner for SkyPilot."""
|
|
2
|
+
|
|
3
|
+
from sky.provision.primeintellect.config import bootstrap_instances
|
|
4
|
+
from sky.provision.primeintellect.instance import cleanup_ports
|
|
5
|
+
from sky.provision.primeintellect.instance import get_cluster_info
|
|
6
|
+
from sky.provision.primeintellect.instance import query_instances
|
|
7
|
+
from sky.provision.primeintellect.instance import run_instances
|
|
8
|
+
from sky.provision.primeintellect.instance import stop_instances
|
|
9
|
+
from sky.provision.primeintellect.instance import terminate_instances
|
|
10
|
+
from sky.provision.primeintellect.instance import wait_instances
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""Prime Intellect configuration bootstrapping."""
|
|
2
|
+
|
|
3
|
+
from sky.provision import common
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
def bootstrap_instances(
|
|
7
|
+
region: str, cluster_name: str,
|
|
8
|
+
config: common.ProvisionConfig) -> common.ProvisionConfig:
|
|
9
|
+
"""Bootstraps instances for the given cluster."""
|
|
10
|
+
del region, cluster_name # unused
|
|
11
|
+
return config
|