skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/provision/slurm/utils.py
ADDED
@@ -0,0 +1,583 @@
+"""Slurm utilities for SkyPilot."""
+import math
+import os
+import re
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from paramiko.config import SSHConfig
+
+from sky import exceptions
+from sky import sky_logging
+from sky.adaptors import slurm
+from sky.utils import annotations
+from sky.utils import common_utils
+
+logger = sky_logging.init_logger(__name__)
+
+# TODO(jwj): Choose commonly used default values.
+DEFAULT_SLURM_PATH = '~/.slurm/config'
+DEFAULT_CLUSTER_NAME = 'localcluster'
+DEFAULT_PARTITION = 'dev'
+
+
+def get_slurm_ssh_config() -> SSHConfig:
+    """Get the Slurm SSH config."""
+    slurm_config_path = os.path.expanduser(DEFAULT_SLURM_PATH)
+    slurm_config = SSHConfig.from_path(slurm_config_path)
+    return slurm_config
+
+
+class SlurmInstanceType:
+    """Class to represent the "Instance Type" in a Slurm cluster.
+
+    Since Slurm does not have a notion of instances, we generate
+    virtual instance types that represent the resources requested by a
+    Slurm worker node.
+
+    This name captures the following resource requests:
+    - CPU
+    - Memory
+    - Accelerators
+
+    The name format is "{n}CPU--{k}GB" where n is the number of vCPUs and
+    k is the amount of memory in GB. Accelerators can be specified by
+    appending "--{type}:{a}" where type is the accelerator type and a
+    is the number of accelerators.
+    CPU and memory can be specified as floats. Accelerator count must be int.
+
+    Examples:
+    - 4CPU--16GB
+    - 0.5CPU--1.5GB
+    - 4CPU--16GB--V100:1
+    """
+
+    def __init__(self,
+                 cpus: float,
+                 memory: float,
+                 accelerator_count: Optional[int] = None,
+                 accelerator_type: Optional[str] = None):
+        self.cpus = cpus
+        self.memory = memory
+        self.accelerator_count = accelerator_count
+        self.accelerator_type = accelerator_type
+
+    @property
+    def name(self) -> str:
+        """Returns the name of the instance."""
+        assert self.cpus is not None
+        assert self.memory is not None
+        name = (f'{common_utils.format_float(self.cpus)}CPU--'
+                f'{common_utils.format_float(self.memory)}GB')
+        if self.accelerator_count is not None:
+            # Replace spaces with underscores in accelerator type to make it a
+            # valid logical instance type name.
+            assert self.accelerator_type is not None, self.accelerator_count
+            acc_name = self.accelerator_type.replace(' ', '_')
+            name += f'--{acc_name}:{self.accelerator_count}'
+        return name
+
+    @staticmethod
+    def is_valid_instance_type(name: str) -> bool:
+        """Returns whether the given name is a valid instance type."""
+        pattern = re.compile(
+            r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--[\w\d-]+:\d+)?$')
+        return bool(pattern.match(name))
+
+    @classmethod
+    def _parse_instance_type(
+            cls,
+            name: str) -> Tuple[float, float, Optional[int], Optional[str]]:
+        """Parses and returns resources from the given InstanceType name.
+
+        Returns:
+            cpus | float: Number of CPUs
+            memory | float: Amount of memory in GB
+            accelerator_count | float: Number of accelerators
+            accelerator_type | str: Type of accelerator
+        """
+        pattern = re.compile(
+            r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_type>[\w\d-]+):(?P<accelerator_count>\d+))?$'  # pylint: disable=line-too-long
+        )
+        match = pattern.match(name)
+        if match is not None:
+            cpus = float(match.group('cpus'))
+            memory = float(match.group('memory'))
+            accelerator_count = match.group('accelerator_count')
+            accelerator_type = match.group('accelerator_type')
+            if accelerator_count is not None:
+                accelerator_count = int(accelerator_count)
+                # This is to revert the accelerator types with spaces back to
+                # the original format.
+                accelerator_type = str(accelerator_type).replace(' ', '_')
+            else:
+                accelerator_count = None
+                accelerator_type = None
+            return cpus, memory, accelerator_count, accelerator_type
+        else:
+            raise ValueError(f'Invalid instance name: {name}')
+
+    @classmethod
+    def from_instance_type(cls, name: str) -> 'SlurmInstanceType':
+        """Returns an instance name object from the given name."""
+        if not cls.is_valid_instance_type(name):
+            raise ValueError(f'Invalid instance name: {name}')
+        cpus, memory, accelerator_count, accelerator_type = \
+            cls._parse_instance_type(name)
+        return cls(cpus=cpus,
+                   memory=memory,
+                   accelerator_count=accelerator_count,
+                   accelerator_type=accelerator_type)
+
+    @classmethod
+    def from_resources(cls,
+                       cpus: float,
+                       memory: float,
+                       accelerator_count: Union[float, int] = 0,
+                       accelerator_type: str = '') -> 'SlurmInstanceType':
+        """Returns an instance name object from the given resources.
+
+        If accelerator_count is not an int, it will be rounded up since GPU
+        requests in Slurm must be int.
+
+        NOTE: Should we take MIG management into account? See
+        https://slurm.schedmd.com/gres.html#MIG_Management.
+        """
+        name = f'{cpus}CPU--{memory}GB'
+        # Round up accelerator_count if it is not an int.
+        accelerator_count = math.ceil(accelerator_count)
+        if accelerator_count > 0:
+            name += f'--{accelerator_type}:{accelerator_count}'
+        return cls(cpus=cpus,
+                   memory=memory,
+                   accelerator_count=accelerator_count,
+                   accelerator_type=accelerator_type)
+
+    def __str__(self):
+        return self.name
+
+    def __repr__(self):
+        return (f'SlurmInstanceType(cpus={self.cpus!r}, '
+                f'memory={self.memory!r}, '
+                f'accelerator_count={self.accelerator_count!r}, '
+                f'accelerator_type={self.accelerator_type!r})')
+
+
+def instance_id(job_id: str, node: str) -> str:
+    """Generates the SkyPilot-defined instance ID for Slurm.
+
+    A (job id, node) pair is unique within a Slurm cluster.
+    """
+    return f'job{job_id}-{node}'
+
+
+def get_cluster_name_from_config(provider_config: Dict[str, Any]) -> str:
+    """Return the cluster name from the provider config.
+
+    The concept of cluster can be mapped to a cloud region.
+    """
+    return provider_config.get('cluster', DEFAULT_CLUSTER_NAME)
+
+
+def get_partition_from_config(provider_config: Dict[str, Any]) -> str:
+    """Return the partition from the provider config.
+
+    The concept of partition can be mapped to a cloud zone.
+    """
+    return provider_config.get('partition', DEFAULT_PARTITION)
+
+
+@annotations.lru_cache(scope='request')
+def get_cluster_default_partition(cluster_name: str) -> str:
+    """Get the default partition for a Slurm cluster.
+
+    Queries the Slurm cluster for the partition marked with an asterisk (*)
+    in sinfo output. Falls back to DEFAULT_PARTITION if the query fails or
+    no default partition is found.
+
+    Args:
+        cluster_name: Name of the Slurm cluster.
+
+    Returns:
+        The default partition name for the cluster.
+    """
+    try:
+        ssh_config = get_slurm_ssh_config()
+        ssh_config_dict = ssh_config.lookup(cluster_name)
+    except Exception as e:
+        raise ValueError(
+            f'Failed to load SSH configuration from {DEFAULT_SLURM_PATH}: '
+            f'{common_utils.format_exception(e)}') from e
+
+    client = slurm.SlurmClient(
+        ssh_config_dict['hostname'],
+        int(ssh_config_dict.get('port', 22)),
+        ssh_config_dict['user'],
+        ssh_config_dict['identityfile'][0],
+        ssh_proxy_command=ssh_config_dict.get('proxycommand', None),
+    )
+
+    default_partition = client.get_default_partition()
+    if default_partition is None:
+        # TODO(kevin): Have a way to specify default partition in
+        # ~/.sky/config.yaml if needed, in case a Slurm cluster
+        # really does not have a default partition.
+        raise ValueError('No default partition found for cluster '
+                         f'{cluster_name}.')
+    return default_partition
+
+
+def get_all_slurm_cluster_names() -> List[str]:
+    """Get all Slurm cluster names available in the environment.
+
+    Returns:
+        List[str]: The list of Slurm cluster names if available,
+        an empty list otherwise.
+    """
+    try:
+        ssh_config = get_slurm_ssh_config()
+    except FileNotFoundError:
+        return []
+    except Exception as e:
+        raise ValueError(
+            f'Failed to load SSH configuration from {DEFAULT_SLURM_PATH}: '
+            f'{common_utils.format_exception(e)}') from e
+
+    cluster_names = []
+    for cluster in ssh_config.get_hostnames():
+        if cluster == '*':
+            continue
+
+        cluster_names.append(cluster)
+
+    return cluster_names
+
+
+def _check_cpu_mem_fits(
+        candidate_instance_type: SlurmInstanceType,
+        node_list: List[slurm.NodeInfo]) -> Tuple[bool, Optional[str]]:
+    """Checks if instance fits on candidate nodes based on CPU and memory.
+
+    We check capacity (not allocatable) because availability can change
+    during scheduling, and we want to let the Slurm scheduler handle that.
+    """
+    # We log max CPU and memory found on the GPU nodes for debugging.
+    max_cpu = 0
+    max_mem_gb = 0.0
+
+    for node_info in node_list:
+        node_cpus = node_info.cpus
+        node_mem_gb = node_info.memory_gb
+
+        if node_cpus > max_cpu:
+            max_cpu = node_cpus
+            max_mem_gb = node_mem_gb
+
+        if (node_cpus >= candidate_instance_type.cpus and
+                node_mem_gb >= candidate_instance_type.memory):
+            return True, None
+
+    return False, (f'Max found: {max_cpu} CPUs, '
+                   f'{common_utils.format_float(max_mem_gb)}G memory')
+
+
+def check_instance_fits(
+        cluster: str,
+        instance_type: str,
+        partition: Optional[str] = None) -> Tuple[bool, Optional[str]]:
+    """Check if the given instance type fits in the given cluster/partition.
+
+    Args:
+        cluster: Name of the Slurm cluster.
+        instance_type: The instance type to check.
+        partition: Optional partition name. If None, checks all partitions.
+
+    Returns:
+        Tuple of (fits, reason) where fits is True if available.
+    """
+    # Get Slurm node list in the given cluster (region).
+    try:
+        ssh_config = get_slurm_ssh_config()
+    except FileNotFoundError:
+        return (False, f'Could not query Slurm cluster {cluster} '
+                f'because the Slurm configuration file '
+                f'{DEFAULT_SLURM_PATH} does not exist.')
+    except Exception as e:  # pylint: disable=broad-except
+        return (False, f'Could not query Slurm cluster {cluster} '
+                f'because Slurm SSH configuration at {DEFAULT_SLURM_PATH} '
+                f'could not be loaded: {common_utils.format_exception(e)}.')
+    ssh_config_dict = ssh_config.lookup(cluster)
+
+    client = slurm.SlurmClient(
+        ssh_config_dict['hostname'],
+        int(ssh_config_dict.get('port', 22)),
+        ssh_config_dict['user'],
+        ssh_config_dict['identityfile'][0],
+        ssh_proxy_command=ssh_config_dict.get('proxycommand', None),
+    )
+
+    nodes = client.info_nodes()
+    default_partition = get_cluster_default_partition(cluster)
+
+    def is_default_partition(node_partition: str) -> bool:
+        # info_nodes does not strip the '*' from the default partition name.
+        # But non-default partition names can also end with '*',
+        # so we need to check whether the partition name without the '*'
+        # is the same as the default partition name.
+        return (node_partition.endswith('*') and
+                node_partition[:-1] == default_partition)
+
+    partition_suffix = ''
+    if partition is not None:
+        filtered = []
+        for node_info in nodes:
+            node_partition = node_info.partition
+            if is_default_partition(node_partition):
+                # Strip '*' from default partition name.
+                node_partition = node_partition[:-1]
+            if node_partition == partition:
+                filtered.append(node_info)
+        nodes = filtered
+        partition_suffix = f' in partition {partition}'
+
+    slurm_instance_type = SlurmInstanceType.from_instance_type(instance_type)
+    acc_count = (slurm_instance_type.accelerator_count
+                 if slurm_instance_type.accelerator_count is not None else 0)
+    acc_type = slurm_instance_type.accelerator_type
+    candidate_nodes = nodes
+    not_fit_reason_prefix = (
+        f'No nodes found with enough '
+        f'CPU (> {slurm_instance_type.cpus} CPUs) and/or '
+        f'memory (> {slurm_instance_type.memory} G){partition_suffix}. ')
+    if acc_type is not None:
+        assert acc_count is not None, (acc_type, acc_count)
+
+        gpu_nodes = []
+        # GRES string format: 'gpu:acc_type:acc_count(optional_extra_info)'
+        # Examples:
+        # - gpu:nvidia_h100_80gb_hbm3:8(S:0-1)
+        # - gpu:a10g:8
+        # - gpu:l4:1
+        gres_pattern = re.compile(r'^gpu:([^:]+):(\d+)')
+        for node_info in nodes:
+            gres_str = node_info.gres
+            # Extract the GPU type and count from the GRES string
+            match = gres_pattern.match(gres_str)
+            if not match:
+                continue
+
+            node_acc_type = match.group(1).lower()
+            node_acc_count = int(match.group(2))
+
+            # TODO(jwj): Handle status check.
+
+            # Check if the node has the requested GPU type and at least the
+            # requested count
+            if (node_acc_type == acc_type.lower() and
+                    node_acc_count >= acc_count):
+                gpu_nodes.append(node_info)
+        if len(gpu_nodes) == 0:
+            return (False,
+                    f'No GPU nodes found with at least {acc_type}:{acc_count} '
+                    f'on the cluster.')
+
+        candidate_nodes = gpu_nodes
+        not_fit_reason_prefix = (
+            f'GPU nodes with {acc_type}{partition_suffix} do not have '
+            f'enough CPU (> {slurm_instance_type.cpus} CPUs) and/or '
+            f'memory (> {slurm_instance_type.memory} G). ')
+
+    # Check if CPU and memory requirements are met on at least one
+    # candidate node.
+    fits, reason = _check_cpu_mem_fits(slurm_instance_type, candidate_nodes)
+    if not fits and reason is not None:
+        reason = not_fit_reason_prefix + reason
+    return fits, reason
+
+
+def _get_slurm_node_info_list(
+        slurm_cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
+    """Gathers detailed information about each node in the Slurm cluster.
+
+    Raises:
+        FileNotFoundError: If the Slurm configuration file does not exist.
+        ValueError: If no Slurm cluster name is found in the Slurm
+            configuration file.
+    """
+    # 1. Get node state and GRES using sinfo
+
+    # can raise FileNotFoundError if config file does not exist.
+    slurm_config = get_slurm_ssh_config()
+    if slurm_cluster_name is None:
+        slurm_cluster_names = get_all_slurm_cluster_names()
+        if slurm_cluster_names:
+            slurm_cluster_name = slurm_cluster_names[0]
+    if slurm_cluster_name is None:
+        raise ValueError(
+            f'No Slurm cluster name found in the {DEFAULT_SLURM_PATH} '
+            f'configuration.')
+    slurm_config_dict = slurm_config.lookup(slurm_cluster_name)
+    logger.debug(f'Slurm config dict: {slurm_config_dict}')
+    slurm_client = slurm.SlurmClient(
+        slurm_config_dict['hostname'],
+        int(slurm_config_dict.get('port', 22)),
+        slurm_config_dict['user'],
+        slurm_config_dict['identityfile'][0],
+        ssh_proxy_command=slurm_config_dict.get('proxycommand', None),
+    )
+    node_infos = slurm_client.info_nodes()
+
+    if not node_infos:
+        logger.warning(
+            f'`sinfo -N` returned no output on cluster {slurm_cluster_name}. '
+            f'No nodes found?')
+        return []
+
+    # 2. Process each node, aggregating partitions per node
+    slurm_nodes_info: Dict[str, Dict[str, Any]] = {}
+    gres_gpu_pattern = re.compile(r'((gpu)(?::([^:]+))?:(\d+))')
+
+    for node_info in node_infos:
+        node_name = node_info.node
+        state = node_info.state
+        gres_str = node_info.gres
+        partition = node_info.partition
+
+        if node_name in slurm_nodes_info:
+            slurm_nodes_info[node_name]['partitions'].append(partition)
+            continue
+
+        # Extract GPU info from GRES
+        gres_match = gres_gpu_pattern.search(gres_str)
+
+        total_gpus = 0
+        gpu_type_from_sinfo = None  # Default to None for CPU-only nodes
+        if gres_match:
+            try:
+                total_gpus = int(gres_match.group(4))
+                if gres_match.group(3):
+                    gpu_type_from_sinfo = gres_match.group(3).upper()
+                # If total_gpus > 0 but no type, default to 'GPU'
+                elif total_gpus > 0:
+                    gpu_type_from_sinfo = 'GPU'
+            except ValueError:
+                logger.warning(
+                    f'Could not parse GPU count from GRES for {node_name}.')
+
+        # Get allocated GPUs via squeue
+        allocated_gpus = 0
+        # TODO(zhwu): move to enum
+        if state in ('alloc', 'mix', 'drain', 'drng', 'drained', 'resv',
+                     'comp'):
+            try:
+                node_jobs = slurm_client.get_node_jobs(node_name)
+                if node_jobs:
+                    job_gres_pattern = re.compile(r'gpu(?::[^:]+)*:(\d+)')
+                    for job_line in node_jobs:
+                        gres_job_match = job_gres_pattern.search(job_line)
+                        if gres_job_match:
+                            allocated_gpus += int(gres_job_match.group(1))
+            except Exception as e:  # pylint: disable=broad-except
+                if state == 'alloc':
+                    # We can infer allocated GPUs only if the node is
+                    # in 'alloc' state.
+                    allocated_gpus = total_gpus
+                else:
+                    # Otherwise, just raise the error.
+                    raise e
+        elif state == 'idle':
+            allocated_gpus = 0
+
+        free_gpus = total_gpus - allocated_gpus if state not in ('down',
+                                                                 'drain',
+                                                                 'drng',
+                                                                 'maint') else 0
+        free_gpus = max(0, free_gpus)
+
+        # Get CPU/Mem info via scontrol
+        vcpu_total = 0
+        mem_gb = 0.0
+        try:
+            node_details = slurm_client.node_details(node_name)
+            vcpu_total = int(node_details.get('CPUTot', '0'))
+            mem_gb = float(node_details.get('RealMemory', '0')) / 1024.0
+        except Exception as e:  # pylint: disable=broad-except
+            logger.warning(
+                f'Failed to get CPU/memory info for {node_name}: {e}')
+
+        slurm_nodes_info[node_name] = {
+            'node_name': node_name,
+            'slurm_cluster_name': slurm_cluster_name,
+            'partitions': [partition],
+            'node_state': state,
+            'gpu_type': gpu_type_from_sinfo,
+            'total_gpus': total_gpus,
+            'free_gpus': free_gpus,
+            'vcpu_count': vcpu_total,
+            'memory_gb': round(mem_gb, 2),
+        }
+
+    for node_info in slurm_nodes_info.values():
+        partitions = node_info.pop('partitions')
+        node_info['partition'] = ','.join(str(p) for p in partitions)
+
+    return list(slurm_nodes_info.values())
+
+
+def slurm_node_info(
+        slurm_cluster_name: Optional[str] = None) -> List[Dict[str, Any]]:
+    """Gets detailed information for each node in the Slurm cluster.
+
+    Returns:
+        List[Dict[str, Any]]: A list of dictionaries, each containing node info.
+    """
+    try:
+        node_list = _get_slurm_node_info_list(
+            slurm_cluster_name=slurm_cluster_name)
+    except (RuntimeError, exceptions.NotSupportedError) as e:
+        logger.debug(f'Could not retrieve Slurm node info: {e}')
+        return []
+    return node_list
+
+
+def is_inside_slurm_job() -> bool:
+    return os.environ.get('SLURM_JOB_ID') is not None
+
+
+def get_partitions(cluster_name: str) -> List[str]:
+    """Get unique partition names available in a Slurm cluster.
+
+    Args:
+        cluster_name: Name of the Slurm cluster.
+
+    Returns:
+        List of unique partition names available in the cluster.
+        The default partition appears first,
+        and the rest are sorted alphabetically.
+    """
+    try:
+        slurm_config = SSHConfig.from_path(
+            os.path.expanduser(DEFAULT_SLURM_PATH))
+        slurm_config_dict = slurm_config.lookup(cluster_name)
+
+        client = slurm.SlurmClient(
+            slurm_config_dict['hostname'],
+            int(slurm_config_dict.get('port', 22)),
+            slurm_config_dict['user'],
+            slurm_config_dict['identityfile'][0],
+            ssh_proxy_command=slurm_config_dict.get('proxycommand', None),
+        )
+
+        partitions_info = client.get_partitions_info()
+        default_partitions = []
+        other_partitions = []
+        for partition in partitions_info:
+            if partition.is_default:
+                default_partitions.append(partition.name)
+            else:
+                other_partitions.append(partition.name)
+        return default_partitions + sorted(other_partitions)
+    except Exception as e:  # pylint: disable=broad-except
+        logger.warning(
+            f'Failed to get partitions for cluster {cluster_name}: {e}')
+        # Fall back to default partition if query fails
+        return [DEFAULT_PARTITION]
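Editorial note, not part of the package diff: the SlurmInstanceType docstring above defines the virtual instance type format "{n}CPU--{k}GB" with an optional "--{type}:{a}" accelerator suffix. A minimal standalone sketch of parsing such names, reusing the same regular expression as the added file (illustrative only; not the package's code path):

import re

# Illustrative sketch: mirrors the SlurmInstanceType name format above,
# "{n}CPU--{k}GB" with an optional "--{type}:{a}" accelerator suffix.
_NAME_PATTERN = re.compile(
    r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB'
    r'(?:--(?P<acc_type>[\w\d-]+):(?P<acc_count>\d+))?$')


def parse_virtual_instance_type(name):
    """Returns (cpus, memory_gb, (acc_type, acc_count) or None)."""
    match = _NAME_PATTERN.match(name)
    if match is None:
        raise ValueError(f'Invalid instance name: {name}')
    accelerators = None
    if match.group('acc_count') is not None:
        accelerators = (match.group('acc_type'), int(match.group('acc_count')))
    return (float(match.group('cpus')), float(match.group('memory')),
            accelerators)


print(parse_virtual_instance_type('4CPU--16GB'))          # (4.0, 16.0, None)
print(parse_virtual_instance_type('4CPU--16GB--V100:1'))  # (4.0, 16.0, ('V100', 1))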
sky/provision/vast/instance.py
CHANGED
@@ -39,14 +39,15 @@ def _filter_instances(cluster_name_on_cloud: str,
 
 def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
     for inst_id, inst in instances.items():
-        if inst['name'].endswith('-head'):
+        if inst.get('name') and inst['name'].endswith('-head'):
             return inst_id
     return None
 
 
-def run_instances(region: str, cluster_name_on_cloud: str,
+def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
                   config: common.ProvisionConfig) -> common.ProvisionRecord:
     """Runs instances for the given cluster."""
+    del cluster_name  # unused
     pending_status = ['CREATED', 'RESTARTING']
 
     created_instance_ids = []
@@ -88,6 +89,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
             resumed_instance_ids=[],
             created_instance_ids=[])
 
+    secure_only = config.provider_config.get('secure_only', False)
     for _ in range(to_start_count):
         node_type = 'head' if head_instance_id is None else 'worker'
         try:
@@ -98,7 +100,9 @@ def run_instances(region: str, cluster_name_on_cloud: str,
                 disk_size=config.node_config['DiskSize'],
                 preemptible=config.node_config['Preemptible'],
                 image_name=config.node_config['ImageId'],
-                ports=config.ports_to_open_on_launch
+                ports=config.ports_to_open_on_launch,
+                secure_only=secure_only,
+            )
         except Exception as e:  # pylint: disable=broad-except
             logger.warning(f'run_instances error: {e}')
             raise
@@ -220,9 +224,10 @@ def query_instances(
     cluster_name_on_cloud: str,
     provider_config: Optional[Dict[str, Any]] = None,
     non_terminated_only: bool = True,
+    retry_if_missing: bool = False,
 ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
     """See sky/provision/__init__.py"""
-    del cluster_name  # unused
+    del cluster_name, retry_if_missing  # unused
     assert provider_config is not None, (cluster_name_on_cloud, provider_config)
     instances = _filter_instances(cluster_name_on_cloud, None)
     # "running", "frozen", "stopped", "unknown", "loading"
sky/provision/vast/utils.py
CHANGED
@@ -34,8 +34,8 @@ def list_instances() -> Dict[str, Dict[str, Any]]:
 
 
 def launch(name: str, instance_type: str, region: str, disk_size: int,
-           image_name: str, ports: Optional[List[int]],
-
+           image_name: str, ports: Optional[List[int]], preemptible: bool,
+           secure_only: bool) -> str:
     """Launches an instance with the given parameters.
 
     Converts the instance_type to the Vast GPU name, finds the specs for the
@@ -87,7 +87,7 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
     gpu_name = instance_type.split('-')[1].replace('_', ' ')
     num_gpus = int(instance_type.split('-')[0].replace('x', ''))
 
-    query =
+    query = [
         'chunked=true',
         'georegion=true',
         f'geolocation="{region[-2:]}"',
@@ -95,13 +95,17 @@ def launch(name: str, instance_type: str, region: str, disk_size: int,
         f'num_gpus={num_gpus}',
         f'gpu_name="{gpu_name}"',
         f'cpu_ram>="{cpu_ram}"',
-    ]
+    ]
+    if secure_only:
+        query.append('datacenter=true')
+    query_str = ' '.join(query)
 
-    instance_list = vast.vast().search_offers(query=
+    instance_list = vast.vast().search_offers(query=query_str)
 
     if isinstance(instance_list, int) or len(instance_list) == 0:
         raise RuntimeError('Failed to create instances, could not find an '
-
+                           'offer that satisfies the requirements '
+                           f'"{query_str}".')
 
     instance_touse = instance_list[0]
 
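Editorial note, not part of the package diff: the change above switches the Vast.ai offer query to a list of filter terms and appends datacenter=true when the new secure_only provider flag is set. A simplified, illustrative sketch of that query assembly (standalone; the function name build_offer_query is hypothetical):

from typing import List


def build_offer_query(num_gpus: int, gpu_name: str, secure_only: bool) -> str:
    # Simplified version of the query assembly in launch() above: filter
    # terms are collected in a list and joined into one query string.
    query: List[str] = [
        'chunked=true',
        f'num_gpus={num_gpus}',
        f'gpu_name="{gpu_name}"',
    ]
    if secure_only:
        # Restrict matching offers to datacenter (secure cloud) machines.
        query.append('datacenter=true')
    return ' '.join(query)


print(build_offer_query(1, 'RTX 4090', secure_only=True))
# chunked=true num_gpus=1 gpu_name="RTX 4090" datacenter=true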