skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,184 @@
|
|
|
1
|
+
"""Seeweb service catalog.
|
|
2
|
+
|
|
3
|
+
This module loads the service catalog file and can be used to
|
|
4
|
+
query instance types and pricing information for Seeweb.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import typing
|
|
8
|
+
from typing import Dict, List, Optional, Tuple
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from sky.catalog import common
|
|
13
|
+
from sky.utils import resources_utils
|
|
14
|
+
from sky.utils import ux_utils
|
|
15
|
+
|
|
16
|
+
if typing.TYPE_CHECKING:
|
|
17
|
+
from sky.clouds import cloud
|
|
18
|
+
|
|
19
|
+
_PULL_FREQUENCY_HOURS = 8
|
|
20
|
+
_df = common.read_catalog('seeweb/vms.csv',
|
|
21
|
+
pull_frequency_hours=_PULL_FREQUENCY_HOURS)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def instance_type_exists(instance_type: str) -> bool:
    """Return True iff *instance_type* appears in the Seeweb catalog."""
    return common.instance_type_exists_impl(_df, instance_type)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
def validate_region_zone(
        region: Optional[str],
        zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    """Validate a (region, zone) pair against the Seeweb catalog.

    Seeweb has no concept of zones, so any non-None zone is rejected.
    """
    if zone is not None:
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Seeweb does not support zones.')
    return common.validate_region_zone_impl('Seeweb', _df, region, zone)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def get_hourly_cost(instance_type: str,
                    use_spot: bool = False,
                    region: Optional[str] = None,
                    zone: Optional[str] = None) -> float:
    """Returns the cost, or the cheapest cost among all zones for spot."""
    # Seeweb has no zone concept; reject any explicit zone up front.
    if zone is not None:
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Seeweb does not support zones.')
    return common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
                                       zone)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def get_vcpus_mem_from_instance_type(
        instance_type: str) -> Tuple[Optional[float], Optional[float]]:
    """Look up (vCPUs, memory) for *instance_type* in the Seeweb catalog."""
    return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def get_default_instance_type(cpus: Optional[str] = None,
                              memory: Optional[str] = None,
                              disk_tier: Optional[
                                  resources_utils.DiskTier] = None,
                              region: Optional[str] = None,
                              zone: Optional[str] = None) -> Optional[str]:
    """Pick a default Seeweb instance type for the given cpu/memory request."""
    del disk_tier  # unused by Seeweb
    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory,
                                                      region, zone)
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def get_accelerators_from_instance_type(
        instance_type: str) -> Optional[Dict[str, int]]:
    """Return ``{accelerator_name: count}`` for *instance_type*.

    Returns None when the instance type is unknown, has no accelerator
    columns populated, or the accelerator count cannot be parsed.
    """
    matching = _df[_df['InstanceType'] == instance_type]
    if matching.empty:
        return None

    # Every row of a given instance type carries the same accelerator
    # info, so inspecting the first row is sufficient.
    first = matching.iloc[0]
    name = first['AcceleratorName']
    count = first['AcceleratorCount']

    # NaN or empty string means this instance type has no accelerators.
    if pd.isna(name) or pd.isna(count) or name == '' or count == '':
        return None

    # Normalize the count: use int when the value is integral,
    # otherwise fall back to float; unparseable values yield None.
    try:
        if int(count) == count:
            count = int(count)
        else:
            count = float(count)
    except (ValueError, TypeError):
        return None

    return {name: count}
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def get_instance_type_for_accelerator(
        acc_name: str,
        acc_count: int,
        cpus: Optional[str] = None,
        memory: Optional[str] = None,
        use_spot: bool = False,
        region: Optional[str] = None,
        zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
    """Return instance types providing the requested accelerator count."""
    # Seeweb has no zone concept; reject any explicit zone up front.
    if zone is not None:
        with ux_utils.print_exception_no_traceback():
            raise ValueError('Seeweb does not support zones.')
    return common.get_instance_type_for_accelerator_impl(df=_df,
                                                         acc_name=acc_name,
                                                         acc_count=acc_count,
                                                         cpus=cpus,
                                                         memory=memory,
                                                         use_spot=use_spot,
                                                         region=region,
                                                         zone=zone)
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def regions() -> List['cloud.Region']:
    """Return every region present in the Seeweb catalog."""
    return common.get_region_zones(_df, use_spot=False)
|
|
131
|
+
|
|
132
|
+
|
|
133
|
+
def get_region_zones_for_instance_type(instance_type: str,
                                       use_spot: bool = False
                                      ) -> List['cloud.Region']:
    """Returns a list of regions for a given instance type.

    The default region it-fr2 (Frosinone) is placed first; the remaining
    regions (Milano it-mi2, Lugano ch-lug1, Bulgaria bg-sof1) keep their
    catalog ordering.
    """
    rows = _df[_df['InstanceType'] == instance_type]
    if rows.empty:
        return []

    # Use common.get_region_zones() like all other providers.
    all_regions = common.get_region_zones(rows, use_spot)

    priority_names = ['it-fr2']

    # Collect the first match for each priority name, in priority order.
    head = []
    for name in priority_names:
        match = next((r for r in all_regions if r.name == name), None)
        if match is not None:
            head.append(match)

    # Everything not in the priority list follows, order preserved.
    tail = [r for r in all_regions if r.name not in priority_names]
    return head + tail
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def list_accelerators(
        gpus_only: bool,
        name_filter: Optional[str],
        region_filter: Optional[str],
        quantity_filter: Optional[int],
        case_sensitive: bool = True,
        all_regions: bool = False,
        require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
    """Lists accelerators offered in Seeweb."""
    # Rows whose region is missing or blank denote unavailable offerings;
    # drop them before delegating to the shared implementation.
    available = _df.dropna(subset=['Region'])
    available = available[available['Region'].str.strip() != '']
    return common.list_accelerators_impl('Seeweb', available, gpus_only,
                                         name_filter, region_filter,
                                         quantity_filter, case_sensitive,
                                         all_regions, require_price)
|
|
@@ -0,0 +1,165 @@
|
|
|
1
|
+
""" Shadeform | Catalog
|
|
2
|
+
|
|
3
|
+
This module loads pricing and instance information from the Shadeform API
|
|
4
|
+
and can be used to query instance types and pricing information for Shadeform.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import typing
|
|
8
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
9
|
+
|
|
10
|
+
import pandas as pd
|
|
11
|
+
|
|
12
|
+
from sky.catalog import common
|
|
13
|
+
|
|
14
|
+
if typing.TYPE_CHECKING:
|
|
15
|
+
from sky.clouds import cloud
|
|
16
|
+
|
|
17
|
+
# We'll use dynamic fetching, so no static CSV file to load
|
|
18
|
+
_df = None
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _get_df():
    """Return the Shadeform catalog DataFrame, loading it lazily.

    The module-level ``_df`` starts as None and is populated on first
    call; subsequent calls return the cached frame. When the static
    catalog file is missing, an empty frame with the expected columns
    is cached instead, so lookups degrade to "no offerings" rather
    than crashing.
    """
    global _df
    if _df is None:
        # For now, we'll fall back to a minimal static catalog
        # In a full implementation, this would call the Shadeform API
        # to dynamically fetch the latest instance types and pricing
        try:
            df = common.read_catalog('shadeform/vms.csv')
        except FileNotFoundError:
            # If no static catalog exists, create an empty one
            # This would be replaced with dynamic API fetching
            # NOTE(review): assumes common.read_catalog raises
            # FileNotFoundError for a missing catalog — confirm.
            _df = pd.DataFrame(columns=[
                'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs',
                'MemoryGiB', 'Price', 'Region', 'GpuInfo', 'SpotPrice'
            ])
        else:
            # Drop rows without an instance type, then normalize the
            # accelerator-name column (stringify + strip whitespace).
            df = df[df['InstanceType'].notna()]
            if 'AcceleratorName' in df.columns:
                df = df[df['AcceleratorName'].notna()]
                df = df.assign(AcceleratorName=df['AcceleratorName'].astype(
                    str).str.strip())
            _df = df.reset_index(drop=True)
    return _df
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
def _is_not_found_error(err: ValueError) -> bool:
|
|
48
|
+
msg = str(err).lower()
|
|
49
|
+
return 'not found' in msg or 'not supported' in msg
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def _call_or_default(func, default):
    """Invoke *func*; map 'not found'-style ValueErrors to *default*.

    Any other ValueError (and every other exception type) propagates.
    """
    try:
        return func()
    except ValueError as err:
        if not _is_not_found_error(err):
            raise
        return default
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def instance_type_exists(instance_type: str) -> bool:
    """Check if an instance type exists."""
    catalog = _get_df()
    return common.instance_type_exists_impl(catalog, instance_type)
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
def validate_region_zone(
        region: Optional[str],
        zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    """Validate region and zone for Shadeform."""
    catalog = _get_df()
    return common.validate_region_zone_impl('shadeform', catalog, region,
                                            zone)
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def get_hourly_cost(instance_type: str,
                    use_spot: bool = False,
                    region: Optional[str] = None,
                    zone: Optional[str] = None) -> float:
    """Returns the cost, or the cheapest cost among all zones for spot."""
    if use_spot:
        # Shadeform does not currently offer spot capacity.
        raise ValueError('Spot instances are not supported on Shadeform')
    catalog = _get_df()
    return common.get_hourly_cost_impl(catalog, instance_type, use_spot,
                                       region, zone)
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def get_vcpus_mem_from_instance_type(
        instance_type: str) -> Tuple[Optional[float], Optional[float]]:
    """Get vCPUs and memory from instance type."""

    def _lookup():
        return common.get_vcpus_mem_from_instance_type_impl(
            _get_df(), instance_type)

    # Unknown instance types resolve to (None, None) instead of raising.
    return _call_or_default(_lookup, (None, None))
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def get_default_instance_type(cpus: Optional[str] = None,
                              memory: Optional[str] = None,
                              disk_tier: Optional[str] = None,
                              region: Optional[str] = None,
                              zone: Optional[str] = None) -> Optional[str]:
    """Get default instance type based on requirements."""
    del disk_tier  # Shadeform doesn't support custom disk tiers yet

    def _pick():
        return common.get_instance_type_for_cpus_mem_impl(
            _get_df(), cpus, memory, region, zone)

    # No matching offering resolves to None instead of raising.
    return _call_or_default(_pick, None)
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def get_accelerators_from_instance_type(
        instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
    """Return the accelerator name/count mapping for ``instance_type``."""

    def _lookup():
        return common.get_accelerators_from_instance_type_impl(
            _get_df(), instance_type)

    # None indicates the instance type carries no accelerators (or the
    # catalog lookup failed).
    return _call_or_default(_lookup, None)
|
|
113
|
+
|
|
114
|
+
|
|
115
|
+
def get_instance_type_for_accelerator(
        acc_name: str,
        acc_count: int,
        cpus: Optional[str] = None,
        memory: Optional[str] = None,
        use_spot: bool = False,
        region: Optional[str] = None,
        zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
    """Return instance types carrying the given accelerator, plus fuzzy hints."""
    if use_spot:
        # Spot is unavailable on Shadeform: no candidates, one explanation.
        return None, ['Spot instances are not supported on Shadeform']

    def _lookup():
        return common.get_instance_type_for_accelerator_impl(
            df=_get_df(),
            acc_name=acc_name,
            acc_count=acc_count,
            cpus=cpus,
            memory=memory,
            use_spot=use_spot,
            region=region,
            zone=zone)

    return _call_or_default(_lookup, (None, []))
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def get_region_zones_for_instance_type(instance_type: str,
                                       use_spot: bool) -> List['cloud.Region']:
    """Return the regions (and zones) offering ``instance_type``.

    Returns an empty list for spot requests, which Shadeform does not
    support.
    """
    if use_spot:
        return []  # No spot support.

    def _lookup():
        # Load and filter the catalog INSIDE the guarded callable, so a
        # failure to load the catalog is also converted into the empty-list
        # default — previously _get_df() ran outside _call_or_default and a
        # load error escaped, unlike every other accessor in this module.
        df = _get_df()
        return common.get_region_zones(df[df['InstanceType'] == instance_type],
                                       use_spot)

    return _call_or_default(_lookup, [])
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
def list_accelerators(
        gpus_only: bool,
        name_filter: Optional[str],
        region_filter: Optional[str],
        quantity_filter: Optional[int],
        case_sensitive: bool = True,
        all_regions: bool = False,
        require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
    """Return every Shadeform instance type that offers GPUs."""
    del require_price  # Unused; Shadeform prices are always in the catalog.
    catalog_df = _get_df()
    return common.list_accelerators_impl('Shadeform', catalog_df, gpus_only,
                                         name_filter, region_filter,
                                         quantity_filter, case_sensitive,
                                         all_regions)
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
"""Slurm Catalog."""
|
|
2
|
+
|
|
3
|
+
import collections
|
|
4
|
+
import re
|
|
5
|
+
from typing import Dict, List, Optional, Set, Tuple
|
|
6
|
+
|
|
7
|
+
from sky import check as sky_check
|
|
8
|
+
from sky import clouds as sky_clouds
|
|
9
|
+
from sky import sky_logging
|
|
10
|
+
from sky.catalog import common
|
|
11
|
+
from sky.clouds import cloud
|
|
12
|
+
from sky.provision.slurm import utils as slurm_utils
|
|
13
|
+
from sky.utils import resources_utils
|
|
14
|
+
|
|
15
|
+
logger = sky_logging.init_logger(__name__)
|
|
16
|
+
|
|
17
|
+
# Fallback vCPU count when a task does not request CPUs explicitly.
_DEFAULT_NUM_VCPUS = 2
# Default memory-to-vCPU ratio: memory (GiB) = vCPUs * ratio when no memory
# request is given.
_DEFAULT_MEMORY_CPU_RATIO = 1
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def instance_type_exists(instance_type: str) -> bool:
    """Whether ``instance_type`` parses as a valid Slurm virtual type."""
    is_valid = slurm_utils.SlurmInstanceType.is_valid_instance_type(
        instance_type)
    return is_valid
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def get_default_instance_type(cpus: Optional[str] = None,
                              memory: Optional[str] = None,
                              disk_tier: Optional[
                                  resources_utils.DiskTier] = None,
                              region: Optional[str] = None,
                              zone: Optional[str] = None) -> Optional[str]:
    """Synthesize the virtual Slurm instance type for the requested resources.

    Slurm has no fixed instance types; resources are provisioned directly
    via --cpus-per-task and --mem, so the "instance type" is just a name
    encoding the requested vCPUs and memory.
    """
    del disk_tier, region, zone  # Not meaningful for Slurm.

    if cpus is None:
        num_cpus = _DEFAULT_NUM_VCPUS
    else:
        # A trailing '+' means "at least this many"; provision exactly that.
        num_cpus = float(cpus.strip('+'))

    if memory is None:
        mem_gb = num_cpus * _DEFAULT_MEMORY_CPU_RATIO
    elif memory.endswith('+'):
        # 'N+' means at least N GiB; provision exactly N.
        mem_gb = float(memory[:-1])
    elif memory.endswith('x'):
        # 'Nx' means N GiB of memory per vCPU.
        mem_gb = float(memory[:-1]) * num_cpus
    else:
        mem_gb = float(memory)

    return slurm_utils.SlurmInstanceType(num_cpus, mem_gb).name
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def list_accelerators(
        gpus_only: bool,
        name_filter: Optional[str],
        region_filter: Optional[str],
        quantity_filter: Optional[int],
        case_sensitive: bool = True,
        all_regions: bool = False,
        require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
    """List accelerators in Slurm clusters.

    Thin wrapper around list_accelerators_realtime() that keeps only the
    GPU-type -> InstanceTypeInfo mapping and drops the capacity counters.
    """
    realtime_info = list_accelerators_realtime(gpus_only, name_filter,
                                               region_filter, quantity_filter,
                                               case_sensitive, all_regions,
                                               require_price)
    qtys_map, _, _ = realtime_info
    return qtys_map
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _raise_no_matching_gpus(filters_applied: List[str]) -> None:
    """Log and raise a ValueError: no usable GPU nodes matched the filters."""
    err_msg = 'No matching GPU nodes found in the Slurm cluster'
    if filters_applied:
        err_msg += f' with filters ({", ".join(filters_applied)})'
    err_msg += '.'
    # Log as error as it indicates no usable resources found.
    logger.error(err_msg)
    raise ValueError(err_msg)


def list_accelerators_realtime(
    gpus_only: bool = True,
    name_filter: Optional[str] = None,
    region_filter: Optional[str] = None,
    quantity_filter: Optional[int] = None,
    case_sensitive: bool = True,
    all_regions: bool = False,
    require_price: bool = False,
) -> Tuple[Dict[str, List[common.InstanceTypeInfo]], Dict[str, int], Dict[str,
                                                                          int]]:
    """Fetches real-time accelerator information from the Slurm cluster.

    Uses the `slurm_node_info` helper function.

    Args:
        gpus_only: If True, only return GPU accelerators.
        name_filter: Regex filter for accelerator names (e.g., 'V100', 'gpu').
        region_filter: Optional filter for Slurm partitions.
        quantity_filter: Minimum number of accelerators required per node.
        case_sensitive: Whether name_filter is case-sensitive.
        all_regions: Unused in Slurm context.
        require_price: Unused in Slurm context.

    Returns:
        A tuple of three dictionaries:
        - qtys_map: Maps GPU type to list of InstanceTypeInfo objects for
          unique counts found per node.
        - total_capacity: Maps GPU type to total count across all nodes.
        - total_available: Maps GPU type to total free count across all nodes.

    Raises:
        ValueError: if no GPU nodes match the given filters.
    """
    del gpus_only, all_regions, require_price  # Unused.

    enabled_clouds = sky_check.get_cached_enabled_clouds_or_refresh(
        cloud.CloudCapability.COMPUTE)
    if not sky_clouds.cloud_in_iterable(sky_clouds.Slurm(), enabled_clouds):
        return {}, {}, {}

    if region_filter is None:
        # Get the first available cluster as default.
        all_clusters = slurm_utils.get_all_slurm_cluster_names()
        if not all_clusters:
            return {}, {}, {}
        slurm_cluster = all_clusters[0]
    else:
        slurm_cluster = region_filter

    partition_filter = slurm_utils.get_cluster_default_partition(slurm_cluster)

    # Fetch per-node GPU information from the Slurm cluster.
    slurm_nodes_info = slurm_utils.slurm_node_info(
        slurm_cluster_name=slurm_cluster)

    if not slurm_nodes_info:
        filters_applied = []
        if name_filter:
            filters_applied.append(f'gpu_name={name_filter!r}')
        if quantity_filter:
            filters_applied.append(f'quantity>={quantity_filter}')
        if region_filter:
            filters_applied.append(f'cluster={region_filter!r}')
        _raise_no_matching_gpus(filters_applied)

    # Aggregate results into the required format.
    qtys_map: Dict[str,
                   Set[common.InstanceTypeInfo]] = collections.defaultdict(set)
    total_capacity: Dict[str, int] = collections.defaultdict(int)
    total_available: Dict[str, int] = collections.defaultdict(int)

    # Compile the name filter once, outside the per-node loop (the pattern
    # and flags are loop-invariant).
    name_pattern = None
    if name_filter:
        regex_flags = 0 if case_sensitive else re.IGNORECASE
        name_pattern = re.compile(name_filter, flags=regex_flags)

    for node_info in slurm_nodes_info:
        gpu_type = node_info['gpu_type']
        node_total_gpus = node_info['total_gpus']
        node_free_gpus = node_info['free_gpus']
        partition = node_info['partition']

        # Apply name filter to the determined GPU type.
        if name_pattern is not None and not name_pattern.match(gpu_type):
            continue

        # Apply quantity filter (total GPUs on node must meet this).
        if quantity_filter and node_total_gpus < quantity_filter:
            continue

        # Apply partition filter if specified
        # TODO(zhwu): when a node is in multiple partitions, the partition
        # mapping from node to partition does not work.
        # if partition_filter and partition != partition_filter:
        #     continue

        if node_total_gpus > 0:
            # Offer powers of two up to the node size, plus the node size
            # itself when it is not a power of two (e.g. a 12-GPU node
            # yields counts 1, 2, 4, 8, 12) — mirrors the Kubernetes catalog.
            counts = []
            count = 1
            while count <= node_total_gpus:
                counts.append(count)
                count *= 2
            if count // 2 != node_total_gpus:
                counts.append(node_total_gpus)
            for acc_count in counts:
                qtys_map[gpu_type].add(
                    common.InstanceTypeInfo(
                        instance_type=None,  # Slurm doesn't have instance types
                        accelerator_name=gpu_type,
                        accelerator_count=acc_count,
                        cpu_count=node_info['vcpu_count'],
                        memory=node_info['memory_gb'],
                        price=0.0,  # Slurm doesn't have price info
                        region=partition,  # Use partition as region
                        cloud='slurm',  # Specify cloud as 'slurm'
                        device_memory=0.0,  # No GPU memory info from Slurm
                        spot_price=0.0,  # Slurm doesn't have spot pricing
                    ))

        # Map of GPU type -> total count across all matched nodes.
        total_capacity[gpu_type] += node_total_gpus
        # Map of GPU type -> total *free* count across all matched nodes.
        total_available[gpu_type] += node_free_gpus

    # Check if any GPUs were found after applying filters.
    if not total_capacity:
        filters_applied = []
        if name_filter:
            filters_applied.append(f'gpu_name={name_filter!r}')
        if quantity_filter:
            filters_applied.append(f'quantity>={quantity_filter}')
        if partition_filter:
            filters_applied.append(f'partition={partition_filter!r}')
        _raise_no_matching_gpus(filters_applied)

    # Convert sets of InstanceTypeInfo to sorted lists.
    final_qtys_map = {
        gpu: sorted(instances, key=lambda x: x.accelerator_count)
        for gpu, instances in qtys_map.items()
    }

    logger.debug(f'Aggregated Slurm GPU Info: '
                 f'qtys={final_qtys_map}, '
                 f'capacity={dict(total_capacity)}, '
                 f'available={dict(total_available)}')

    return final_qtys_map, dict(total_capacity), dict(total_available)
|
|
237
|
+
|
|
238
|
+
|
|
239
|
+
def validate_region_zone(
    region_name: Optional[str],
    zone_name: Optional[str],
) -> Tuple[Optional[str], Optional[str]]:
    """Slurm performs no region/zone validation; echo the inputs back."""
    return region_name, zone_name
|