skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sky/__init__.py +12 -2
- sky/adaptors/aws.py +27 -22
- sky/adaptors/common.py +25 -2
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/do.py +8 -2
- sky/adaptors/gcp.py +11 -0
- sky/adaptors/ibm.py +5 -2
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/nebius.py +3 -1
- sky/adaptors/primeintellect.py +1 -0
- sky/adaptors/seeweb.py +183 -0
- sky/adaptors/shadeform.py +89 -0
- sky/adaptors/slurm.py +478 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +157 -263
- sky/backends/__init__.py +3 -2
- sky/backends/backend.py +11 -3
- sky/backends/backend_utils.py +630 -185
- sky/backends/cloud_vm_ray_backend.py +1111 -928
- sky/backends/local_docker_backend.py +9 -5
- sky/backends/task_codegen.py +971 -0
- sky/backends/wheel_utils.py +18 -0
- sky/catalog/__init__.py +8 -3
- sky/catalog/aws_catalog.py +4 -0
- sky/catalog/common.py +19 -1
- sky/catalog/data_fetchers/fetch_aws.py +102 -80
- sky/catalog/data_fetchers/fetch_gcp.py +30 -3
- sky/catalog/data_fetchers/fetch_nebius.py +9 -6
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +36 -32
- sky/catalog/primeintellect_catalog.py +95 -0
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/seeweb_catalog.py +184 -0
- sky/catalog/shadeform_catalog.py +165 -0
- sky/catalog/slurm_catalog.py +243 -0
- sky/check.py +87 -46
- sky/client/cli/command.py +1004 -434
- sky/client/cli/flags.py +4 -2
- sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
- sky/client/cli/utils.py +79 -0
- sky/client/common.py +12 -2
- sky/client/sdk.py +188 -65
- sky/client/sdk_async.py +34 -33
- sky/cloud_stores.py +82 -3
- sky/clouds/__init__.py +8 -0
- sky/clouds/aws.py +337 -129
- sky/clouds/azure.py +24 -18
- sky/clouds/cloud.py +47 -13
- sky/clouds/cudo.py +16 -13
- sky/clouds/do.py +9 -7
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +14 -7
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +80 -45
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +23 -9
- sky/clouds/oci.py +19 -12
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +317 -0
- sky/clouds/runpod.py +85 -24
- sky/clouds/scp.py +12 -8
- sky/clouds/seeweb.py +477 -0
- sky/clouds/shadeform.py +400 -0
- sky/clouds/slurm.py +578 -0
- sky/clouds/ssh.py +6 -3
- sky/clouds/utils/scp_utils.py +61 -50
- sky/clouds/vast.py +43 -27
- sky/clouds/vsphere.py +14 -16
- sky/core.py +296 -195
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
- sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
- sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
- sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
- sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
- sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
- sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
- sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
- sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
- sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
- sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
- sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
- sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
- sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
- sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
- sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
- sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/plugins/[...slug].html +1 -0
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +177 -30
- sky/data/storage.py +200 -19
- sky/data/storage_utils.py +10 -45
- sky/exceptions.py +18 -7
- sky/execution.py +74 -31
- sky/global_user_state.py +605 -191
- sky/jobs/__init__.py +2 -0
- sky/jobs/client/sdk.py +101 -4
- sky/jobs/client/sdk_async.py +31 -5
- sky/jobs/constants.py +15 -8
- sky/jobs/controller.py +726 -284
- sky/jobs/file_content_utils.py +128 -0
- sky/jobs/log_gc.py +193 -0
- sky/jobs/recovery_strategy.py +250 -100
- sky/jobs/scheduler.py +271 -173
- sky/jobs/server/core.py +367 -114
- sky/jobs/server/server.py +81 -35
- sky/jobs/server/utils.py +89 -35
- sky/jobs/state.py +1498 -620
- sky/jobs/utils.py +771 -306
- sky/logs/agent.py +40 -5
- sky/logs/aws.py +9 -19
- sky/metrics/utils.py +282 -39
- sky/models.py +2 -0
- sky/optimizer.py +7 -6
- sky/provision/__init__.py +38 -1
- sky/provision/aws/config.py +34 -13
- sky/provision/aws/instance.py +5 -2
- sky/provision/azure/instance.py +5 -3
- sky/provision/common.py +22 -0
- sky/provision/cudo/instance.py +4 -3
- sky/provision/do/instance.py +4 -3
- sky/provision/docker_utils.py +112 -28
- sky/provision/fluidstack/instance.py +6 -5
- sky/provision/gcp/config.py +6 -1
- sky/provision/gcp/instance.py +4 -2
- sky/provision/hyperbolic/instance.py +4 -2
- sky/provision/instance_setup.py +66 -20
- sky/provision/kubernetes/__init__.py +2 -0
- sky/provision/kubernetes/config.py +7 -44
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +609 -213
- sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/network_utils.py +8 -25
- sky/provision/kubernetes/utils.py +422 -422
- sky/provision/kubernetes/volume.py +150 -18
- sky/provision/lambda_cloud/instance.py +16 -13
- sky/provision/nebius/instance.py +6 -2
- sky/provision/nebius/utils.py +103 -86
- sky/provision/oci/instance.py +4 -2
- sky/provision/paperspace/instance.py +4 -3
- sky/provision/primeintellect/__init__.py +10 -0
- sky/provision/primeintellect/config.py +11 -0
- sky/provision/primeintellect/instance.py +454 -0
- sky/provision/primeintellect/utils.py +398 -0
- sky/provision/provisioner.py +45 -15
- sky/provision/runpod/__init__.py +2 -0
- sky/provision/runpod/instance.py +4 -3
- sky/provision/runpod/volume.py +69 -13
- sky/provision/scp/instance.py +307 -130
- sky/provision/seeweb/__init__.py +11 -0
- sky/provision/seeweb/config.py +13 -0
- sky/provision/seeweb/instance.py +812 -0
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/slurm/__init__.py +12 -0
- sky/provision/slurm/config.py +13 -0
- sky/provision/slurm/instance.py +572 -0
- sky/provision/slurm/utils.py +583 -0
- sky/provision/vast/instance.py +9 -4
- sky/provision/vast/utils.py +10 -6
- sky/provision/volume.py +164 -0
- sky/provision/vsphere/common/ssl_helper.py +1 -1
- sky/provision/vsphere/common/vapiconnect.py +2 -1
- sky/provision/vsphere/common/vim_utils.py +3 -2
- sky/provision/vsphere/instance.py +8 -6
- sky/provision/vsphere/vsphere_utils.py +8 -1
- sky/resources.py +11 -3
- sky/schemas/api/responses.py +107 -6
- sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
- sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
- sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
- sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
- sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
- sky/schemas/db/serve_state/002_yaml_content.py +34 -0
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
- sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
- sky/schemas/generated/jobsv1_pb2.py +86 -0
- sky/schemas/generated/jobsv1_pb2.pyi +254 -0
- sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
- sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
- sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
- sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
- sky/schemas/generated/servev1_pb2.py +58 -0
- sky/schemas/generated/servev1_pb2.pyi +115 -0
- sky/schemas/generated/servev1_pb2_grpc.py +322 -0
- sky/serve/autoscalers.py +2 -0
- sky/serve/client/impl.py +55 -21
- sky/serve/constants.py +4 -3
- sky/serve/controller.py +17 -11
- sky/serve/load_balancing_policies.py +1 -1
- sky/serve/replica_managers.py +219 -142
- sky/serve/serve_rpc_utils.py +179 -0
- sky/serve/serve_state.py +63 -54
- sky/serve/serve_utils.py +145 -109
- sky/serve/server/core.py +46 -25
- sky/serve/server/impl.py +311 -162
- sky/serve/server/server.py +21 -19
- sky/serve/service.py +84 -68
- sky/serve/service_spec.py +45 -7
- sky/server/auth/loopback.py +38 -0
- sky/server/auth/oauth2_proxy.py +12 -7
- sky/server/common.py +47 -24
- sky/server/config.py +62 -28
- sky/server/constants.py +9 -1
- sky/server/daemons.py +109 -38
- sky/server/metrics.py +76 -96
- sky/server/middleware_utils.py +166 -0
- sky/server/plugins.py +222 -0
- sky/server/requests/executor.py +384 -145
- sky/server/requests/payloads.py +83 -19
- sky/server/requests/preconditions.py +15 -13
- sky/server/requests/request_names.py +123 -0
- sky/server/requests/requests.py +511 -157
- sky/server/requests/serializers/decoders.py +48 -17
- sky/server/requests/serializers/encoders.py +102 -20
- sky/server/requests/serializers/return_value_serializers.py +60 -0
- sky/server/requests/threads.py +117 -0
- sky/server/rest.py +116 -24
- sky/server/server.py +497 -179
- sky/server/server_utils.py +30 -0
- sky/server/stream_utils.py +219 -45
- sky/server/uvicorn.py +30 -19
- sky/setup_files/MANIFEST.in +6 -1
- sky/setup_files/alembic.ini +8 -0
- sky/setup_files/dependencies.py +64 -19
- sky/setup_files/setup.py +44 -44
- sky/sky_logging.py +13 -5
- sky/skylet/attempt_skylet.py +116 -24
- sky/skylet/configs.py +3 -1
- sky/skylet/constants.py +139 -29
- sky/skylet/events.py +74 -14
- sky/skylet/executor/__init__.py +1 -0
- sky/skylet/executor/slurm.py +189 -0
- sky/skylet/job_lib.py +143 -105
- sky/skylet/log_lib.py +252 -8
- sky/skylet/log_lib.pyi +47 -7
- sky/skylet/providers/ibm/node_provider.py +12 -8
- sky/skylet/providers/ibm/vpc_provider.py +13 -12
- sky/skylet/runtime_utils.py +21 -0
- sky/skylet/services.py +524 -0
- sky/skylet/skylet.py +27 -2
- sky/skylet/subprocess_daemon.py +104 -28
- sky/skypilot_config.py +99 -79
- sky/ssh_node_pools/constants.py +12 -0
- sky/ssh_node_pools/core.py +40 -3
- sky/ssh_node_pools/deploy/__init__.py +4 -0
- sky/ssh_node_pools/deploy/deploy.py +952 -0
- sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
- sky/ssh_node_pools/deploy/utils.py +173 -0
- sky/ssh_node_pools/server.py +20 -21
- sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
- sky/task.py +221 -104
- sky/templates/aws-ray.yml.j2 +1 -0
- sky/templates/azure-ray.yml.j2 +1 -0
- sky/templates/cudo-ray.yml.j2 +1 -0
- sky/templates/do-ray.yml.j2 +1 -0
- sky/templates/fluidstack-ray.yml.j2 +1 -0
- sky/templates/gcp-ray.yml.j2 +1 -0
- sky/templates/hyperbolic-ray.yml.j2 +1 -0
- sky/templates/ibm-ray.yml.j2 +2 -1
- sky/templates/jobs-controller.yaml.j2 +3 -0
- sky/templates/kubernetes-ray.yml.j2 +204 -55
- sky/templates/lambda-ray.yml.j2 +1 -0
- sky/templates/nebius-ray.yml.j2 +3 -0
- sky/templates/oci-ray.yml.j2 +1 -0
- sky/templates/paperspace-ray.yml.j2 +1 -0
- sky/templates/primeintellect-ray.yml.j2 +72 -0
- sky/templates/runpod-ray.yml.j2 +1 -0
- sky/templates/scp-ray.yml.j2 +1 -0
- sky/templates/seeweb-ray.yml.j2 +171 -0
- sky/templates/shadeform-ray.yml.j2 +73 -0
- sky/templates/slurm-ray.yml.j2 +85 -0
- sky/templates/vast-ray.yml.j2 +2 -0
- sky/templates/vsphere-ray.yml.j2 +1 -0
- sky/templates/websocket_proxy.py +188 -43
- sky/usage/usage_lib.py +16 -4
- sky/users/model.conf +1 -1
- sky/users/permission.py +84 -44
- sky/users/rbac.py +31 -3
- sky/utils/accelerator_registry.py +6 -3
- sky/utils/admin_policy_utils.py +18 -5
- sky/utils/annotations.py +128 -6
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/atomic.py +1 -1
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +12 -7
- sky/utils/cluster_utils.py +28 -6
- sky/utils/command_runner.py +283 -30
- sky/utils/command_runner.pyi +63 -7
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +55 -7
- sky/utils/config_utils.py +1 -14
- sky/utils/context.py +127 -40
- sky/utils/context_utils.py +73 -18
- sky/utils/controller_utils.py +229 -70
- sky/utils/db/db_utils.py +95 -18
- sky/utils/db/kv_cache.py +149 -0
- sky/utils/db/migration_utils.py +24 -7
- sky/utils/env_options.py +4 -0
- sky/utils/git.py +559 -1
- sky/utils/kubernetes/create_cluster.sh +15 -30
- sky/utils/kubernetes/delete_cluster.sh +10 -7
- sky/utils/kubernetes/generate_kind_config.py +6 -66
- sky/utils/kubernetes/gpu_labeler.py +13 -3
- sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
- sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
- sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
- sky/utils/kubernetes/rsync_helper.sh +11 -3
- sky/utils/kubernetes/ssh-tunnel.sh +7 -376
- sky/utils/kubernetes_enums.py +7 -15
- sky/utils/lock_events.py +4 -4
- sky/utils/locks.py +128 -31
- sky/utils/log_utils.py +0 -319
- sky/utils/resource_checker.py +13 -10
- sky/utils/resources_utils.py +53 -29
- sky/utils/rich_utils.py +8 -4
- sky/utils/schemas.py +138 -52
- sky/utils/subprocess_utils.py +17 -4
- sky/utils/thread_utils.py +91 -0
- sky/utils/timeline.py +2 -1
- sky/utils/ux_utils.py +35 -1
- sky/utils/volume.py +88 -4
- sky/utils/yaml_utils.py +9 -0
- sky/volumes/client/sdk.py +48 -10
- sky/volumes/server/core.py +59 -22
- sky/volumes/server/server.py +46 -17
- sky/volumes/volume.py +54 -42
- sky/workspaces/core.py +57 -21
- sky/workspaces/server.py +13 -12
- sky_templates/README.md +3 -0
- sky_templates/__init__.py +3 -0
- sky_templates/ray/__init__.py +0 -0
- sky_templates/ray/start_cluster +183 -0
- sky_templates/ray/stop_cluster +75 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
- skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
- skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
- sky/client/cli/git.py +0 -549
- sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
- sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
- sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
- sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
- sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
- sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
- sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
- sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
- sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
- sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
- sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
- sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
- sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
- sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
- sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
- sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
- sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
- sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
- sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
- sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
- sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
- skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
- skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
- /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,329 @@
|
|
|
1
|
+
"""A script that generates the Seeweb catalog.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
python fetch_seeweb.py [-h] [--api-key API_KEY]
|
|
5
|
+
[--api-key-path API_KEY_PATH]
|
|
6
|
+
|
|
7
|
+
If neither --api-key nor --api-key-path are provided, this script will parse
|
|
8
|
+
`~/.seeweb_cloud/seeweb_keys` to look for Seeweb API key.
|
|
9
|
+
"""
|
|
10
|
+
import argparse
|
|
11
|
+
import configparser
|
|
12
|
+
import csv
|
|
13
|
+
import json
|
|
14
|
+
import os
|
|
15
|
+
from typing import Any, Dict, List, Optional
|
|
16
|
+
|
|
17
|
+
from sky.adaptors.seeweb import ecsapi
|
|
18
|
+
|
|
19
|
+
# GPU name mapping from Seeweb to SkyPilot canonical names.
# Keys are the `gpu_label` strings returned by the Seeweb API (see
# normalize_gpu_name); values are the canonical accelerator names used
# in SkyPilot catalogs. Labels not listed here pass through unchanged.
SEEWEB_GPU_NAME_TO_SKYPILOT_GPU_NAME = {
    'H200 141GB': 'H200',
    'RTX A6000 48GB': 'RTXA6000',
    'A100 80GB': 'A100',
    'L4 24GB': 'L4',
    'L40s 48GB': 'L40S',
    'H100 80GB': 'H100',
    'MI300X': 'MI300X',
    'A30': 'A30',
    'RTX 6000 24GB': 'RTX6000',
    # Tenstorrent plans are later filtered out by fetch_seeweb_data.
    'Tenstorrent Grayskull e75': 'GRAYSKULL-E75',
    'Tenstorrent Grayskull e150': 'GRAYSKULL-E150',
}
|
|
33
|
+
|
|
34
|
+
# GPU VRAM mapping in MB, keyed by canonical SkyPilot accelerator name.
# Used by parse_plan_info to fill the per-GPU MemoryInfo in the catalog.
VRAM = {
    # NOTE(review): 48384 MiB is ~47.25 GiB, not exactly 48 GiB — confirm
    # the intended value for the RTX A6000.
    'RTXA6000': 48384,  # 48GB
    'H200': 144384,  # 141GB
    'A100': 81920,  # 80GB
    'L4': 24576,  # 24GB
    'L40S': 49152,  # 48GB
    'H100': 81920,  # 80GB
    'MI300X': 192000,  # 192GB
    'A30': 24576,  # 24GB
    'RTX6000': 24576,  # 24GB
    'GRAYSKULL-E75': 8192,  # 8GB
    'GRAYSKULL-E150': 8192,  # 8GB
}
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def is_tenstorrent_gpu_name(gpu_name: Optional[str]) -> bool:
    """Check whether a GPU name belongs to a Tenstorrent device.

    Matches either the vendor name itself or the GRAYSKULL product
    identifier present in normalized accelerator names.
    """
    if gpu_name:
        normalized = str(gpu_name).upper()
        return any(tok in normalized for tok in ('TENSTORRENT', 'GRAYSKULL'))
    return False
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
def is_mi300x_gpu_name(gpu_name: Optional[str]) -> bool:
    """Tell whether *gpu_name* identifies an AMD MI300X accelerator."""
    # Empty / None names never match; comparison is case-insensitive.
    return bool(gpu_name) and 'MI300X' in str(gpu_name).upper()
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def get_api_key(path: Optional[str] = None) -> str:
    """Return the Seeweb API key.

    Lookup order:
      1. An INI-style config file (default ``~/.seeweb_cloud/seeweb_keys``)
         with an ``api_key`` entry in the ``DEFAULT`` section.
      2. The ``SEEWEB_API_KEY`` environment variable.

    Args:
        path: Optional path to the key file; ``~`` is expanded.

    Returns:
        The API key with surrounding whitespace stripped.

    Raises:
        ValueError: If no key is found in either location.
    """
    # Step 1: Try to get from config file.
    if path is None:
        path = os.path.expanduser('~/.seeweb_cloud/seeweb_keys')
    else:
        path = os.path.expanduser(path)

    try:
        parser = configparser.ConfigParser()
        parser.read(path)
        return parser['DEFAULT']['api_key'].strip()
    except (KeyError, configparser.Error, OSError) as exc:
        # ConfigParser.read() silently ignores missing files (it never
        # raises FileNotFoundError), so a missing file surfaces here as a
        # KeyError. configparser.Error additionally covers malformed files
        # (e.g. a bare key file without a section header), which previously
        # crashed before the env-var fallback could run.
        # Step 2: Try environment variable.
        api_key = os.environ.get('SEEWEB_API_KEY')
        if api_key:
            return api_key.strip()

        # If neither found, raise error.
        raise ValueError(
            f'API key not found in {path} or ENV variable SEEWEB_API_KEY'
        ) from exc
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def normalize_gpu_name(gpu_name: str) -> str:
    """Normalize a Seeweb GPU name to its SkyPilot canonical name.

    Args:
        gpu_name: GPU label as reported by the Seeweb API.

    Returns:
        The canonical SkyPilot name when a mapping exists, an empty string
        for empty input, or the original name (with a warning) otherwise.
    """
    if not gpu_name:
        return ''

    # Map to canonical name if available.
    canonical_name = SEEWEB_GPU_NAME_TO_SKYPILOT_GPU_NAME.get(gpu_name)
    if canonical_name:
        return canonical_name

    # Fall back to the raw name so unknown GPUs still appear in the catalog.
    # (Fixed: the two f-string fragments previously joined without a space,
    # printing "...mapping,using original name".)
    print(f'Warning: GPU name "{gpu_name}" not found in mapping, '
          f'using original name')
    return gpu_name
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def parse_plan_info(plan: Any) -> Dict[str, Any]:
    """Extract normalized plan fields from a Seeweb API plan object.

    Args:
        plan: A plan object from the Seeweb API; it must expose a ``name``
            attribute. Other attributes fall back to safe defaults.

    Returns:
        A dict with the plan name, vCPU count, memory (GB), GPU
        name/count/VRAM and hourly price.

    Raises:
        ValueError: If *plan* does not look like an API plan object.
    """
    # Only the attribute-based API object format is supported.
    if not hasattr(plan, 'name'):
        raise ValueError(f'Unsupported plan format: {type(plan)}')

    plan_name = getattr(plan, 'name', 'unknown')
    vcpus = int(getattr(plan, 'cpu', 0))

    # RAM is reported in MB; convert to GB, tolerating bad values.
    raw_ram = getattr(plan, 'ram', 0)
    try:
        memory_gb = int(raw_ram) / 1024 if raw_ram else 0
    except (ValueError, TypeError):
        memory_gb = 0

    # Hourly price, defaulting to 0.0 on malformed data.
    try:
        price = float(getattr(plan, 'hourly_price', 0.0))
    except (ValueError, TypeError):
        price = 0.0

    # Number of attached GPUs, defaulting to 0 on malformed data.
    try:
        gpu_count = int(getattr(plan, 'gpu', 0))
    except (ValueError, TypeError):
        gpu_count = 0

    gpu_label = getattr(plan, 'gpu_label', None)
    if gpu_label:
        # Preferred source: the explicit label, mapped to a canonical name.
        gpu_name = normalize_gpu_name(gpu_label)
    elif 'GPU' in plan_name:
        # Fall back to the plan name, e.g. ECS1GPU11 -> GPU11.
        parts = plan_name.split('GPU')
        gpu_name = 'GPU' + parts[1] if len(parts) > 1 else 'GPU'
    else:
        gpu_name = None

    # VRAM lookup only makes sense once we have a canonical-ish name.
    gpu_vram_mb = VRAM.get(gpu_name, 0) if gpu_name else 0

    return {
        'plan_name': plan_name,
        'vcpus': vcpus,
        'memory_gb': memory_gb,
        'gpu_name': gpu_name,
        'gpu_count': gpu_count,
        'gpu_vram_mb': gpu_vram_mb,
        'price': price,
    }
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def get_gpu_info(gpu_count: int, gpu_name: str, gpu_vram_mb: int = 0) -> str:
    """Build a SkyPilot-compatible GPU info string for one catalog row.

    Returns an empty string when there is no accelerator; otherwise a
    JSON-like, single-quoted string describing the plan's GPUs.
    """
    if gpu_count == 0 or not gpu_name:
        return ''

    # Pick the manufacturer from well-known substrings; default to NVIDIA.
    upper = str(gpu_name).upper()
    if 'MI300' in upper:
        manufacturer = 'AMD'
    elif 'GRAYSKULL' in upper:
        manufacturer = 'TENSTORRENT'
    else:
        manufacturer = 'NVIDIA'

    total_vram = gpu_vram_mb * gpu_count if gpu_vram_mb else 0
    entry = {
        'Name': gpu_name,
        'Manufacturer': manufacturer,
        'Count': float(gpu_count),
        'MemoryInfo': {
            'SizeInMiB': gpu_vram_mb
        },
    }
    payload = {'Gpus': [entry], 'TotalGpuMemoryInMiB': total_vram}

    # Single quotes keep the field friendly to the CSV layer.
    return json.dumps(payload).replace('"', "'")
|
|
199
|
+
|
|
200
|
+
|
|
201
|
+
def fetch_seeweb_data(api_key: str) -> List[Dict]:
    """Fetch and parse all usable compute plans from the Seeweb API.

    Tenstorrent and MI300X plans are skipped. Each returned dict is the
    output of ``parse_plan_info`` plus a ``regions_available`` entry.

    Raises:
        ImportError: If the ``ecsapi`` client is not installed.
        Exception: If anything goes wrong while talking to the API.
    """
    if ecsapi is None:
        raise ImportError('ecsapi not available')

    try:
        client = ecsapi.Api(token=api_key)

        print('Fetching plans from Seeweb API...')
        api_plans = client.fetch_plans()
        if not api_plans:
            raise ValueError('No plans returned from API')
        print(f'Successfully fetched {len(api_plans)} plans from API')

        parsed_plans = []
        for raw_plan in api_plans:
            try:
                # Parse before any extra API calls so unsupported
                # accelerators can be filtered out cheaply.
                info = parse_plan_info(raw_plan)

                if is_tenstorrent_gpu_name(info.get('gpu_name')):
                    print(f'Skipping Tenstorrent plan {raw_plan.name}')
                    continue
                if is_mi300x_gpu_name(info.get('gpu_name')):
                    print(f'Skipping MI300X plan {raw_plan.name}')
                    continue

                print(f'Fetching regions available for {raw_plan.name}')
                info['regions_available'] = client.fetch_regions_available(
                    raw_plan.name)
                parsed_plans.append(info)
            except Exception as e:  # pylint: disable=broad-except
                # Best effort: a single bad plan must not abort the fetch.
                print(f'Error parsing plan {raw_plan.name}: {e}')
                continue

        print(f'Successfully parsed {len(parsed_plans)} plans')
        return parsed_plans

    except Exception as e:  # pylint: disable=broad-except
        raise Exception(f'Error fetching data from Seeweb API: {e}') from e
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
def create_catalog(api_key: str, output_path: str) -> None:
    """Fetch Seeweb plans and write them to a SkyPilot catalog CSV.

    One row is emitted per (plan, region) pair; plans with no available
    regions get a single row with an empty Region column.
    """
    plans = fetch_seeweb_data(api_key)

    # Create CSV catalog.
    print(f'Writing catalog to {output_path}')
    with open(output_path, mode='w', encoding='utf-8') as f:
        writer = csv.writer(f, delimiter=',', quotechar='"')
        writer.writerow([
            'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs',
            'MemoryGiB', 'Price', 'Region', 'GpuInfo', 'SpotPrice'
        ])

        for plan in plans:
            try:
                has_gpu = plan['gpu_name'] and plan['gpu_count'] > 0
                gpu_info_str = get_gpu_info(
                    plan['gpu_count'], plan['gpu_name'],
                    plan.get('gpu_vram_mb', 0)) if has_gpu else ''

                # Emit one row per region; fall back to a single row with
                # an empty Region when the API reported none (or reported
                # something that is not a list).
                regions = plan['regions_available']
                if not (isinstance(regions, list) and regions):
                    regions = ['']

                accel_count = (plan['gpu_count']
                               if plan['gpu_count'] > 0 else '')
                for region in regions:
                    writer.writerow([
                        plan['plan_name'],  # InstanceType
                        plan['gpu_name'],  # AcceleratorName (cleaned)
                        accel_count,  # AcceleratorCount
                        plan['vcpus'],  # vCPUs
                        plan['memory_gb'],  # MemoryGiB
                        plan['price'],  # Price
                        region,  # Region (single region per row)
                        gpu_info_str,  # GpuInfo
                        ''  # SpotPrice (Seeweb doesn't support spot)
                    ])
            except Exception as e:  # pylint: disable=broad-except
                # Best effort: one malformed plan must not abort the write.
                print(f'Error processing plan {plan["plan_name"]}: {e}')
                continue

    print(f'Seeweb catalog saved to {output_path}')
    print(f'Created {len(plans)} instance types')
|
|
307
|
+
|
|
308
|
+
|
|
309
|
+
def main() -> None:
    """Main function to fetch and write Seeweb platform prices to a CSV file."""
    # Command line: the API key may be given inline or as a path to a file.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--api-key', help='Seeweb API key')
    arg_parser.add_argument('--api-key-path',
                            help='Path to file containing Seeweb API key')
    cli_args = arg_parser.parse_args()

    # Prefer an explicitly provided key; otherwise resolve one via the
    # helper (which consults the given path / default locations).
    api_key = cli_args.api_key if cli_args.api_key else get_api_key(
        cli_args.api_key_path)

    # Write the catalog under ./seeweb, creating the directory if needed.
    os.makedirs('seeweb', exist_ok=True)
    create_catalog(api_key, 'seeweb/vms.csv')
    print('Seeweb Service Catalog saved to seeweb/vms.csv')


if __name__ == '__main__':
    main()
|
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
"""A script that generates the Shadeform catalog.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
python fetch_shadeform.py [-h] [--api-key API_KEY]
|
|
5
|
+
[--api-key-path API_KEY_PATH]
|
|
6
|
+
|
|
7
|
+
If neither --api-key nor --api-key-path are provided, this script will parse
|
|
8
|
+
`~/.shadeform/api_key` to look for Shadeform API key.
|
|
9
|
+
"""
|
|
10
|
+
import argparse
|
|
11
|
+
import csv
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
from typing import Dict
|
|
15
|
+
|
|
16
|
+
import requests
|
|
17
|
+
|
|
18
|
+
ENDPOINT = 'https://api.shadeform.ai/v1/instances/types'
|
|
19
|
+
DEFAULT_SHADEFORM_API_KEY_PATH = os.path.expanduser('~/.shadeform/api_key')
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def parse_gpu_info(gpu_type: str, num_gpus: int, ram_per_gpu: int) -> Dict:
    """Parse GPU information for the catalog.

    Args:
        gpu_type: Catalog GPU name (e.g. 'A100', 'MI300X', 'GAUDI2').
        num_gpus: Number of GPUs attached to the instance.
        ram_per_gpu: Per-GPU memory, stored verbatim under 'SizeInMiB'.
            NOTE(review): the caller passes `vram_per_gpu_in_gb` here, a GB
            value — confirm the intended unit before relying on 'SizeInMiB'.

    Returns:
        A dict in the catalog's GpuInfo structure with a single GPU entry.
    """
    # Everything defaults to NVIDIA except a couple of known non-NVIDIA parts.
    vendor_overrides = {'MI300X': 'AMD', 'GAUDI2': 'Intel'}
    manufacturer = vendor_overrides.get(gpu_type, 'NVIDIA')

    gpu_entry = {
        'Name': gpu_type,
        'Manufacturer': manufacturer,
        'Count': float(num_gpus),
        'MemoryInfo': {
            'SizeInMiB': ram_per_gpu
        },
        'TotalGpuMemoryInMiB': ram_per_gpu * num_gpus,
    }
    return {'Gpus': [gpu_entry]}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def create_catalog(api_key: str, output_path: str) -> None:
    """Create Shadeform catalog by fetching from API.

    Fetches the currently-available instance types from the Shadeform API and
    writes one CSV row per (instance type, available region) pair to
    `output_path`. Only GPU instances with at least one available region are
    emitted.
    """
    # Query only instance types that currently have capacity.
    resp = requests.get(ENDPOINT,
                        headers={'X-API-KEY': api_key},
                        params={'available': 'true'},
                        timeout=30)
    resp.raise_for_status()
    offerings = resp.json().get('instance_types', [])

    with open(output_path, mode='w', encoding='utf-8') as out_file:
        csv_writer = csv.writer(out_file, delimiter=',', quotechar='"')
        csv_writer.writerow([
            'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs',
            'MemoryGiB', 'Price', 'Region', 'GpuInfo', 'SpotPrice'
        ])

        for offering in offerings:
            spec = offering['configuration']

            # Compose the catalog instance type as '<cloud>_<shade type>',
            # with underscores in the shade type normalized to hyphens.
            shade_name = offering['shade_instance_type'].replace('_', '-')
            instance_type = f'{offering["cloud"]}_{shade_name}'
            accel_name = spec['gpu_type'].replace('_', '-')
            accel_count = float(spec['num_gpus'])
            num_vcpus = float(spec['vcpus'])
            mem_gb = int(spec['memory_in_gb'])

            # Append "B" to instance_type and gpu_type if they end with "G"
            # (e.g. '...80G' -> '...80GB').
            if instance_type.endswith('G'):
                instance_type += 'B'
            if accel_name.endswith('G'):
                accel_name += 'B'

            # Replace "Gx" with "GBx" (case sensitive); no-op when absent.
            instance_type = instance_type.replace('Gx', 'GBx')

            # The API reports cents per hour; the catalog stores dollars.
            dollars_per_hour = float(offering['hourly_price']) / 100

            # Serialize GPU info once per offering; single quotes keep the
            # JSON blob from fighting with the CSV quoting.
            gpu_info_field = None
            if accel_count > 0:
                gpu_info_field = json.dumps(
                    parse_gpu_info(accel_name, int(accel_count),
                                   int(spec['vram_per_gpu_in_gb']))).replace(
                                       '"', '\'')

            # One CSV row per region that currently has capacity.
            for slot in offering.get('availability', []):
                if not (slot['available'] and accel_count > 0):
                    continue
                csv_writer.writerow([
                    instance_type,
                    accel_name,
                    accel_count,
                    num_vcpus,
                    mem_gb,
                    dollars_per_hour,
                    slot['region'],
                    gpu_info_field,
                    '',  # No spot pricing info available
                ])
|
|
112
|
+
|
|
113
|
+
|
|
114
|
+
def get_api_key(cmdline_args: argparse.Namespace) -> str:
    """Get Shadeform API key from cmdline or default path.

    Resolution order: --api-key value, then the file named by
    --api-key-path, then ~/.shadeform/api_key.

    Raises:
        FileNotFoundError: if neither flag is given and the default key
            file does not exist.
    """
    api_key = cmdline_args.api_key
    if api_key is None:
        # Pick the key file: the one given on the command line, falling
        # back to the conventional ~/.shadeform/api_key location.
        key_path = cmdline_args.api_key_path
        if key_path is None:
            key_path = DEFAULT_SHADEFORM_API_KEY_PATH
        with open(key_path, mode='r', encoding='utf-8') as key_file:
            api_key = key_file.read().strip()
    assert api_key is not None, (
        f'API key not found. Please provide via --api-key or place in '
        f'{DEFAULT_SHADEFORM_API_KEY_PATH}')
    return api_key
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
if __name__ == '__main__':
    # CLI: accept the key inline or as a path to a key file.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--api-key', help='Shadeform API key.')
    arg_parser.add_argument('--api-key-path',
                            help='path of file containing Shadeform API key.')
    parsed = arg_parser.parse_args()
    # Write the catalog under ./shadeform, creating the directory if needed.
    os.makedirs('shadeform', exist_ok=True)
    create_catalog(get_api_key(parsed), 'shadeform/vms.csv')
    print('Shadeform catalog saved to shadeform/vms.csv')
|
|
@@ -3,6 +3,7 @@
|
|
|
3
3
|
Kubernetes does not require a catalog of instances, but we need an image catalog
|
|
4
4
|
mapping SkyPilot image tags to corresponding container image tags.
|
|
5
5
|
"""
|
|
6
|
+
import collections
|
|
6
7
|
import re
|
|
7
8
|
import typing
|
|
8
9
|
from typing import Dict, List, Optional, Set, Tuple
|
|
@@ -167,12 +168,25 @@ def _list_accelerators(
|
|
|
167
168
|
accelerators_qtys: Set[Tuple[str, int]] = set()
|
|
168
169
|
keys = lf.get_label_keys()
|
|
169
170
|
nodes = kubernetes_utils.get_kubernetes_nodes(context=context)
|
|
170
|
-
|
|
171
|
-
if
|
|
172
|
-
|
|
171
|
+
|
|
172
|
+
# Check if any nodes have accelerators before fetching pods
|
|
173
|
+
has_accelerator_nodes = False
|
|
174
|
+
for node in nodes:
|
|
175
|
+
for key in keys:
|
|
176
|
+
if key in node.metadata.labels:
|
|
177
|
+
has_accelerator_nodes = True
|
|
178
|
+
break
|
|
179
|
+
if has_accelerator_nodes:
|
|
180
|
+
break
|
|
181
|
+
|
|
182
|
+
# Only fetch pods if we have accelerator nodes and realtime is requested
|
|
183
|
+
allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
|
|
184
|
+
error_on_get_allocated_gpu_qty_by_node = False
|
|
185
|
+
if realtime and has_accelerator_nodes:
|
|
186
|
+
# Get the allocated GPU quantity by each node
|
|
173
187
|
try:
|
|
174
|
-
|
|
175
|
-
context=context)
|
|
188
|
+
allocated_qty_by_node = (
|
|
189
|
+
kubernetes_utils.get_allocated_gpu_qty_by_node(context=context))
|
|
176
190
|
except kubernetes.api_exception() as e:
|
|
177
191
|
if e.status == 403:
|
|
178
192
|
logger.warning(
|
|
@@ -180,6 +194,7 @@ def _list_accelerators(
|
|
|
180
194
|
'(forbidden). Please check if your account has '
|
|
181
195
|
'necessary permissions to list pods. Realtime GPU '
|
|
182
196
|
'availability information may be incorrect.')
|
|
197
|
+
error_on_get_allocated_gpu_qty_by_node = True
|
|
183
198
|
else:
|
|
184
199
|
raise
|
|
185
200
|
# Total number of GPUs in the cluster
|
|
@@ -189,9 +204,11 @@ def _list_accelerators(
|
|
|
189
204
|
min_quantity_filter = quantity_filter if quantity_filter else 1
|
|
190
205
|
|
|
191
206
|
for node in nodes:
|
|
207
|
+
# Check if node is ready
|
|
208
|
+
node_is_ready = node.is_ready()
|
|
209
|
+
|
|
192
210
|
for key in keys:
|
|
193
211
|
if key in node.metadata.labels:
|
|
194
|
-
allocated_qty = 0
|
|
195
212
|
accelerator_name = lf.get_accelerator_from_label_value(
|
|
196
213
|
node.metadata.labels.get(key))
|
|
197
214
|
|
|
@@ -246,37 +263,24 @@ def _list_accelerators(
|
|
|
246
263
|
total_accelerators_capacity[
|
|
247
264
|
accelerator_name] += quantized_count
|
|
248
265
|
|
|
249
|
-
if pods is None:
|
|
250
|
-
# If we can't get the pods, we can't get the GPU usage
|
|
251
|
-
total_accelerators_available[accelerator_name] = -1
|
|
252
|
-
continue
|
|
253
|
-
|
|
254
|
-
for pod in pods:
|
|
255
|
-
# Get all the pods running on the node
|
|
256
|
-
if (pod.spec.node_name == node.metadata.name and
|
|
257
|
-
pod.status.phase in ['Running', 'Pending']):
|
|
258
|
-
# Skip pods that should not count against GPU count
|
|
259
|
-
if (kubernetes_utils.
|
|
260
|
-
should_exclude_pod_from_gpu_allocation(pod)):
|
|
261
|
-
logger.debug(
|
|
262
|
-
f'Excluding pod '
|
|
263
|
-
f'{pod.metadata.name} from GPU count '
|
|
264
|
-
f'calculations on node {node.metadata.name}')
|
|
265
|
-
continue
|
|
266
|
-
# Iterate over all the containers in the pod and sum
|
|
267
|
-
# the GPU requests
|
|
268
|
-
for container in pod.spec.containers:
|
|
269
|
-
if container.resources.requests:
|
|
270
|
-
allocated_qty += (
|
|
271
|
-
kubernetes_utils.get_node_accelerator_count(
|
|
272
|
-
context, container.resources.requests))
|
|
273
|
-
|
|
274
|
-
accelerators_available = accelerator_count - allocated_qty
|
|
275
266
|
# Initialize the total_accelerators_available to make sure the
|
|
276
267
|
# key exists in the dictionary.
|
|
277
268
|
total_accelerators_available[accelerator_name] = (
|
|
278
269
|
total_accelerators_available.get(accelerator_name, 0))
|
|
279
270
|
|
|
271
|
+
# Skip availability counting for not-ready nodes
|
|
272
|
+
if not node_is_ready:
|
|
273
|
+
continue
|
|
274
|
+
|
|
275
|
+
if error_on_get_allocated_gpu_qty_by_node:
|
|
276
|
+
# If we can't get the allocated GPU quantity by each node,
|
|
277
|
+
# we can't get the GPU usage.
|
|
278
|
+
total_accelerators_available[accelerator_name] = -1
|
|
279
|
+
continue
|
|
280
|
+
|
|
281
|
+
allocated_qty = allocated_qty_by_node[node.metadata.name]
|
|
282
|
+
accelerators_available = accelerator_count - allocated_qty
|
|
283
|
+
|
|
280
284
|
if accelerators_available >= min_quantity_filter:
|
|
281
285
|
quantized_availability = min_quantity_filter * (
|
|
282
286
|
accelerators_available // min_quantity_filter)
|
|
@@ -0,0 +1,95 @@
|
|
|
1
|
+
"""PrimeIntellect service catalog.
|
|
2
|
+
|
|
3
|
+
This module loads the service catalog file and can be used to
|
|
4
|
+
query instance types and pricing information for PrimeIntellect.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import typing
|
|
8
|
+
from typing import Dict, List, Optional, Tuple, Union
|
|
9
|
+
|
|
10
|
+
from sky.catalog import common
|
|
11
|
+
|
|
12
|
+
if typing.TYPE_CHECKING:
|
|
13
|
+
from sky.clouds import cloud
|
|
14
|
+
|
|
15
|
+
# Catalog dataframe, loaded once at import time. No pull_frequency_hours is
# passed, so refresh cadence is whatever common.read_catalog does by default
# (NOTE(review): other catalogs, e.g. runpod, set an explicit frequency —
# confirm whether PrimeIntellect should too).
_df = common.read_catalog('primeintellect/vms.csv')
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def instance_type_exists(instance_type: str) -> bool:
    """Return whether `instance_type` appears in the PrimeIntellect catalog."""
    return common.instance_type_exists_impl(_df, instance_type)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def validate_region_zone(
        region: Optional[str],
        zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    """Validate the region/zone pair against the PrimeIntellect catalog.

    Delegates to the shared catalog helper with the 'primeintellect' cloud
    name and returns the (region, zone) pair the helper produces.
    """
    return common.validate_region_zone_impl('primeintellect', _df, region, zone)
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def get_hourly_cost(instance_type: str,
                    use_spot: bool = False,
                    region: Optional[str] = None,
                    zone: Optional[str] = None) -> float:
    """Returns the cost, or the cheapest cost among all zones for spot.

    Delegates the price lookup to the shared catalog implementation.
    """
    return common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
                                       zone)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def get_vcpus_mem_from_instance_type(
        instance_type: str) -> Tuple[Optional[float], Optional[float]]:
    """Return the (vCPU count, memory) pair recorded for `instance_type`."""
    return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def get_default_instance_type(cpus: Optional[str] = None,
                              memory: Optional[str] = None,
                              disk_tier: Optional[str] = None,
                              region: Optional[str] = None,
                              zone: Optional[str] = None) -> Optional[str]:
    """Pick a default instance type satisfying the cpus/memory requests."""
    del disk_tier  # no disk tiers
    return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
                                                      zone)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def get_accelerators_from_instance_type(
        instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
    """Return the accelerator name -> count mapping for `instance_type`."""
    return common.get_accelerators_from_instance_type_impl(_df, instance_type)
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def get_instance_type_for_accelerator(
        acc_name: str,
        acc_count: int,
        cpus: Optional[str] = None,
        memory: Optional[str] = None,
        use_spot: bool = False,
        region: Optional[str] = None,
        zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
    """Returns a list of instance types that have the given accelerator.

    The second element of the returned tuple presumably lists near-miss
    candidate names when no exact match exists — TODO confirm against
    common.get_instance_type_for_accelerator_impl.
    """
    return common.get_instance_type_for_accelerator_impl(df=_df,
                                                         acc_name=acc_name,
                                                         acc_count=acc_count,
                                                         cpus=cpus,
                                                         memory=memory,
                                                         use_spot=use_spot,
                                                         region=region,
                                                         zone=zone)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def get_region_zones_for_instance_type(instance_type: str,
                                       use_spot: bool) -> List['cloud.Region']:
    """Return the regions (with zones) where `instance_type` is offered."""
    # Restrict the catalog to the rows for the requested instance type.
    matching_rows = _df[_df['InstanceType'] == instance_type]
    return common.get_region_zones(matching_rows, use_spot)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def list_accelerators(
        gpus_only: bool,
        name_filter: Optional[str],
        region_filter: Optional[str],
        quantity_filter: Optional[int],
        case_sensitive: bool = True,
        all_regions: bool = False,
        require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
    """Returns all instance types in Prime Intellect offering GPUs."""
    # Accepted for interface compatibility with other catalogs but not
    # acted upon here.
    del require_price
    return common.list_accelerators_impl('PrimeIntellect', _df, gpus_only,
                                         name_filter, region_filter,
                                         quantity_filter, case_sensitive,
                                         all_regions)
|
sky/catalog/runpod_catalog.py
CHANGED
|
@@ -12,7 +12,11 @@ from sky.catalog import common
|
|
|
12
12
|
if typing.TYPE_CHECKING:
|
|
13
13
|
from sky.clouds import cloud
|
|
14
14
|
|
|
15
|
-
|
|
15
|
+
# Runpod has no set updated schedule for their catalog. We pull the catalog
|
|
16
|
+
# every 7 hours to make sure we have the latest information.
|
|
17
|
+
_PULL_FREQUENCY_HOURS = 7
|
|
18
|
+
_df = common.read_catalog('runpod/vms.csv',
|
|
19
|
+
pull_frequency_hours=_PULL_FREQUENCY_HOURS)
|
|
16
20
|
|
|
17
21
|
|
|
18
22
|
def instance_type_exists(instance_type: str) -> bool:
|