skypilot-nightly 1.0.0.dev20251009__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of skypilot-nightly might be problematic. Click here for more details.
- sky/__init__.py +6 -2
- sky/adaptors/aws.py +25 -7
- sky/adaptors/coreweave.py +278 -0
- sky/adaptors/kubernetes.py +64 -0
- sky/adaptors/shadeform.py +89 -0
- sky/admin_policy.py +20 -0
- sky/authentication.py +59 -149
- sky/backends/backend_utils.py +104 -63
- sky/backends/cloud_vm_ray_backend.py +84 -39
- sky/catalog/data_fetchers/fetch_runpod.py +698 -0
- sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
- sky/catalog/kubernetes_catalog.py +24 -28
- sky/catalog/runpod_catalog.py +5 -1
- sky/catalog/shadeform_catalog.py +165 -0
- sky/check.py +25 -13
- sky/client/cli/command.py +335 -86
- sky/client/cli/flags.py +4 -2
- sky/client/cli/table_utils.py +17 -9
- sky/client/sdk.py +59 -12
- sky/cloud_stores.py +73 -0
- sky/clouds/__init__.py +2 -0
- sky/clouds/aws.py +71 -16
- sky/clouds/azure.py +12 -5
- sky/clouds/cloud.py +19 -9
- sky/clouds/cudo.py +12 -5
- sky/clouds/do.py +4 -1
- sky/clouds/fluidstack.py +12 -5
- sky/clouds/gcp.py +12 -5
- sky/clouds/hyperbolic.py +12 -5
- sky/clouds/ibm.py +12 -5
- sky/clouds/kubernetes.py +62 -25
- sky/clouds/lambda_cloud.py +12 -5
- sky/clouds/nebius.py +12 -5
- sky/clouds/oci.py +12 -5
- sky/clouds/paperspace.py +4 -1
- sky/clouds/primeintellect.py +4 -1
- sky/clouds/runpod.py +12 -5
- sky/clouds/scp.py +12 -5
- sky/clouds/seeweb.py +4 -1
- sky/clouds/shadeform.py +400 -0
- sky/clouds/ssh.py +4 -2
- sky/clouds/vast.py +12 -5
- sky/clouds/vsphere.py +4 -1
- sky/core.py +12 -11
- sky/dashboard/out/404.html +1 -1
- sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
- sky/dashboard/out/_next/static/chunks/{1871-49141c317f3a9020.js → 1871-74503c8e80fd253b.js} +1 -1
- sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
- sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
- sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
- sky/dashboard/out/_next/static/chunks/{3785.a19328ba41517b8b.js → 3785.ad6adaa2a0fa9768.js} +1 -1
- sky/dashboard/out/_next/static/chunks/{4725.10f7a9a5d3ea8208.js → 4725.a830b5c9e7867c92.js} +1 -1
- sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
- sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
- sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
- sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
- sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
- sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
- sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
- sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/clusters/{[cluster]-477555ab7c0b13d8.js → [cluster]-a37d2063af475a1c.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{clusters-2f61f65487f6d8ff.js → clusters-d44859594e6f8064.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-553b8b5cb65e100b.js → [context]-c0b5935149902e6f.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{infra-910a22500c50596f.js → infra-aed0ea19df7cf961.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
- sky/dashboard/out/_next/static/chunks/pages/jobs/pools/{[pool]-bc979970c247d8f3.js → [pool]-6edeb7d06032adfc.js} +2 -2
- sky/dashboard/out/_next/static/chunks/pages/{jobs-a35a9dc3c5ccd657.js → jobs-479dde13399cf270.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{users-98d2ed979084162a.js → users-5ab3b907622cf0fe.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{volumes-835d14ba94808f79.js → volumes-b84b948ff357c43e.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-e8688c35c06f0ac5.js → [name]-c5a3eeee1c218af1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/pages/{workspaces-69c80d677d3c2949.js → workspaces-22b23febb3e89ce1.js} +1 -1
- sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
- sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
- sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
- sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
- sky/dashboard/out/clusters/[cluster].html +1 -1
- sky/dashboard/out/clusters.html +1 -1
- sky/dashboard/out/config.html +1 -1
- sky/dashboard/out/index.html +1 -1
- sky/dashboard/out/infra/[context].html +1 -1
- sky/dashboard/out/infra.html +1 -1
- sky/dashboard/out/jobs/[job].html +1 -1
- sky/dashboard/out/jobs/pools/[pool].html +1 -1
- sky/dashboard/out/jobs.html +1 -1
- sky/dashboard/out/users.html +1 -1
- sky/dashboard/out/volumes.html +1 -1
- sky/dashboard/out/workspace/new.html +1 -1
- sky/dashboard/out/workspaces/[name].html +1 -1
- sky/dashboard/out/workspaces.html +1 -1
- sky/data/data_utils.py +92 -1
- sky/data/mounting_utils.py +143 -19
- sky/data/storage.py +168 -11
- sky/exceptions.py +13 -1
- sky/execution.py +13 -0
- sky/global_user_state.py +189 -113
- sky/jobs/client/sdk.py +32 -10
- sky/jobs/client/sdk_async.py +9 -3
- sky/jobs/constants.py +3 -1
- sky/jobs/controller.py +164 -192
- sky/jobs/file_content_utils.py +80 -0
- sky/jobs/log_gc.py +201 -0
- sky/jobs/recovery_strategy.py +59 -82
- sky/jobs/scheduler.py +20 -9
- sky/jobs/server/core.py +105 -23
- sky/jobs/server/server.py +40 -28
- sky/jobs/server/utils.py +32 -11
- sky/jobs/state.py +588 -110
- sky/jobs/utils.py +442 -209
- sky/logs/agent.py +1 -1
- sky/metrics/utils.py +45 -6
- sky/optimizer.py +1 -1
- sky/provision/__init__.py +7 -0
- sky/provision/aws/instance.py +2 -1
- sky/provision/azure/instance.py +2 -1
- sky/provision/common.py +2 -0
- sky/provision/cudo/instance.py +2 -1
- sky/provision/do/instance.py +2 -1
- sky/provision/fluidstack/instance.py +4 -3
- sky/provision/gcp/instance.py +2 -1
- sky/provision/hyperbolic/instance.py +2 -1
- sky/provision/instance_setup.py +10 -2
- sky/provision/kubernetes/constants.py +0 -1
- sky/provision/kubernetes/instance.py +222 -89
- sky/provision/kubernetes/network.py +12 -8
- sky/provision/kubernetes/utils.py +114 -53
- sky/provision/kubernetes/volume.py +5 -4
- sky/provision/lambda_cloud/instance.py +2 -1
- sky/provision/nebius/instance.py +2 -1
- sky/provision/oci/instance.py +2 -1
- sky/provision/paperspace/instance.py +2 -1
- sky/provision/provisioner.py +11 -2
- sky/provision/runpod/instance.py +2 -1
- sky/provision/scp/instance.py +2 -1
- sky/provision/seeweb/instance.py +3 -3
- sky/provision/shadeform/__init__.py +11 -0
- sky/provision/shadeform/config.py +12 -0
- sky/provision/shadeform/instance.py +351 -0
- sky/provision/shadeform/shadeform_utils.py +83 -0
- sky/provision/vast/instance.py +2 -1
- sky/provision/vsphere/instance.py +2 -1
- sky/resources.py +1 -1
- sky/schemas/api/responses.py +9 -5
- sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
- sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
- sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
- sky/schemas/generated/jobsv1_pb2.py +52 -52
- sky/schemas/generated/jobsv1_pb2.pyi +4 -2
- sky/schemas/generated/managed_jobsv1_pb2.py +39 -35
- sky/schemas/generated/managed_jobsv1_pb2.pyi +21 -5
- sky/serve/client/impl.py +11 -3
- sky/serve/replica_managers.py +5 -2
- sky/serve/serve_utils.py +9 -2
- sky/serve/server/impl.py +7 -2
- sky/serve/server/server.py +18 -15
- sky/serve/service.py +2 -2
- sky/server/auth/oauth2_proxy.py +2 -5
- sky/server/common.py +31 -28
- sky/server/constants.py +5 -1
- sky/server/daemons.py +27 -19
- sky/server/requests/executor.py +138 -74
- sky/server/requests/payloads.py +9 -1
- sky/server/requests/preconditions.py +13 -10
- sky/server/requests/request_names.py +120 -0
- sky/server/requests/requests.py +485 -153
- sky/server/requests/serializers/decoders.py +26 -13
- sky/server/requests/serializers/encoders.py +56 -11
- sky/server/requests/threads.py +106 -0
- sky/server/rest.py +70 -18
- sky/server/server.py +283 -104
- sky/server/stream_utils.py +233 -59
- sky/server/uvicorn.py +18 -17
- sky/setup_files/alembic.ini +4 -0
- sky/setup_files/dependencies.py +32 -13
- sky/sky_logging.py +0 -2
- sky/skylet/constants.py +30 -7
- sky/skylet/events.py +7 -0
- sky/skylet/log_lib.py +8 -2
- sky/skylet/log_lib.pyi +1 -1
- sky/skylet/services.py +26 -13
- sky/skylet/subprocess_daemon.py +103 -29
- sky/skypilot_config.py +87 -75
- sky/ssh_node_pools/server.py +9 -8
- sky/task.py +67 -54
- sky/templates/kubernetes-ray.yml.j2 +8 -1
- sky/templates/nebius-ray.yml.j2 +1 -0
- sky/templates/shadeform-ray.yml.j2 +72 -0
- sky/templates/websocket_proxy.py +142 -12
- sky/users/permission.py +8 -1
- sky/utils/admin_policy_utils.py +16 -3
- sky/utils/asyncio_utils.py +78 -0
- sky/utils/auth_utils.py +153 -0
- sky/utils/cli_utils/status_utils.py +8 -2
- sky/utils/command_runner.py +11 -0
- sky/utils/common.py +3 -1
- sky/utils/common_utils.py +7 -4
- sky/utils/context.py +57 -51
- sky/utils/context_utils.py +30 -12
- sky/utils/controller_utils.py +35 -8
- sky/utils/db/db_utils.py +37 -10
- sky/utils/db/migration_utils.py +8 -4
- sky/utils/locks.py +24 -6
- sky/utils/resource_checker.py +4 -1
- sky/utils/resources_utils.py +53 -29
- sky/utils/schemas.py +23 -4
- sky/utils/subprocess_utils.py +17 -4
- sky/volumes/server/server.py +7 -6
- sky/workspaces/server.py +13 -12
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/METADATA +306 -55
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/RECORD +215 -195
- sky/dashboard/out/_next/static/chunks/1121-d0782b9251f0fcd3.js +0 -1
- sky/dashboard/out/_next/static/chunks/1141-3b40c39626f99c89.js +0 -11
- sky/dashboard/out/_next/static/chunks/2755.97300e1362fe7c98.js +0 -26
- sky/dashboard/out/_next/static/chunks/3015-8d748834fcc60b46.js +0 -1
- sky/dashboard/out/_next/static/chunks/3294.1fafbf42b3bcebff.js +0 -1
- sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
- sky/dashboard/out/_next/static/chunks/6856-5fdc9b851a18acdb.js +0 -1
- sky/dashboard/out/_next/static/chunks/6990-f6818c84ed8f1c86.js +0 -1
- sky/dashboard/out/_next/static/chunks/8969-66237729cdf9749e.js +0 -1
- sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
- sky/dashboard/out/_next/static/chunks/9360.71e83b2ddc844ec2.js +0 -31
- sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-8f058b0346db2aff.js +0 -16
- sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-4f7079dcab6ed653.js +0 -16
- sky/dashboard/out/_next/static/chunks/webpack-6a5ddd0184bfa22c.js +0 -1
- sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
- sky/dashboard/out/_next/static/hIViZcQBkn0HE8SpaSsUU/_buildManifest.js +0 -1
- /sky/dashboard/out/_next/static/{hIViZcQBkn0HE8SpaSsUU → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
- {skypilot_nightly-1.0.0.dev20251009.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,698 @@
|
|
|
1
|
+
"""A script that generates the Runpod catalog.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
python fetch_runpod.py [-h] [--output-dir OUTPUT_DIR] [--gpu-ids GPU_IDS]
|
|
5
|
+
|
|
6
|
+
The RUNPOD_API_KEY environment variable must be set with a valid read-access
|
|
7
|
+
RunPod API key.
|
|
8
|
+
|
|
9
|
+
If --gpu-ids is provided, only fetches details for
|
|
10
|
+
the specified GPU IDs (comma-separated). Otherwise, fetches all available GPUs.
|
|
11
|
+
This flag is intended for testing and debugging individual GPU configurations.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
import argparse
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
import sys
|
|
18
|
+
import traceback
|
|
19
|
+
from typing import Any, Dict, List, Optional, Union
|
|
20
|
+
|
|
21
|
+
import pandas as pd
|
|
22
|
+
import runpod
|
|
23
|
+
from runpod.api import graphql
|
|
24
|
+
|
|
25
|
+
# The API currently returns a dynamic number of vCPUs per pod that
# changes frequently (less than 30 mins).
# Therefore we hard code a default number of vCPUs from:
# 1. The previous catalog, if the GPU exists there
# 2. Or if not, the pricing page https://www.runpod.io/pricing
# 3. Otherwise, the minimum of the returned vCPU count from the API
# The max count of GPUs per pod is set to 8 apart from A40 at 10
DEFAULT_MAX_GPUS = 8
# Per GPU type: 'vcpus' and 'memory' (pod RAM in GiB) are per-single-GPU
# values that get scaled by the GPU count downstream (see get_gpu_info);
# 'max_count' caps the number of GPUs offered per pod.
DEFAULT_GPU_INFO: Dict[str, Dict[str, Union[int, float]]] = {
    'A100-80GB': {'vcpus': 8.0, 'memory': 117.0, 'max_count': 8},
    'A100-80GB-SXM': {'vcpus': 16.0, 'memory': 117.0, 'max_count': 8},
    'A30': {'vcpus': 12.0, 'memory': 39.0, 'max_count': 8},
    'A40': {'vcpus': 9.0, 'memory': 48.0, 'max_count': 10},
    'B200': {'vcpus': 28.0, 'memory': 180.0, 'max_count': 8},
    'H100': {'vcpus': 16.0, 'memory': 176.0, 'max_count': 8},
    'H100-NVL': {'vcpus': 16.0, 'memory': 94.0, 'max_count': 10},
    'H100-SXM': {'vcpus': 20.0, 'memory': 125.0, 'max_count': 8},
    'H200-SXM': {'vcpus': 12.0, 'memory': 188.0, 'max_count': 8},
    'L4': {'vcpus': 8.0, 'memory': 45.0, 'max_count': 10},
    'L40': {'vcpus': 9.0, 'memory': 125.0, 'max_count': 10},
    'L40S': {'vcpus': 12.0, 'memory': 62.0, 'max_count': 8},
    'MI300X': {'vcpus': 24.0, 'memory': 283.0, 'max_count': 8},
    'RTX2000-Ada': {'vcpus': 6.0, 'memory': 31.0, 'max_count': 8},
    'RTX3070': {'vcpus': 8.0, 'memory': 30.0, 'max_count': 8},
    'RTX3080': {'vcpus': 8.0, 'memory': 14.0, 'max_count': 4},
    'RTX3080-Ti': {'vcpus': 8.0, 'memory': 18.0, 'max_count': 5},
    'RTX3090': {'vcpus': 4.0, 'memory': 25.0, 'max_count': 8},
    'RTX3090-Ti': {'vcpus': 8.0, 'memory': 24.0, 'max_count': 9},
    'RTX4000-Ada': {'vcpus': 8.0, 'memory': 47.0, 'max_count': 8},
    'RTX4080': {'vcpus': 8.0, 'memory': 22.0, 'max_count': 5},
    'RTX4080-SUPER': {'vcpus': 12.0, 'memory': 62.0, 'max_count': 6},
    'RTX4090': {'vcpus': 5.0, 'memory': 29.0, 'max_count': 8},
    'RTX5000-Ada': {'vcpus': 6.0, 'memory': 62.0, 'max_count': 8},
    'RTX5080': {'vcpus': 5.0, 'memory': 30.0, 'max_count': 8},
    'RTX5090': {'vcpus': 6.0, 'memory': 46.0, 'max_count': 8},
    'RTX6000-Ada': {'vcpus': 10.0, 'memory': 62.0, 'max_count': 8},
    'RTXA4000': {'vcpus': 6.0, 'memory': 35.0, 'max_count': 12},
    'RTXA4500': {'vcpus': 7.0, 'memory': 30.0, 'max_count': 4},
    'RTXA5000': {'vcpus': 3.0, 'memory': 25.0, 'max_count': 10},
    'RTXA6000': {'vcpus': 8.0, 'memory': 50.0, 'max_count': 10},
    'RTXPRO6000': {'vcpus': 14.0, 'memory': 125.0, 'max_count': 9},
    'RTXPRO6000-MaxQ': {'vcpus': 18.0, 'memory': 215.0, 'max_count': 7},
    'RTXPRO6000-WK': {'vcpus': 12.0, 'memory': 186.0, 'max_count': 4},
    'V100-SXM2': {'vcpus': 10.0, 'memory': 62.0, 'max_count': 8},
    'V100-SXM2-32GB': {'vcpus': 20.0, 'memory': 93.0, 'max_count': 4}
}

# A manual list of all CPU IDs RunPod currently supports.
# These are named as cpu{generation}{tier}.
# TODO: Investigate if these can be found from the API in an automated way;
# currently there is little documentation or API to obtain them.
DEFAULT_CPU_ONLY_IDS = ['cpu3c', 'cpu3g', 'cpu3m', 'cpu5c', 'cpu5g', 'cpu5m']

# For backwards compatibility, force rename some GPUs:
# maps the generated name to the original catalog name.
# RunPod GPU names currently supported are listed here:
# https://docs.runpod.io/references/gpu-types
GPU_NAME_OVERRIDES = {
    'A100-PCIe': 'A100-80GB',
    'A100-SXM': 'A100-80GB-SXM',
    'H100-PCIe': 'H100',
}

# Columns (and their order) required in the emitted catalog CSV.
USEFUL_COLUMNS = [
    'InstanceType',
    'AcceleratorName',
    'AcceleratorCount',
    'vCPUs',
    'MemoryGiB',
    'Region',
    'SpotPrice',
    'Price',
    'AvailabilityZone',
    'GpuInfo',
]

# Mapping of regions to their availability zones.
# TODO: Investigate if these can be found from the API in an automated way;
# currently there is little documentation or API to obtain them.
# NOTE(review): 'NO' maps to 'EU-SE-1', the same zone as 'SE' — confirm
# this is intentional and not a typo for a Norwegian zone.
REGION_ZONES = {
    'CA': ['CA-MTL-1', 'CA-MTL-2', 'CA-MTL-3'],
    'CZ': ['EU-CZ-1'],
    'IS': ['EUR-IS-1', 'EUR-IS-2', 'EUR-IS-3'],
    'NL': ['EU-NL-1'],
    'NO': ['EU-SE-1'],
    'RO': ['EU-RO-1'],
    'SE': ['EU-SE-1'],
    'US': [
        'US-CA-1',
        'US-CA-2',
        'US-DE-1',
        'US-GA-1',
        'US-GA-2',
        'US-IL-1',
        'US-KS-1',
        'US-KS-2',
        'US-NC-1',
        'US-TX-1',
        'US-TX-2',
        'US-TX-3',
        'US-TX-4',
        'US-WA-1',
    ],
}
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def get_gpu_details(gpu_id: str, gpu_count: int = 1) -> Dict[str, Any]:
    """Get detailed GPU information using GraphQL query.

    This uses a custom graphql query because runpod.get_gpu(id) does not
    include full lowestPrice information.

    Args:
        gpu_id: RunPod GPU type identifier.
        gpu_count: GPU count used for the lowestPrice sub-query.

    Returns:
        The first gpuTypes entry from the API response.

    Raises:
        RuntimeError: If the GraphQL response contains errors.
        ValueError: If no GPU type matches the given id/count.
    """
    query = f"""
    query GpuTypes {{
      gpuTypes(input: {{id: "{gpu_id}"}}) {{
        maxGpuCount
        id
        displayName
        manufacturer
        memoryInGb
        cudaCores
        secureCloud
        communityCloud
        securePrice
        communityPrice
        oneMonthPrice
        threeMonthPrice
        oneWeekPrice
        communitySpotPrice
        secureSpotPrice
        lowestPrice(input: {{gpuCount: {gpu_count}}}) {{
          minimumBidPrice
          uninterruptablePrice
          minVcpu
          minMemory
          stockStatus
          compliance
          maxUnreservedGpuCount
          availableGpuCounts
        }}
      }}
    }}
    """

    result = graphql.run_graphql_query(query)

    if 'errors' in result:
        raise RuntimeError(f'GraphQL errors: {result["errors"]}')

    try:
        # Narrowed from a blanket `except Exception`: only a missing/null
        # 'data'/'gpuTypes' field or an empty list can fail here.
        gpu_query_result = result['data']['gpuTypes'][0]
    except (KeyError, IndexError, TypeError) as e:
        error_msg = ('No GPU Types found in RunPod query with '
                     f'gpu_id={gpu_id}, gpu_count={gpu_count}')
        raise ValueError(error_msg) from e

    return gpu_query_result
|
|
327
|
+
|
|
328
|
+
|
|
329
|
+
def query_cpu_id(cpu_id: str) -> List[Dict[str, Any]]:
    """Query RunPod for the CPU flavor entries matching ``cpu_id``.

    Args:
        cpu_id: RunPod CPU flavor identifier (e.g. 'cpu3c').

    Returns:
        The list of cpuFlavors entries from the API response.

    Raises:
        RuntimeError: If the GraphQL response contains errors.
        ValueError: If the response has no cpuFlavors data.
    """
    query = f"""
    query SecureCpuTypes {{
      cpuFlavors(input: {{id: "{cpu_id}"}}) {{
        id
        groupId
        displayName
        minVcpu
        maxVcpu
        vcpuBurstable
        ramMultiplier
        diskLimitPerVcpu
      }}
    }}"""
    result = graphql.run_graphql_query(query)

    if 'errors' in result:
        raise RuntimeError(f'GraphQL errors: {result["errors"]}')

    try:
        # Narrowed from a blanket `except Exception`: only a missing or
        # null 'data'/'cpuFlavors' field can fail here.
        cpu_query_result = result['data']['cpuFlavors']
    except (KeyError, TypeError) as e:
        error_msg = f'No CPU Types found in RunPod query with cpu_id={cpu_id}'
        raise ValueError(error_msg) from e

    return cpu_query_result
|
|
355
|
+
|
|
356
|
+
|
|
357
|
+
def query_cpu_specifics(cpu_id: str,
                        cpu_spec_id: str,
                        data_center_id: str = '') -> List[Dict[str, Any]]:
    """Query price/stock specifics for one CPU instance shape in one zone.

    Args:
        cpu_id: RunPod CPU flavor identifier (e.g. 'cpu3c').
        cpu_spec_id: Instance shape id, '{cpu_id}-{vcpus}-{memory}'.
        data_center_id: Zone to query; empty string queries all.

    Returns:
        The list of cpuFlavors entries (each with a 'specifics' payload).

    Raises:
        RuntimeError: If the GraphQL response contains errors.
        ValueError: If the response has no cpuFlavors data.
    """
    query = f"""
    query SecureCpuTypes {{
      cpuFlavors(input: {{id: "{cpu_id}"}}) {{
        id
        groupId
        displayName
        specifics(input: {{instanceId: "{cpu_spec_id}", dataCenterId: "{data_center_id}"}}) {{
          stockStatus
          securePrice
          slsPrice
        }}
      }}
    }}"""
    result = graphql.run_graphql_query(query)

    if 'errors' in result:
        raise RuntimeError(f'GraphQL errors: {result["errors"]}')

    try:
        # Narrowed from a blanket `except Exception`: only a missing or
        # null 'data'/'cpuFlavors' field can fail here.
        cpu_query_result = result['data']['cpuFlavors']
    except (KeyError, TypeError) as e:
        error_msg = ('No CPU Types found in RunPod query with '
                     f'cpu_id={cpu_id} cpu_spec_id={cpu_spec_id}')
        raise ValueError(error_msg) from e

    return cpu_query_result
|
|
386
|
+
|
|
387
|
+
|
|
388
|
+
def format_price(price: float) -> float:
    """Round a price value to cents (two decimal places)."""
    rounded = round(price, 2)
    return rounded
|
|
391
|
+
|
|
392
|
+
|
|
393
|
+
def format_gpu_name(gpu_type: Dict[str, Any]) -> str:
    """Derive the catalog GPU name from RunPod's display name.

    The name is generated mechanically from ``displayName``; a few results
    are then remapped via GPU_NAME_OVERRIDES for backwards compatibility.
    """
    name = gpu_type['displayName']
    # Collapse 'RTX PRO ' for PRO cards, e.g. 'RTX PRO 6000' -> 'RTXPRO6000'.
    name = name.replace('RTX PRO ', 'RTXPRO')
    # SkyPilot uses no separator after RTX, e.g. 'RTX3090' not 'RTX-3090'.
    name = name.replace('RTX ', 'RTX')
    # Remaining spaces become hyphens.
    name = name.replace(' ', '-')
    # Apply legacy-name overrides where they exist.
    return GPU_NAME_OVERRIDES.get(name, name)
|
|
414
|
+
|
|
415
|
+
|
|
416
|
+
def get_gpu_info(base_gpu_name: str, gpu_type: Dict[str, Any],
                 gpu_count: int) -> Optional[Dict[str, Any]]:
    """Extract relevant GPU information from RunPod GPU type data.

    Args:
        base_gpu_name: Catalog GPU name (a key into DEFAULT_GPU_INFO).
        gpu_type: Raw GPU type data as returned by get_gpu_details().
        gpu_count: Number of GPUs in the pod configuration.

    Returns:
        A dict with 'vCPUs', 'MemoryGiB' (scaled to gpu_count) and a
        'GpuInfo' string, or None when the vCPU/memory info is invalid.
    """
    # Use minVcpu & minMemory in the lowestPrice info if defaults not
    # available. Don't use the API value by default as it is dynamic and
    # changes often.
    vcpus = DEFAULT_GPU_INFO.get(base_gpu_name, {}).get('vcpus')
    if vcpus is None:
        # 'lowestPrice' may be present but null in the GraphQL response;
        # `or {}` guards against calling .get on None.
        vcpus = (gpu_type.get('lowestPrice') or {}).get('minVcpu')
    else:
        vcpus = vcpus * gpu_count

    # This is the (minimum) pod RAM memory (scaled to count below).
    memory = DEFAULT_GPU_INFO.get(base_gpu_name, {}).get('memory')
    if memory is None:
        # NOTE(review): the fallback minMemory is still multiplied by
        # gpu_count below — confirm minMemory is per-GPU, not per-pod.
        memory = (gpu_type.get('lowestPrice') or {}).get('minMemory')

    # This is the VRAM memory per GPU (not scaled to count).
    gpu_memory = gpu_type.get('memoryInGb', 0)

    # Return None if memory or vcpus not valid.
    # Bug fix: the second literal previously lacked the f-prefix, so the
    # actual value was printed as the literal text '{vcpus}'/'{memory}'.
    if not isinstance(vcpus, (float, int)) or vcpus <= 0:
        print(f'Skipping GPU {base_gpu_name}:'
              f' vCPUs must be a positive number, not {vcpus}')
        return None
    if not isinstance(memory, (float, int)) or memory <= 0:
        print(f'Skipping GPU {base_gpu_name}:'
              f' Memory must be a positive number, not {memory}')
        return None

    gpu_info_dict = {
        'Gpus': [{
            'Name': gpu_type['displayName'],
            'Manufacturer': gpu_type['manufacturer'],
            'Count': gpu_count,
            'MemoryInfo': {
                'SizeInMiB': gpu_memory
            },
        }],
        'TotalGpuMemoryInMiB': gpu_memory * gpu_count,
    }
    # Catalog CSV convention: single quotes inside the GpuInfo cell.
    gpu_info = json.dumps(gpu_info_dict).replace('"', '\'')

    # Convert the counts, vCPUs, and memory to float
    # for consistency with skypilot's catalog format.
    return {
        'vCPUs': float(vcpus),
        'MemoryGiB': float(memory * gpu_count),
        'GpuInfo': gpu_info,
    }
|
|
466
|
+
|
|
467
|
+
|
|
468
|
+
def get_cpu_instance_configurations(cpu_id: str) -> List[Dict[str, Any]]:
    """Retrieves available CPU instance configurations for a CPU ID.

    This function queries the available vCPU and memory combinations
    for given CPU types over all supported regions and zones.

    Args:
        cpu_id (str): The identifier for the CPU type to query.

    Returns:
        List[Dict]: A list of dictionaries, each representing an instance
        configuration with the following keys:
            - 'InstanceType': Unique identifier for the instance type (str)
            - 'AcceleratorName': Name of accelerator (None for CPU-only)
            - 'AcceleratorCount': Number of accelerators (None for CPU-only)
            - 'vCPUs': Number of virtual CPUs (float).
            - 'SpotPrice': Spot price for the instance (None currently)
            - 'MemoryGiB': Amount of memory in GiB (float).
            - 'Price': Secure price for the instance (float).
            - 'Region': Cloud region name (str).
            - 'AvailabilityZone': Availability zone within the region (str).
            - 'GpuInfo': Always None for CPU-only instances.
    """

    instances = []

    # Get vCPU and memory combinations for this CPU type.
    for cpu_info in query_cpu_id(cpu_id):
        # Skip flavors missing any field needed to enumerate sizes.
        if not cpu_info.get('minVcpu') or not cpu_info.get(
                'maxVcpu') or not cpu_info.get('ramMultiplier'):
            print(f'Skipping CPU {cpu_id} due to missing vCPU or memory info')
            continue
        min_vcpu = int(cpu_info['minVcpu'])
        max_vcpu = int(cpu_info['maxVcpu'])
        # NOTE(review): int() truncates — confirm ramMultiplier is always
        # integral in the API response.
        ram_multiplier = int(cpu_info['ramMultiplier'])

        # Iterate over possible vCPU counts: powers of 2 from 2**1=2 up to
        # 2**8=256, clamped to this flavor's [minVcpu, maxVcpu] range.
        vcpu_counts = [
            2**ii
            for ii in range(1, 9)
            if 2**ii >= min_vcpu and 2**ii <= max_vcpu
        ]
        for vcpus in vcpu_counts:
            # Pod RAM scales linearly with the vCPU count.
            memory = int(vcpus * ram_multiplier)
            cpu_spec_id = f'{cpu_id}-{vcpus}-{memory}'

            # Iterate over all regions and zones.
            for region, zones in REGION_ZONES.items():
                for zone in zones:
                    for cpu_spec_output in query_cpu_specifics(
                            cpu_id, cpu_spec_id, zone):
                        instances.append({
                            'InstanceType': cpu_spec_id,
                            'AcceleratorName': None,
                            'AcceleratorCount': None,
                            'vCPUs': float(vcpus),
                            'SpotPrice': None,
                            'MemoryGiB': float(memory),
                            'Price': float(
                                cpu_spec_output['specifics']['securePrice']),
                            'Region': region,
                            'AvailabilityZone': zone,
                            'GpuInfo': None,
                        })

    return instances
|
|
530
|
+
|
|
531
|
+
|
|
532
|
+
def get_gpu_instance_configurations(gpu_id: str) -> List[Dict[str, Any]]:
    """Build catalog rows for every secure-cloud pod size of one GPU type.

    Queries RunPod for the GPU type identified by ``gpu_id`` at each
    supported GPU count (community-cloud-only offerings are skipped) and
    expands each configuration across all known regions and zones. When the
    GPU type is not listed in DEFAULT_GPU_INFO, the global default maximum
    GPU count is used.

    Args:
        gpu_id: The RunPod identifier of the GPU type.

    Returns:
        A list of catalog-row dicts: instance type string, accelerator
        name/count, spot and base prices (when available), region and
        availability zone, plus hardware info (vCPUs, memory, GpuInfo).
    """
    rows: List[Dict[str, Any]] = []
    first_details = get_gpu_details(gpu_id, gpu_count=1)
    gpu_name = format_gpu_name(first_details)

    # GPU types absent from DEFAULT_GPU_INFO fall back to the default cap.
    count_limit = DEFAULT_GPU_INFO.get(gpu_name, {}).get(
        'max_count', DEFAULT_MAX_GPUS)

    for count in range(1, int(count_limit) + 1):
        # Reuse the single-GPU lookup already made; re-query larger pods.
        details = (first_details
                   if count == 1 else get_gpu_details(gpu_id, count))

        # Only add secure clouds, skipping community cloud instances.
        if not details['secureCloud']:
            continue

        # Hardware details (memory & vCPU) from the returned data; a None
        # result means this GPU count has no valid memory/vCPU info.
        hw_info = get_gpu_info(gpu_name, details, count)
        if hw_info is None:
            continue

        spot_price = None
        if details['secureSpotPrice'] is not None:
            spot_price = format_price(details['secureSpotPrice'] * count)
        base_price = None
        if details['securePrice'] is not None:
            base_price = format_price(details['securePrice'] * count)

        for region, zones in REGION_ZONES.items():
            for zone in zones:
                rows.append({
                    'InstanceType': f'{count}x_{gpu_name}_SECURE',
                    'AcceleratorName': gpu_name,
                    'AcceleratorCount': float(count),
                    'SpotPrice': spot_price,
                    'Price': base_price,
                    'Region': region,
                    'AvailabilityZone': zone,
                    **hw_info
                })

    return rows
|
|
602
|
+
|
|
603
|
+
|
|
604
|
+
def fetch_runpod_catalog(no_gpu: bool,
                         no_cpu: bool) -> List[Dict[str, Any]]:
    """Fetch and process RunPod GPU/CPU catalog data.

    Args:
        no_gpu: If True, skip fetching GPU instance configurations.
        no_cpu: If True, skip fetching CPU-only (serverless) instance
            configurations.

    Returns:
        A list of catalog-row dicts (one per instance configuration),
        suitable for passing to save_catalog().

    Raises:
        ValueError: If RUNPOD_API_KEY is unset or the API returns no GPUs.
    """
    try:
        # Initialize RunPod client from the environment.
        runpod.api_key = os.getenv('RUNPOD_API_KEY')
        if not runpod.api_key:
            raise ValueError('RUNPOD_API_KEY environment variable not set')

        instances: List[Dict[str, Any]] = []
        if not no_gpu:
            # Get GPU list from API.
            gpus = runpod.get_gpus()
            if not gpus:
                raise ValueError('No GPU types returned from RunPod API')

            # Generate instances from GPU ids.
            instances.extend([
                instance for gpu in gpus
                for instance in get_gpu_instance_configurations(gpu['id'])
            ])

        if not no_cpu:
            # Generate instances from CPU ids.
            instances.extend([
                instance for cpu_id in DEFAULT_CPU_ONLY_IDS
                for instance in get_cpu_instance_configurations(cpu_id)
            ])

        return instances

    except Exception as e:
        # Print the full traceback for debugging, then re-raise so the
        # caller (main) decides how to exit.
        print(traceback.format_exc())
        print(f'Failed to fetch RunPod catalog: {e}', file=sys.stderr)
        raise
|
|
643
|
+
|
|
644
|
+
|
|
645
|
+
def save_catalog(instances: List[Dict[str, Any]], output_file: str) -> None:
    """Write the catalog rows to *output_file* as a CSV file.

    Validates that every required column is present, fixes the column
    order, and sorts rows so repeated runs produce consistent files.
    """
    frame = pd.DataFrame(instances)

    # Fail fast if any required catalog column is absent.
    missing_columns = set(USEFUL_COLUMNS) - set(frame.columns)
    if missing_columns:
        raise ValueError(f'Missing required columns: {missing_columns}')

    # Restrict to the required columns in canonical order, then sort the
    # rows deterministically.
    ordered = frame[USEFUL_COLUMNS].sort_values(
        ['AcceleratorName', 'InstanceType', 'AvailabilityZone'])

    ordered.to_csv(output_file, index=False)
    print(f'RunPod catalog saved to {output_file}')
|
|
665
|
+
|
|
666
|
+
|
|
667
|
+
def main():
    """CLI entry point: parse flags, fetch the catalog, write vms.csv."""
    arg_parser = argparse.ArgumentParser(
        description='Update RunPod catalog for SkyPilot')
    arg_parser.add_argument('--output-dir',
                            default='runpod',
                            help='Directory to save the catalog files')
    arg_parser.add_argument(
        '--no-gpu',
        help='Do not fetch and store catalog for RunPod GPUs',
        default=False,
        action='store_true')
    arg_parser.add_argument(
        '--no-cpu',
        help='Do not fetch and store catalog for RunPod CPUs (serverless)',
        default=False,
        action='store_true')
    opts = arg_parser.parse_args()

    try:
        os.makedirs(opts.output_dir, exist_ok=True)

        rows = fetch_runpod_catalog(opts.no_gpu, opts.no_cpu)

        csv_path = os.path.join(opts.output_dir, 'vms.csv')
        save_catalog(rows, csv_path)

    except ValueError as e:
        # Known, user-actionable failures exit with a short message.
        print(f'Error updating RunPod catalog: {e}', file=sys.stderr)
        sys.exit(1)
|
|
695
|
+
|
|
696
|
+
|
|
697
|
+
# Script entry point: run the catalog update only when executed directly,
# not when imported as a module.
if __name__ == '__main__':
    main()
|