skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429) hide show
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,698 @@
1
+ """A script that generates the Runpod catalog.
2
+
3
+ Usage:
4
+ python fetch_runpod.py [-h] [--output-dir OUTPUT_DIR] [--gpu-ids GPU_IDS]
5
+
6
+ The RUNPOD_API_KEY environment variable must be set with a valid read-access
7
+ RunPod API key.
8
+
9
+ If --gpu-ids is provided, only fetches details for
10
+ the specified GPU IDs (comma-separated). Otherwise, fetches all available GPUs.
11
+ This flag is intended for testing and debugging individual GPU configurations.
12
+ """
13
+
14
+ import argparse
15
+ import json
16
+ import os
17
+ import sys
18
+ import traceback
19
+ from typing import Any, Dict, List, Optional, Union
20
+
21
+ import pandas as pd
22
+ import runpod
23
+ from runpod.api import graphql
24
+
25
# The API currently returns a dynamic number of vCPUs per pod that
# changes frequently (less than 30 mins).
# Therefore we hard code a default number of vCPUs from:
#   1. The previous catalog, if the GPU exists there
#   2. Or if not, the pricing page https://www.runpod.io/pricing
#   3. Otherwise, the minimum of the returned vCPU count from the API
# Default maximum number of GPUs per pod. Individual GPUs carry their own
# 'max_count' in the table below.
DEFAULT_MAX_GPUS = 8

# One row per GPU: (name, vcpus, memory, max_count).
_DEFAULT_GPU_ROWS = (
    ('A100-80GB', 8.0, 117.0, 8),
    ('A100-80GB-SXM', 16.0, 117.0, 8),
    ('A30', 12.0, 39.0, 8),
    ('A40', 9.0, 48.0, 10),
    ('B200', 28.0, 180.0, 8),
    ('H100', 16.0, 176.0, 8),
    ('H100-NVL', 16.0, 94.0, 10),
    ('H100-SXM', 20.0, 125.0, 8),
    ('H200-SXM', 12.0, 188.0, 8),
    ('L4', 8.0, 45.0, 10),
    ('L40', 9.0, 125.0, 10),
    ('L40S', 12.0, 62.0, 8),
    ('MI300X', 24.0, 283.0, 8),
    ('RTX2000-Ada', 6.0, 31.0, 8),
    ('RTX3070', 8.0, 30.0, 8),
    ('RTX3080', 8.0, 14.0, 4),
    ('RTX3080-Ti', 8.0, 18.0, 5),
    ('RTX3090', 4.0, 25.0, 8),
    ('RTX3090-Ti', 8.0, 24.0, 9),
    ('RTX4000-Ada', 8.0, 47.0, 8),
    ('RTX4080', 8.0, 22.0, 5),
    ('RTX4080-SUPER', 12.0, 62.0, 6),
    ('RTX4090', 5.0, 29.0, 8),
    ('RTX5000-Ada', 6.0, 62.0, 8),
    ('RTX5080', 5.0, 30.0, 8),
    ('RTX5090', 6.0, 46.0, 8),
    ('RTX6000-Ada', 10.0, 62.0, 8),
    ('RTXA4000', 6.0, 35.0, 12),
    ('RTXA4500', 7.0, 30.0, 4),
    ('RTXA5000', 3.0, 25.0, 10),
    ('RTXA6000', 8.0, 50.0, 10),
    ('RTXPRO6000', 14.0, 125.0, 9),
    ('RTXPRO6000-MaxQ', 18.0, 215.0, 7),
    ('RTXPRO6000-WK', 12.0, 186.0, 4),
    ('V100-SXM2', 10.0, 62.0, 8),
    ('V100-SXM2-32GB', 20.0, 93.0, 4),
)

# Hard-coded per-GPU defaults, keyed by the formatted GPU name.
DEFAULT_GPU_INFO: Dict[str, Dict[str, Union[int, float]]] = {
    gpu_name: {
        'vcpus': vcpus,
        'memory': memory,
        'max_count': max_count,
    } for gpu_name, vcpus, memory, max_count in _DEFAULT_GPU_ROWS
}
215
+
216
# CPU IDs RunPod currently supports, maintained by hand.
# Each ID is named cpu{generation}{tier}; the generations and tiers below are
# the manually curated part.
# TODO: Investigate if these can be found from the API in an automated way
# currently there is little documentation or API to obtain them.
DEFAULT_CPU_ONLY_IDS = [
    f'cpu{generation}{tier}'
    for generation in (3, 5)
    for tier in ('c', 'g', 'm')
]
221
+
222
# Forced renames kept for backwards compatibility: each key is a generated
# GPU name and each value is the original name it must be reported as.
# RunPod GPU names currently supported are listed here:
# https://docs.runpod.io/references/gpu-types
GPU_NAME_OVERRIDES: Dict[str, str] = {
    'A100-PCIe': 'A100-80GB',
    'A100-SXM': 'A100-80GB-SXM',
    'H100-PCIe': 'H100',
}
231
+
232
# Column names, in order.
# NOTE(review): presumably the columns of the generated catalog — confirm
# against the code that writes the output.
USEFUL_COLUMNS: List[str] = [
    # Instance and accelerator identity
    'InstanceType', 'AcceleratorName', 'AcceleratorCount',
    # Host resources
    'vCPUs', 'MemoryGiB',
    # Placement and pricing
    'Region', 'SpotPrice', 'Price', 'AvailabilityZone',
    # Accelerator details
    'GpuInfo',
]
245
+
246
# Availability zones for each region, keyed by region code.
# TODO: Investigate if these can be found from the API in an automated way
# currently there is little documentation or API to obtain them.
# NOTE(review): 'NO' and 'SE' both list 'EU-SE-1' — confirm this is intended.
REGION_ZONES: Dict[str, List[str]] = {
    'CA': ['CA-MTL-1', 'CA-MTL-2', 'CA-MTL-3'],
    'CZ': ['EU-CZ-1'],
    'IS': ['EUR-IS-1', 'EUR-IS-2', 'EUR-IS-3'],
    'NL': ['EU-NL-1'],
    'NO': ['EU-SE-1'],
    'RO': ['EU-RO-1'],
    'SE': ['EU-SE-1'],
    'US': [
        'US-CA-1', 'US-CA-2',
        'US-DE-1',
        'US-GA-1', 'US-GA-2',
        'US-IL-1',
        'US-KS-1', 'US-KS-2',
        'US-NC-1',
        'US-TX-1', 'US-TX-2', 'US-TX-3', 'US-TX-4',
        'US-WA-1',
    ],
}
274
+
275
+
276
def get_gpu_details(gpu_id: str, gpu_count: int = 1) -> Dict[str, Any]:
    """Get detailed GPU information using GraphQL query.

    This uses a custom graphql query because runpod.get_gpu(id) does not include
    full lowestPrice information.

    Args:
        gpu_id: GPU type id used as the `gpuTypes` input filter.
        gpu_count: GPU count passed to the `lowestPrice` sub-query.

    Returns:
        The first entry of `data.gpuTypes` in the query response.

    Raises:
        RuntimeError: If the response contains an 'errors' key.
        ValueError: If the response has no `data.gpuTypes` entry.
    """
    query = f"""
    query GpuTypes {{
      gpuTypes(input: {{id: "{gpu_id}"}}) {{
        maxGpuCount
        id
        displayName
        manufacturer
        memoryInGb
        cudaCores
        secureCloud
        communityCloud
        securePrice
        communityPrice
        oneMonthPrice
        threeMonthPrice
        oneWeekPrice
        communitySpotPrice
        secureSpotPrice
        lowestPrice(input: {{gpuCount: {gpu_count}}}) {{
          minimumBidPrice
          uninterruptablePrice
          minVcpu
          minMemory
          stockStatus
          compliance
          maxUnreservedGpuCount
          availableGpuCounts
        }}
      }}
    }}
    """

    result = graphql.run_graphql_query(query)

    if 'errors' in result:
        raise RuntimeError(f'GraphQL errors: {result["errors"]}')

    try:
        gpu_query_result = result['data']['gpuTypes'][0]
    except (KeyError, IndexError, TypeError) as e:
        # Missing keys, a null payload, or an empty gpuTypes list all mean
        # the same thing to the caller: no GPU type was returned.
        error_msg = ('No GPU Types found in RunPod query with '
                     f'gpu_id={gpu_id}, gpu_count={gpu_count}')
        raise ValueError(error_msg) from e

    return gpu_query_result
327
+
328
+
329
def query_cpu_id(cpu_id: str) -> List[Dict[str, Any]]:
    """Query the CPU flavor entries for a CPU id using GraphQL.

    Args:
        cpu_id: CPU flavor id used as the `cpuFlavors` input filter.

    Returns:
        The value of `data.cpuFlavors` in the query response. An empty list
        is returned as-is; it does not raise.

    Raises:
        RuntimeError: If the response contains an 'errors' key.
        ValueError: If the response has no `data.cpuFlavors` field.
    """
    query = f"""
    query SecureCpuTypes {{
      cpuFlavors(input: {{id: "{cpu_id}"}}) {{
        id
        groupId
        displayName
        minVcpu
        maxVcpu
        vcpuBurstable
        ramMultiplier
        diskLimitPerVcpu
      }}
    }}"""
    result = graphql.run_graphql_query(query)

    if 'errors' in result:
        raise RuntimeError(f'GraphQL errors: {result["errors"]}')

    try:
        cpu_query_result = result['data']['cpuFlavors']
    except (KeyError, TypeError) as e:
        # Covers a missing key as well as a null 'data' payload.
        error_msg = (f'No CPU Types found in RunPod query with cpu_id={cpu_id}')
        raise ValueError(error_msg) from e

    return cpu_query_result
355
+
356
+
357
def query_cpu_specifics(cpu_id: str,
                        cpu_spec_id: str,
                        data_center_id: str = '') -> List[Dict[str, Any]]:
    """Query stock and price specifics for one CPU instance using GraphQL.

    Args:
        cpu_id: CPU flavor id used as the `cpuFlavors` input filter.
        cpu_spec_id: Value passed as `instanceId` to the `specifics` sub-query.
        data_center_id: Value passed as `dataCenterId` to the `specifics`
            sub-query. Defaults to an empty string.

    Returns:
        The value of `data.cpuFlavors` in the query response. An empty list
        is returned as-is; it does not raise.

    Raises:
        RuntimeError: If the response contains an 'errors' key.
        ValueError: If the response has no `data.cpuFlavors` field.
    """
    query = f"""
    query SecureCpuTypes {{
      cpuFlavors(input: {{id: "{cpu_id}"}}) {{
        id
        groupId
        displayName
        specifics(input: {{instanceId: "{cpu_spec_id}", dataCenterId: "{data_center_id}"}}) {{
          stockStatus
          securePrice
          slsPrice
        }}
      }}
    }}"""
    result = graphql.run_graphql_query(query)

    if 'errors' in result:
        raise RuntimeError(f'GraphQL errors: {result["errors"]}')

    try:
        cpu_query_result = result['data']['cpuFlavors']
    except (KeyError, TypeError) as e:
        # Covers a missing key as well as a null 'data' payload.
        error_msg = ('No CPU Types found in RunPod query with '
                     f'cpu_id={cpu_id} cpu_spec_id={cpu_spec_id}')
        raise ValueError(error_msg) from e

    return cpu_query_result
386
+
387
+
388
def format_price(price: float) -> float:
    """Return ``price`` rounded to two decimal places."""
    decimal_places = 2
    return round(price, decimal_places)
391
+
392
+
393
def format_gpu_name(gpu_type: Dict[str, Any]) -> str:
    """Build the catalog GPU name from a RunPod GPU type record.

    The name is derived from the record's 'displayName'; names listed in
    GPU_NAME_OVERRIDES are then swapped for their override so previously
    published names keep working.
    """
    # Applied in order: 'RTX PRO ' must be collapsed before the generic
    # 'RTX ' rule (skypilot writes RTX3090, not RTX-3090), and only then are
    # the remaining spaces turned into hyphens.
    substitutions = (
        ('RTX PRO ', 'RTXPRO'),
        ('RTX ', 'RTX'),
        (' ', '-'),
    )
    name = gpu_type['displayName']
    for old, new in substitutions:
        name = name.replace(old, new)

    # Fall back to the generated name when no override is registered.
    return GPU_NAME_OVERRIDES.get(name, name)
414
+
415
+
416
def get_gpu_info(base_gpu_name: str, gpu_type: Dict[str, Any],
                 gpu_count: int) -> Optional[Dict[str, Any]]:
    """Extract relevant GPU information from RunPod GPU type data.

    Args:
        base_gpu_name: Formatted GPU name; used as the key into
            DEFAULT_GPU_INFO and in the skip messages.
        gpu_type: RunPod GPU type record. Reads 'displayName',
            'manufacturer', 'memoryInGb' and, when no default is configured,
            'lowestPrice' ('minVcpu' / 'minMemory').
        gpu_count: Number of GPUs in the instance.

    Returns:
        A dict with 'vCPUs' (float), 'MemoryGiB' (float) and 'GpuInfo' (a
        JSON-like string using single quotes), or None when vCPUs or memory
        is not a positive number.
    """
    defaults = DEFAULT_GPU_INFO.get(base_gpu_name, {})
    # 'lowestPrice' can be absent *or* present-but-null; treat both as empty
    # so the fallback lookups below cannot fail on None.
    lowest_price = gpu_type.get('lowestPrice') or {}

    # Use minVcpu & minMemory in the lowestPrice info if defaults not
    # available. Don't use this value by default as it is dynamic and changes
    # often.
    vcpus = defaults.get('vcpus')
    if vcpus is None:
        vcpus = lowest_price.get('minVcpu')
    else:
        vcpus = vcpus * gpu_count

    # This is the (minimum) pod RAM memory (scaled to count)
    memory = defaults.get('memory')
    if memory is None:
        # NOTE(review): this fallback is multiplied by gpu_count in the
        # return value below, whereas the minVcpu fallback is used unscaled
        # -- confirm which of the two matches the API's semantics.
        memory = lowest_price.get('minMemory')

    # This is the VRAM memory per GPU (not scaled to count)
    gpu_memory = gpu_type.get('memoryInGb', 0)

    # Return None if memory or vcpus not valid
    if not isinstance(vcpus, (float, int)) or vcpus <= 0:
        print(f'Skipping GPU {base_gpu_name}:'
              f' vCPUs must be a positive number, not {vcpus}')
        return None
    if not isinstance(memory, (float, int)) or memory <= 0:
        print(f'Skipping GPU {base_gpu_name}:'
              f' Memory must be a positive number, not {memory}')
        return None

    # NOTE(review): the *InMiB fields are filled from 'memoryInGb' without
    # any unit conversion -- confirm the unit expected by catalog consumers.
    gpu_info_dict = {
        'Gpus': [{
            'Name': gpu_type['displayName'],
            'Manufacturer': gpu_type['manufacturer'],
            'Count': gpu_count,
            'MemoryInfo': {
                'SizeInMiB': gpu_memory
            },
        }],
        'TotalGpuMemoryInMiB': gpu_memory * gpu_count,
    }
    gpu_info = json.dumps(gpu_info_dict).replace('"', '\'')

    # Convert the counts, vCPUs, and memory to float
    # for consistency with skypilot's catalog format
    return {
        'vCPUs': float(vcpus),
        'MemoryGiB': float(memory * gpu_count),
        'GpuInfo': gpu_info,
    }
466
+
467
+
468
def get_cpu_instance_configurations(cpu_id: str) -> List[Dict[str, Any]]:
    """Retrieves available CPU instance configurations for a CPU ID.

    This function queries the available vCPU and memory combinations
    for given CPU types over all supported regions and zones.

    Args:
        cpu_id (str): The identifier for the CPU type to query.

    Returns:
        List[Dict]: A list of dictionaries, each representing an instance
        configuration with the following keys:
            - 'InstanceType': Unique identifier for the instance type (str)
            - 'AcceleratorName': Name of accelerator (None for CPU-only)
            - 'AcceleratorCount': Number of accelerators (None for CPU-only)
            - 'vCPUs': Number of virtual CPUs (float).
            - 'SpotPrice': Spot price for the instance (None currently)
            - 'MemoryGiB': Amount of memory (float).
            - 'Price': Secure price for the instance (float).
            - 'Region': Cloud region name (str).
            - 'AvailabilityZone': Availability zone within the region (str).
            - 'GpuInfo': GPU details (None for CPU-only)
    """

    instances = []

    # Get vCPU and memory combinations for this CPU type
    for cpu_info in query_cpu_id(cpu_id):
        if not cpu_info.get('minVcpu') or not cpu_info.get(
                'maxVcpu') or not cpu_info.get('ramMultiplier'):
            print(f'Skipping CPU {cpu_id} due to missing vCPU or memory info')
            continue
        min_vcpu = int(cpu_info['minVcpu'])
        max_vcpu = int(cpu_info['maxVcpu'])
        ram_multiplier = int(cpu_info['ramMultiplier'])

        # Iterate over possible vCPU counts (powers of 2 from 2**1=2 up to
        # 2**8=256 vCPUs) that fall within [min_vcpu, max_vcpu].
        vcpu_counts = [
            2**ii for ii in range(1, 9) if min_vcpu <= 2**ii <= max_vcpu
        ]
        for vcpus in vcpu_counts:
            memory = int(vcpus * ram_multiplier)
            cpu_spec_id = f'{cpu_id}-{vcpus}-{memory}'

            # Iterate over all regions and zones
            for region, zones in REGION_ZONES.items():
                for zone in zones:
                    for cpu_spec_output in query_cpu_specifics(
                            cpu_id, cpu_spec_id, zone):
                        # 'specifics' (or its price) may come back null; a
                        # row without a price is unusable, so skip it instead
                        # of aborting the whole catalog with a TypeError.
                        specifics = cpu_spec_output.get('specifics') or {}
                        secure_price = specifics.get('securePrice')
                        if secure_price is None:
                            print(f'Skipping CPU {cpu_spec_id} in {zone}:'
                                  ' no secure price available')
                            continue
                        instances.append({
                            'InstanceType': cpu_spec_id,
                            'AcceleratorName': None,
                            'AcceleratorCount': None,
                            'vCPUs': float(vcpus),
                            'SpotPrice': None,
                            'MemoryGiB': float(memory),
                            'Price': float(secure_price),
                            'Region': region,
                            'AvailabilityZone': zone,
                            'GpuInfo': None,
                        })

    return instances
530
+
531
+
532
def get_gpu_instance_configurations(gpu_id: str) -> List[Dict[str, Any]]:
    """Build the catalog rows for every GPU count of one RunPod GPU type.

    GPU counts from 1 up to the type's maximum are queried; counts that are
    not offered on secure cloud, or for which get_gpu_info() yields no
    vCPU/memory data, are left out. Every remaining count produces one row
    per (region, zone) pair in REGION_ZONES.

    Args:
        gpu_id (str): The identifier of the GPU type.

    Returns:
        List[Dict]: One dictionary per instance configuration with keys:
            - 'InstanceType': '<count>x_<gpu name>_SECURE'.
            - 'AcceleratorName': Formatted GPU name.
            - 'AcceleratorCount': Number of GPUs (float).
            - 'SpotPrice': Secure spot price scaled by count, or None.
            - 'Price': Secure price scaled by count, or None.
            - 'Region': Cloud region.
            - 'AvailabilityZone': Availability zone within the region.
            - plus every key returned by get_gpu_info().
    """
    single_gpu_details = get_gpu_details(gpu_id, gpu_count=1)
    base_gpu_name = format_gpu_name(single_gpu_details)

    # GPU types without an entry (or without 'max_count') in
    # DEFAULT_GPU_INFO are capped at DEFAULT_MAX_GPUS.
    max_gpu_count = DEFAULT_GPU_INFO.get(base_gpu_name,
                                         {}).get('max_count',
                                                 DEFAULT_MAX_GPUS)

    instances = []
    for gpu_count in range(1, int(max_gpu_count) + 1):
        # The single-GPU query result is reused rather than fetched twice.
        details = (single_gpu_details if gpu_count == 1 else get_gpu_details(
            gpu_id, gpu_count))

        # Community-cloud-only offerings are not part of the catalog.
        if not details['secureCloud']:
            continue

        # Skip this count when vCPU or memory information is unavailable.
        gpu_info = get_gpu_info(base_gpu_name, details, gpu_count)
        if gpu_info is None:
            continue

        raw_spot_price = details['secureSpotPrice']
        raw_base_price = details['securePrice']
        spot_price = (None if raw_spot_price is None else format_price(
            raw_spot_price * gpu_count))
        base_price = (None if raw_base_price is None else format_price(
            raw_base_price * gpu_count))

        instance_type = f'{gpu_count}x_{base_gpu_name}_SECURE'
        instances.extend({
            'InstanceType': instance_type,
            'AcceleratorName': base_gpu_name,
            'AcceleratorCount': float(gpu_count),
            'SpotPrice': spot_price,
            'Price': base_price,
            'Region': region,
            'AvailabilityZone': zone,
            **gpu_info
        } for region, zones in REGION_ZONES.items() for zone in zones)

    return instances
602
+
603
+
604
def fetch_runpod_catalog(no_gpu: bool, no_cpu: bool) -> List[Dict[str, Any]]:
    """Fetch RunPod GPU and CPU instance configurations.

    Args:
        no_gpu: If True, GPU instance types are not fetched.
        no_cpu: If True, CPU-only instance types are not fetched.

    Returns:
        A list of instance configuration dictionaries: the rows produced by
        get_gpu_instance_configurations() for every GPU returned by the
        RunPod API, followed by those of get_cpu_instance_configurations()
        for every ID in DEFAULT_CPU_ONLY_IDS.

    Raises:
        ValueError: If RUNPOD_API_KEY is not set, or the API returns no GPU
            types.
        Exception: Any other error raised while querying; it is reported on
            stderr and re-raised unchanged.
    """
    try:
        # Initialize RunPod client
        runpod.api_key = os.getenv('RUNPOD_API_KEY')
        if not runpod.api_key:
            raise ValueError('RUNPOD_API_KEY environment variable not set')

        instances = []
        if not no_gpu:
            # Get GPU list from API
            gpus = runpod.get_gpus()
            if not gpus:
                raise ValueError('No GPU types returned from RunPod API')

            # Generate instances from GPU ids
            for gpu in gpus:
                instances.extend(get_gpu_instance_configurations(gpu['id']))

        if not no_cpu:
            # Generate instances from CPU ids
            for cpu_id in DEFAULT_CPU_ONLY_IDS:
                instances.extend(get_cpu_instance_configurations(cpu_id))

        return instances

    except Exception as e:
        # Diagnostics go to stderr together with the summary line, keeping
        # stdout free for regular progress output.
        print(traceback.format_exc(), file=sys.stderr)
        print(f'Failed to fetch RunPod catalog: {e}', file=sys.stderr)
        raise
643
+
644
+
645
def save_catalog(instances: List[Dict[str, Any]], output_file: str) -> None:
    """Write the instance rows to ``output_file`` as a CSV catalog.

    The rows are restricted to USEFUL_COLUMNS (in that order) and sorted by
    accelerator name, instance type and availability zone.

    Raises:
        ValueError: If any column of USEFUL_COLUMNS is absent from the rows.
    """
    catalog = pd.DataFrame(instances)

    # Refuse to write a catalog that lacks any of the expected columns.
    missing_columns = set(USEFUL_COLUMNS) - set(catalog.columns)
    if missing_columns:
        raise ValueError(f'Missing required columns: {missing_columns}')

    # Select the columns in their canonical order, then sort the rows so the
    # generated file is stable between runs.
    sort_keys = ['AcceleratorName', 'InstanceType', 'AvailabilityZone']
    catalog = catalog[USEFUL_COLUMNS].sort_values(sort_keys)

    catalog.to_csv(output_file, index=False)
    print(f'RunPod catalog saved to {output_file}')
665
+
666
+
667
def main():
    """Command-line entry point: fetch the RunPod catalog and save it.

    Parses --output-dir, --no-gpu and --no-cpu, creates the output directory
    if needed, fetches the catalog and writes it to '<output-dir>/vms.csv'.
    A ValueError raised along the way is reported on stderr and the process
    exits with status 1.
    """
    arg_parser = argparse.ArgumentParser(
        description='Update RunPod catalog for SkyPilot')
    arg_parser.add_argument('--output-dir',
                            help='Directory to save the catalog files',
                            default='runpod')
    arg_parser.add_argument(
        '--no-gpu',
        action='store_true',
        default=False,
        help='Do not fetch and store catalog for RunPod GPUs')
    arg_parser.add_argument(
        '--no-cpu',
        action='store_true',
        default=False,
        help='Do not fetch and store catalog for RunPod CPUs (serverless)')
    options = arg_parser.parse_args()

    try:
        os.makedirs(options.output_dir, exist_ok=True)
        instances = fetch_runpod_catalog(options.no_gpu, options.no_cpu)
        csv_path = os.path.join(options.output_dir, 'vms.csv')
        save_catalog(instances, csv_path)
    except ValueError as err:
        print(f'Error updating RunPod catalog: {err}', file=sys.stderr)
        sys.exit(1)


# Run the updater only when executed as a script, not on import.
if __name__ == '__main__':
    main()