skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -100,6 +100,13 @@ def _build_sky_wheel() -> pathlib.Path:
100
100
  # modify the commit hash in the file later.
101
101
  # Symlink other files/folders.
102
102
  target.symlink_to(item, target_is_directory=item.is_dir())
103
+
104
+ # Symlink sky_templates directory from repo root
105
+ sky_templates_src = SKY_PACKAGE_PATH.parent / 'sky_templates'
106
+ if sky_templates_src.exists():
107
+ sky_templates_target = tmp_dir / 'sky_templates'
108
+ sky_templates_target.symlink_to(sky_templates_src,
109
+ target_is_directory=True)
103
110
  setup_files_dir = SKY_PACKAGE_PATH / 'setup_files'
104
111
 
105
112
  setup_content = (setup_files_dir / 'setup.py').read_text()
@@ -244,6 +251,17 @@ def build_sky_wheel() -> Tuple[pathlib.Path, str]:
244
251
  # protocol. "compare, update and clone" has to be atomic to avoid
245
252
  # race conditions.
246
253
  last_modification_time = _get_latest_modification_time(SKY_PACKAGE_PATH)
254
+ # Also check sky_templates directory modification time
255
+ sky_templates_path = SKY_PACKAGE_PATH.parent / 'sky_templates'
256
+ if sky_templates_path.exists():
257
+ sky_templates_mtime = _get_latest_modification_time(
258
+ sky_templates_path)
259
+ if (last_modification_time is not None and
260
+ sky_templates_mtime is not None):
261
+ last_modification_time = max(last_modification_time,
262
+ sky_templates_mtime)
263
+ elif last_modification_time is None:
264
+ last_modification_time = sky_templates_mtime
247
265
  last_wheel_modification_time = _get_latest_modification_time(WHEEL_DIR)
248
266
 
249
267
  # Only build wheels if the wheel is outdated, wheel does not exist
sky/catalog/__init__.py CHANGED
@@ -247,6 +247,13 @@ def get_accelerators_from_instance_type(
247
247
  instance_type)
248
248
 
249
249
 
250
+ def get_arch_from_instance_type(instance_type: str,
251
+ clouds: CloudFilter = None) -> Optional[str]:
252
+ """Returns the arch from a instance type."""
253
+ return _map_clouds_catalog(clouds, 'get_arch_from_instance_type',
254
+ instance_type)
255
+
256
+
250
257
  def get_instance_type_for_accelerator(
251
258
  acc_name: str,
252
259
  acc_count: Union[int, float],
@@ -326,6 +333,7 @@ def get_common_gpus() -> List[str]:
326
333
  'A10G',
327
334
  'A100',
328
335
  'A100-80GB',
336
+ 'B200',
329
337
  'H100',
330
338
  'H200',
331
339
  'L4',
@@ -271,6 +271,10 @@ def get_accelerators_from_instance_type(
271
271
  _get_df(), instance_type)
272
272
 
273
273
 
274
+ def get_arch_from_instance_type(instance_type: str) -> Optional[str]:
275
+ return common.get_arch_from_instance_type_impl(_get_df(), instance_type)
276
+
277
+
274
278
  def get_instance_type_for_accelerator(
275
279
  acc_name: str,
276
280
  acc_count: int,
sky/catalog/common.py CHANGED
@@ -385,7 +385,7 @@ def get_hourly_cost_impl(
385
385
  f'{instance_type!r}.')
386
386
  cheapest_idx = df[price_str].idxmin()
387
387
  cheapest = df.loc[cheapest_idx]
388
- return cheapest[price_str]
388
+ return float(cheapest[price_str])
389
389
 
390
390
 
391
391
  def _get_value(value):
@@ -527,6 +527,24 @@ def get_accelerators_from_instance_type_impl(
527
527
  return {acc_name: _convert(acc_count)}
528
528
 
529
529
 
530
+ def get_arch_from_instance_type_impl(
531
+ df: 'pd.DataFrame',
532
+ instance_type: str,
533
+ ) -> Optional[str]:
534
+ df = _get_instance_type(df, instance_type, None)
535
+ if df.empty:
536
+ with ux_utils.print_exception_no_traceback():
537
+ raise ValueError(f'No instance type {instance_type} found.')
538
+ row = df.iloc[0]
539
+ if 'Arch' not in row:
540
+ return None
541
+ arch = row['Arch']
542
+ if pd.isnull(arch):
543
+ return None
544
+
545
+ return arch
546
+
547
+
530
548
  def get_instance_type_for_accelerator_impl(
531
549
  df: 'pd.DataFrame',
532
550
  acc_name: str,
@@ -60,6 +60,7 @@ ALL_REGIONS = [
60
60
  'ap-northeast-2',
61
61
  'ap-southeast-1',
62
62
  'ap-southeast-2',
63
+ 'ap-southeast-4',
63
64
  'ap-northeast-1',
64
65
  ]
65
66
  US_REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
@@ -67,17 +68,13 @@ US_REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
67
68
  # The following columns will be included in the final catalog.
68
69
  USEFUL_COLUMNS = [
69
70
  'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB',
70
- 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone'
71
+ 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone', 'Arch'
71
72
  ]
72
73
 
73
74
  # NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is
74
75
  # only available in this region, but it serves pricing information for all
75
76
  # regions.
76
77
  PRICING_TABLE_URL_FMT = 'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.csv' # pylint: disable=line-too-long
77
- # Hardcode the regions that offer p4de.24xlarge as our credential does not have
78
- # the permission to query the offerings of the instance.
79
- # Ref: https://aws.amazon.com/ec2/instance-types/p4/
80
- P4DE_REGIONS = ['us-east-1', 'us-west-2']
81
78
  # g6f instances have fractional GPUs, but the API returns Count: 1 under
82
79
  # GpuInfo. However, the GPU memory is properly scaled. Taking the instance GPU
83
80
  # divided by the total memory of an L4 will give us the fraction of the GPU.
@@ -214,35 +211,6 @@ def _get_spot_pricing_table(region: str) -> 'pd.DataFrame':
214
211
  return df
215
212
 
216
213
 
217
- def _patch_p4de(region: str, df: 'pd.DataFrame',
218
- pricing_df: 'pd.DataFrame') -> 'pd.DataFrame':
219
- # Hardcoded patch for p4de.24xlarge, as our credentials doesn't have access
220
- # to the instance type.
221
- # Columns:
222
- # InstanceType,AcceleratorName,AcceleratorCount,vCPUs,MemoryGiB,GpuInfo,
223
- # Price,SpotPrice,Region,AvailabilityZone
224
- records = []
225
- for zone in df[df['Region'] == region]['AvailabilityZone'].unique():
226
- records.append({
227
- 'InstanceType': 'p4de.24xlarge',
228
- 'AcceleratorName': 'A100-80GB',
229
- 'AcceleratorCount': 8,
230
- 'vCPUs': 96,
231
- 'MemoryGiB': 1152,
232
- 'GpuInfo':
233
- ('{\'Gpus\': [{\'Name\': \'A100-80GB\', \'Manufacturer\': '
234
- '\'NVIDIA\', \'Count\': 8, \'MemoryInfo\': {\'SizeInMiB\': '
235
- '81920}}], \'TotalGpuMemoryInMiB\': 655360}'),
236
- 'AvailabilityZone': zone,
237
- 'Region': region,
238
- 'Price': pricing_df[pricing_df['InstanceType'] == 'p4de.24xlarge']
239
- ['Price'].values[0],
240
- 'SpotPrice': np.nan,
241
- })
242
- df = pd.concat([df, pd.DataFrame.from_records(records)])
243
- return df
244
-
245
-
246
214
  def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
247
215
  try:
248
216
  # Fetch the zone info first to make sure the account has access to the
@@ -266,7 +234,7 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
266
234
  def get_acc_info(row) -> Tuple[Optional[str], float]:
267
235
  accelerator = None
268
236
  for col, info_key in [('GpuInfo', 'Gpus'),
269
- ('InferenceAcceleratorInfo', 'Accelerators'),
237
+ ('NeuronInfo', 'NeuronDevices'),
270
238
  ('FpgaInfo', 'Fpgas')]:
271
239
  info = row.get(col)
272
240
  if isinstance(info, dict):
@@ -275,6 +243,17 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
275
243
  return None, np.nan
276
244
  return accelerator['Name'], accelerator['Count']
277
245
 
246
+ def get_arch(row) -> Optional[str]:
247
+ if 'ProcessorInfo' in row:
248
+ processor = row['ProcessorInfo']
249
+ if 'SupportedArchitectures' in processor:
250
+ archs = processor['SupportedArchitectures']
251
+ if isinstance(archs, list):
252
+ return archs[0]
253
+ elif isinstance(archs, str):
254
+ return archs
255
+ return None
256
+
278
257
  def get_vcpus(row) -> float:
279
258
  if not np.isnan(row['vCPU']):
280
259
  return float(row['vCPU'])
@@ -299,18 +278,6 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
299
278
  if row['InstanceType'] == 'p4de.24xlarge':
300
279
  acc_name = 'A100-80GB'
301
280
  acc_count = 8
302
- if row['InstanceType'].startswith('trn1'):
303
- # Trainium instances does not have a field for information of
304
- # the accelerators. We need to infer the accelerator info from
305
- # the instance type name.
306
- # aws ec2 describe-instance-types --region us-east-1
307
- # https://aws.amazon.com/ec2/instance-types/trn1/
308
- acc_name = 'Trainium'
309
- find_num_in_name = re.search(r'(\d+)xlarge',
310
- row['InstanceType'])
311
- assert find_num_in_name is not None, row['InstanceType']
312
- num_in_name = find_num_in_name.group(1)
313
- acc_count = int(num_in_name) // 2
314
281
  if row['InstanceType'] == 'p5en.48xlarge':
315
282
  # TODO(andyl): Check if this workaround still needed after
316
283
  # v0.10.0 released. Currently, the acc_name returned by the
@@ -320,10 +287,15 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
320
287
  if (row['InstanceType'].startswith('g6f') or
321
288
  row['InstanceType'].startswith('gr6f')):
322
289
  # These instance actually have only fractional GPUs, but the API
323
- # returns Count: 1 under GpuInfo. We need to check the GPU
324
- # memory to get the actual fraction of the GPU.
290
+ # returns Count: 1 or Count: 0 under GpuInfo. We need to
291
+ # directly check the GPU memory to get the actual fraction of
292
+ # the GPU. Note that TotalGpuMemoryInMiB seems unreliable here -
293
+ # sometimes it is unexpectedly 0.
325
294
  # See also Standard_NV{vcpu}ads_A10_v5 support on Azure.
326
- fraction = row['GpuInfo']['TotalGpuMemoryInMiB'] / L4_GPU_MEMORY
295
+ assert len(row['GpuInfo']['Gpus']) == 1
296
+ assert row['GpuInfo']['Gpus'][0]['Name'] == 'L4'
297
+ fraction = row['GpuInfo']['Gpus'][0]['MemoryInfo'][
298
+ 'SizeInMiB'] / L4_GPU_MEMORY
327
299
  acc_count = round(fraction, 3)
328
300
  if row['InstanceType'] == 'p5.4xlarge':
329
301
  acc_count = 1
@@ -332,6 +304,7 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
332
304
  'AcceleratorCount': acc_count,
333
305
  'vCPUs': get_vcpus(row),
334
306
  'MemoryGiB': get_memory_gib(row),
307
+ 'Arch': get_arch(row),
335
308
  })
336
309
 
337
310
  # The AWS API may not have all the instance types in the pricing table,
@@ -355,11 +328,21 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
355
328
  df = pd.concat(
356
329
  [df, df.apply(get_additional_columns, axis='columns')],
357
330
  axis='columns')
358
- # patch the df for p4de.24xlarge
359
- if region in P4DE_REGIONS:
360
- df = _patch_p4de(region, df, pricing_df)
361
331
  if 'GpuInfo' not in df.columns:
362
332
  df['GpuInfo'] = np.nan
333
+ if 'NeuronInfo' in df.columns:
334
+ # The AWS Neuron API uses 'NeuronDevices' instead of 'Gpus'
335
+ # in its dict; for consistency with GPU handling, rename key.
336
+ def map_neuroninfo(neuroninfo):
337
+ if isinstance(neuroninfo,
338
+ dict) and 'NeuronDevices' in neuroninfo:
339
+ # Rename 'NeuronDevices' to 'Gpus'
340
+ neuroninfo = neuroninfo.copy()
341
+ neuroninfo['Gpus'] = neuroninfo.pop('NeuronDevices')
342
+ return neuroninfo
343
+
344
+ df['NeuronInfo'] = df['NeuronInfo'].apply(map_neuroninfo)
345
+ df['GpuInfo'] = df['GpuInfo'].fillna(df['NeuronInfo'])
363
346
  df = df[USEFUL_COLUMNS]
364
347
  except Exception as e: # pylint: disable=broad-except
365
348
  print(traceback.format_exc())
@@ -407,44 +390,70 @@ def get_all_regions_instance_types_df(regions: Set[str]) -> 'pd.DataFrame':
407
390
  # TODO(tian): find out the driver version.
408
391
  # Neuron driver:
409
392
  _GPU_DESC_UBUNTU_DATE = [
410
- ('gpu', 'AMI GPU PyTorch 2.1.0', '20.04', '20231103'),
411
- ('gpu', 'AMI GPU PyTorch 1.10.0', '18.04', '20221114'),
412
- ('k80', 'AMI GPU PyTorch 1.10.0', '20.04', '20211208'),
413
- ('k80', 'AMI GPU PyTorch 1.10.0', '18.04', '20211208'),
414
- ('neuron', 'Base Neuron AMI', '22.04', '20240923'),
393
+ ('neuron', '/aws/service/neuron/dlami/multi-framework', '22.04'),
415
394
  ]
416
395
 
417
396
 
418
- def _fetch_image_id(region: str, description: str, ubuntu_version: str,
419
- creation_date: str) -> Optional[str]:
397
+ def _fetch_image_creation_date(region: str,
398
+ image_id: Optional[str]) -> Optional[str]:
399
+ if image_id is None:
400
+ return None
420
401
  try:
421
402
  image = subprocess.check_output(f"""\
422
- aws ec2 describe-images --region {region} --owners amazon \\
423
- --filters 'Name=name,Values="Deep Learning {description} (Ubuntu {ubuntu_version}) {creation_date}"' \\
424
- 'Name=state,Values=available' --query 'Images[:1].ImageId' --output text
403
+ aws ec2 describe-images --region {region} --image-ids {image_id} \\
404
+ --query 'Images[0].Name' --output text
425
405
  """,
426
406
  shell=True)
427
407
  except subprocess.CalledProcessError as e:
428
- print(f'Failed {region}, {description}, {ubuntu_version}, '
429
- f'{creation_date}. Trying next date.')
408
+ print(f'Failed to fetch image creation date for {region}, {image_id}')
430
409
  print(f'{type(e)}: {e}')
431
410
  image_id = None
411
+ else:
412
+ assert image is not None
413
+ image_name = image.decode('utf-8').strip()
414
+ match = re.search(r'(\d+)$', image_name)
415
+ if match:
416
+ return match.group(1)
417
+ return None
418
+
419
+
def _fetch_image_id_from_ssm_param(
        region: str,
        ssm_prefix: str,
        ubuntu_version: str = '22.04') -> Optional[str]:
    """Looks up the latest image ID published under an SSM parameter.

    Queries ``{ssm_prefix}/ubuntu-{ubuntu_version}/latest/image_id`` with
    ``aws ssm get-parameter`` in the given region.

    Args:
        region: AWS region to query.
        ssm_prefix: SSM parameter path prefix for the image family.
        ubuntu_version: Ubuntu release used in the parameter path.

    Returns:
        The parameter value (the image ID), or ``None`` when the AWS CLI
        call fails or prints nothing.
    """
    param_name = f'{ssm_prefix}/ubuntu-{ubuntu_version}/latest/image_id'
    # Use an argv list rather than a shell string so that the interpolated
    # values are never interpreted by a shell.
    cmd = [
        'aws', 'ssm', 'get-parameter', '--region', region, '--name',
        param_name, '--query', 'Parameter.Value', '--output', 'text'
    ]
    try:
        output = subprocess.check_output(cmd)
    except (subprocess.CalledProcessError, OSError) as e:
        # OSError covers the case where the `aws` executable cannot be
        # launched at all; this lookup is best-effort, so report and move on.
        print(
            f'Failed to fetch image ID from SSM parameter for {region}, {ssm_prefix}, {ubuntu_version}'
        )
        print(f'{type(e)}: {e}')
        return None
    # An empty result is treated as "not found" so callers that filter on
    # missing image IDs do not keep a blank entry.
    return output.decode('utf-8').strip() or None
436
440
 
437
441
 
def _get_image_row(
    region: str,
    gpu: str,
    ssm_prefix: str,
    ubuntu_version: str = '22.04'
) -> Tuple[str, str, str, str, Optional[str], Optional[str]]:
    """Builds one image-catalog row for a region and accelerator family.

    Args:
        region: AWS region to look the image up in.
        gpu: Accelerator family label embedded in the tag.
        ssm_prefix: SSM parameter path prefix used to resolve the image ID.
        ubuntu_version: Ubuntu release of the image.

    Returns:
        A tuple ``(tag, region, 'ubuntu', ubuntu_version, image_id,
        creation_date)``; the last two entries are ``None`` when the
        corresponding lookup did not produce a value.
    """
    print(f'Getting image for {region}, {ssm_prefix}, {ubuntu_version}, {gpu}')
    image_id = _fetch_image_id_from_ssm_param(region, ssm_prefix,
                                              ubuntu_version)
    # Only query the creation date when an image ID was actually resolved.
    creation_date = (None if image_id is None else
                     _fetch_image_creation_date(region, image_id))
    tag = f'skypilot:{gpu}-ubuntu-{ubuntu_version.replace(".", "")}'
    row = (tag, region, 'ubuntu', ubuntu_version, image_id, creation_date)
    return row
448
457
 
449
458
 
450
459
  def get_all_regions_images_df(regions: Set[str]) -> 'pd.DataFrame':
@@ -559,13 +568,26 @@ if __name__ == '__main__':
559
568
  instance_df.to_csv('aws/vms.csv', index=False)
560
569
  print('AWS Service Catalog saved to aws/vms.csv')
561
570
 
562
- # Disable refreshing images.csv as we are using skypilot custom AMIs
571
+ # Disable refreshing images.csv for skypilot custom AMIs
572
+ # refresh only the neuron based images
563
573
  # See sky/clouds/catalog/images/README.md for more details.
564
- # image_df = get_all_regions_images_df(user_regions)
565
- # _check_regions_integrity(image_df, 'images')
566
-
567
- # image_df.to_csv('aws/images.csv', index=False)
568
- # print('AWS Images saved to aws/images.csv')
574
+ image_df = get_all_regions_images_df(user_regions)
575
+ _check_regions_integrity(image_df, 'images')
576
+ # filter out rows where ImageId is None
577
+ image_df = image_df[image_df['ImageId'].notna()]
578
+
579
+ # check if aws/images.csv exists
580
+ if os.path.exists('aws/images.csv'):
581
+ # load the data from aws/images.csv
582
+ existing_image_df = pd.read_csv('aws/images.csv')
583
+ # filter out the neuron based images
584
+ existing_image_df = existing_image_df[~existing_image_df['Tag'].
585
+ eq('skypilot:neuron-ubuntu-2204')]
586
+ # concat the new neuron based images with the existing images
587
+ image_df = pd.concat([existing_image_df, image_df])
588
+
589
+ image_df.to_csv('aws/images.csv', index=False)
590
+ print('AWS Images saved to aws/images.csv')
569
591
 
570
592
  if args.az_mappings:
571
593
  az_mappings_df = fetch_availability_zone_mappings()
@@ -182,8 +182,9 @@ TPU_V4_HOST_DF = pd.read_csv(
182
182
  SERIES_TO_DESCRIPTION = {
183
183
  'a2': 'A2 Instance',
184
184
  'a3': 'A3 Instance',
185
- # TODO(zhwu): GCP does not have A4 instance in SKUs API yet. We keep it here
186
- # for completeness.
185
+ # NOTE: GCP does not provide separate CPU/RAM pricing for A4 instances.
186
+ # The B200 GPU pricing includes the full VM cost. See special handling in
187
+ # get_vm_price() which sets A4 VM price to 0.
187
188
  'a4': 'A4 Instance',
188
189
  'c2': 'Compute optimized',
189
190
  'c2d': 'C2D AMD Instance',
@@ -394,6 +395,15 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
394
395
  if series in ['f1', 'g1']:
395
396
  memory_price = 0.0
396
397
 
398
+ # Special case for A4 instances.
399
+ # GCP does not provide separate CPU/RAM pricing for A4 instances in the
400
+ # SKUs API. The GPU pricing (B200) includes the full VM cost.
401
+ # We set the VM price to 0 so the entry is not dropped, and the GPU
402
+ # pricing will provide the total cost.
403
+ if series == 'a4':
404
+ cpu_price = 0.0
405
+ memory_price = 0.0
406
+
397
407
  # TODO(tian): (2024/11/10) Some SKUs are missing in the SKUs API. We
398
408
  # skip them in the catalog for now. We should investigate why they are
399
409
  # missing and add them back.
@@ -525,7 +535,24 @@ def get_gpu_df(skus: List[Dict[str, Any]],
525
535
  row_gpu_name = row['AcceleratorName']
526
536
  if row['Region'] not in sku['serviceRegions']:
527
537
  continue
528
- if sku['category']['usageType'] != ondemand_or_spot:
538
+
539
+ # Check usageType matches, with special handling for B200 spot.
540
+ # GCP has a bug where some B200 spot SKUs have usageType='OnDemand'
541
+ # but the description contains 'Spot Preemptible'.
542
+ usage_type = sku['category']['usageType']
543
+ description = sku['description']
544
+ is_spot_description = 'spot preemptible' in description.lower()
545
+
546
+ if usage_type != ondemand_or_spot:
547
+ # For B200 spot pricing, also accept SKUs where description
548
+ # says "Spot Preemptible" even if usageType is wrong.
549
+ if not (spot and row_gpu_name == 'B200' and
550
+ is_spot_description):
551
+ continue
552
+
553
+ # For B200 on-demand, skip SKUs that are actually spot (description
554
+ # says "Spot Preemptible" but usageType is incorrectly 'OnDemand').
555
+ if not spot and row_gpu_name == 'B200' and is_spot_description:
529
556
  continue
530
557
 
531
558
  gpu_names = [f'{row_gpu_name} GPU']
@@ -7,6 +7,7 @@ from dataclasses import dataclass
7
7
  import decimal
8
8
  import json
9
9
  import logging
10
+ import os
10
11
  import re
11
12
  from typing import Any, Dict, List, Optional
12
13
 
@@ -22,8 +23,6 @@ TIMEOUT = 10
22
23
  PARENT_ID_TEMPLATE = 'project-{}public-images'
23
24
  ACCELERATOR_MANUFACTURER = 'NVIDIA'
24
25
 
25
- VRAM = {'L40S': 49152, 'H100': 81920, 'H200': 144384, 'B200': 184320}
26
-
27
26
 
28
27
  @dataclass
29
28
  class PresetInfo:
@@ -38,6 +37,7 @@ class PresetInfo:
38
37
  platform_name (str): The name of the platform the preset belongs to.
39
38
  gpu (int): The number of GPUs in the preset.
40
39
  vcpu (int): The number of virtual CPUs in the preset.
40
+ gpu_memory_gibibytes (int): size of gpu memory in GiB.
41
41
  memory_gib (int): The amount of memory in GiB in the preset.
42
42
  accelerator_manufacturer (str | None): The manufacturer of the
43
43
  accelerator (e.g., "NVIDIA"), or None if no accelerator.
@@ -54,6 +54,7 @@ class PresetInfo:
54
54
  platform_name: str
55
55
  gpu: int
56
56
  vcpu: int
57
+ gpu_memory_gibibytes: int
57
58
  memory_gib: int
58
59
  accelerator_manufacturer: Optional[str]
59
60
  accelerator_name: Optional[str]
@@ -157,6 +158,7 @@ def _estimate_platforms(platforms: List[Any], parent_id: str,
157
158
  platform_name=platform_name,
158
159
  gpu=preset.resources.gpu_count or 0,
159
160
  vcpu=preset.resources.vcpu_count,
161
+ gpu_memory_gibibytes=platform.spec.gpu_memory_gibibytes,
160
162
  memory_gib=preset.resources.memory_gibibytes,
161
163
  accelerator_manufacturer=ACCELERATOR_MANUFACTURER
162
164
  if platform_name.startswith('gpu-') else '',
@@ -178,6 +180,7 @@ def _write_preset_prices(presets: List[PresetInfo], output_file: str) -> None:
178
180
  presets (List[PresetInfo]): A list of PresetInfo objects to write.
179
181
  output_file (str): The path to the output CSV file.
180
182
  """
183
+ os.makedirs(os.path.dirname(output_file))
181
184
  # Set up the CSV writer to output to stdout
182
185
  with open(output_file, 'w', encoding='utf-8') as out:
183
186
  header = [
@@ -193,23 +196,23 @@ def _write_preset_prices(presets: List[PresetInfo], output_file: str) -> None:
193
196
  ]
194
197
  writer = csv.DictWriter(out, fieldnames=header)
195
198
  writer.writeheader()
196
-
199
+ # logger.info(presets)
197
200
  for preset in sorted(presets,
198
201
  key=lambda x:
199
202
  (bool(x.gpu), x.region, x.platform_name, x.vcpu)):
200
203
  gpu_info = ''
201
204
  if preset.gpu > 0 and preset.accelerator_name:
205
+ vram = preset.gpu_memory_gibibytes * 1024
202
206
  gpu_info_dict = {
203
207
  'Gpus': [{
204
208
  'Name': preset.accelerator_name,
205
209
  'Manufacturer': preset.accelerator_manufacturer,
206
210
  'Count': preset.gpu,
207
211
  'MemoryInfo': {
208
- 'SizeInMiB': VRAM.get(preset.accelerator_name, 0)
212
+ 'SizeInMiB': vram
209
213
  },
210
214
  }],
211
- 'TotalGpuMemoryInMiB': VRAM.get(preset.accelerator_name, 0)
212
- * preset.gpu,
215
+ 'TotalGpuMemoryInMiB': vram * preset.gpu,
213
216
  }
214
217
  gpu_info = json.dumps(gpu_info_dict).replace('"', '\'')
215
218