skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -46,7 +46,7 @@ ALL_REGIONS = [
46
46
  'eu-west-1',
47
47
  'eu-west-2',
48
48
  'eu-south-1',
49
- # 'eu-south-2', # no supported AMI
49
+ 'eu-south-2',
50
50
  'eu-west-3',
51
51
  'eu-north-1',
52
52
  'me-south-1',
@@ -67,17 +67,17 @@ US_REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
67
67
  # The following columns will be included in the final catalog.
68
68
  USEFUL_COLUMNS = [
69
69
  'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB',
70
- 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone'
70
+ 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone', 'Arch'
71
71
  ]
72
72
 
73
73
  # NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is
74
74
  # only available in this region, but it serves pricing information for all
75
75
  # regions.
76
76
  PRICING_TABLE_URL_FMT = 'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.csv' # pylint: disable=line-too-long
77
- # Hardcode the regions that offer p4de.24xlarge as our credential does not have
78
- # the permission to query the offerings of the instance.
79
- # Ref: https://aws.amazon.com/ec2/instance-types/p4/
80
- P4DE_REGIONS = ['us-east-1', 'us-west-2']
77
+ # g6f instances have fractional GPUs, but the API returns Count: 1 under
78
+ # GpuInfo. However, the GPU memory is properly scaled. Taking the instance GPU
79
+ # divided by the total memory of an L4 will give us the fraction of the GPU.
80
+ L4_GPU_MEMORY = 22888
81
81
 
82
82
  regions_enabled: Optional[Set[str]] = None
83
83
 
@@ -210,35 +210,6 @@ def _get_spot_pricing_table(region: str) -> 'pd.DataFrame':
210
210
  return df
211
211
 
212
212
 
213
- def _patch_p4de(region: str, df: 'pd.DataFrame',
214
- pricing_df: 'pd.DataFrame') -> 'pd.DataFrame':
215
- # Hardcoded patch for p4de.24xlarge, as our credentials doesn't have access
216
- # to the instance type.
217
- # Columns:
218
- # InstanceType,AcceleratorName,AcceleratorCount,vCPUs,MemoryGiB,GpuInfo,
219
- # Price,SpotPrice,Region,AvailabilityZone
220
- records = []
221
- for zone in df[df['Region'] == region]['AvailabilityZone'].unique():
222
- records.append({
223
- 'InstanceType': 'p4de.24xlarge',
224
- 'AcceleratorName': 'A100-80GB',
225
- 'AcceleratorCount': 8,
226
- 'vCPUs': 96,
227
- 'MemoryGiB': 1152,
228
- 'GpuInfo':
229
- ('{\'Gpus\': [{\'Name\': \'A100-80GB\', \'Manufacturer\': '
230
- '\'NVIDIA\', \'Count\': 8, \'MemoryInfo\': {\'SizeInMiB\': '
231
- '81920}}], \'TotalGpuMemoryInMiB\': 655360}'),
232
- 'AvailabilityZone': zone,
233
- 'Region': region,
234
- 'Price': pricing_df[pricing_df['InstanceType'] == 'p4de.24xlarge']
235
- ['Price'].values[0],
236
- 'SpotPrice': np.nan,
237
- })
238
- df = pd.concat([df, pd.DataFrame.from_records(records)])
239
- return df
240
-
241
-
242
213
  def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
243
214
  try:
244
215
  # Fetch the zone info first to make sure the account has access to the
@@ -271,13 +242,24 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
271
242
  return None, np.nan
272
243
  return accelerator['Name'], accelerator['Count']
273
244
 
245
+ def get_arch(row) -> Optional[str]:
246
+ if 'ProcessorInfo' in row:
247
+ processor = row['ProcessorInfo']
248
+ if 'SupportedArchitectures' in processor:
249
+ archs = processor['SupportedArchitectures']
250
+ if isinstance(archs, list):
251
+ return archs[0]
252
+ elif isinstance(archs, str):
253
+ return archs
254
+ return None
255
+
274
256
  def get_vcpus(row) -> float:
275
257
  if not np.isnan(row['vCPU']):
276
258
  return float(row['vCPU'])
277
259
  try:
278
260
  return float(row['VCpuInfo']['DefaultVCpus'])
279
261
  except Exception as e: # pylint: disable=broad-except
280
- print('Error occured for row:', row)
262
+ print('Error occurred for row:', row)
281
263
  print('Error:', e)
282
264
  raise
283
265
 
@@ -313,11 +295,22 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
313
295
  # AWS API is 'NVIDIA', which is incorrect. See #4652.
314
296
  acc_name = 'H200'
315
297
  acc_count = 8
298
+ if (row['InstanceType'].startswith('g6f') or
299
+ row['InstanceType'].startswith('gr6f')):
300
+ # These instances actually have only fractional GPUs, but the API
301
+ # returns Count: 1 under GpuInfo. We need to check the GPU
302
+ # memory to get the actual fraction of the GPU.
303
+ # See also Standard_NV{vcpu}ads_A10_v5 support on Azure.
304
+ fraction = row['GpuInfo']['TotalGpuMemoryInMiB'] / L4_GPU_MEMORY
305
+ acc_count = round(fraction, 3)
306
+ if row['InstanceType'] == 'p5.4xlarge':
307
+ acc_count = 1
316
308
  return pd.Series({
317
309
  'AcceleratorName': acc_name,
318
310
  'AcceleratorCount': acc_count,
319
311
  'vCPUs': get_vcpus(row),
320
312
  'MemoryGiB': get_memory_gib(row),
313
+ 'Arch': get_arch(row),
321
314
  })
322
315
 
323
316
  # The AWS API may not have all the instance types in the pricing table,
@@ -341,9 +334,6 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
341
334
  df = pd.concat(
342
335
  [df, df.apply(get_additional_columns, axis='columns')],
343
336
  axis='columns')
344
- # patch the df for p4de.24xlarge
345
- if region in P4DE_REGIONS:
346
- df = _patch_p4de(region, df, pricing_df)
347
337
  if 'GpuInfo' not in df.columns:
348
338
  df['GpuInfo'] = np.nan
349
339
  df = df[USEFUL_COLUMNS]
@@ -546,7 +536,7 @@ if __name__ == '__main__':
546
536
  print('AWS Service Catalog saved to aws/vms.csv')
547
537
 
548
538
  # Disable refreshing images.csv as we are using skypilot custom AMIs
549
- # See sky/clouds/service_catalog/images/README.md for more details.
539
+ # See sky/catalog/images/README.md for more details.
550
540
  # image_df = get_all_regions_images_df(user_regions)
551
541
  # _check_regions_integrity(image_df, 'images')
552
542
 
@@ -9,18 +9,11 @@ import os
9
9
 
10
10
  import cudo_compute
11
11
 
12
- import sky.provision.cudo.cudo_utils as utils
12
+ from sky.provision.cudo import cudo_utils as utils
13
13
 
14
14
  VMS_CSV = 'cudo/vms.csv'
15
15
 
16
16
 
17
- def cudo_api():
18
- configuration = cudo_compute.Configuration()
19
- configuration.host = 'https://rest.compute.cudo.org'
20
- client = cudo_compute.ApiClient(configuration)
21
- return cudo_compute.VirtualMachinesApi(client)
22
-
23
-
24
17
  def get_gpu_info(count, model):
25
18
  mem = utils.cudo_gpu_mem[model]
26
19
  # pylint: disable=line-too-long
@@ -45,39 +38,46 @@ def get_instance_type(machine_type, vcpu, mem, gpu):
45
38
  mem) + 'gb'
46
39
 
47
40
 
48
- def machine_types(gpu_model, mem_gib, vcpu_count, gpu_count):
49
- try:
50
- api = cudo_api()
51
- types = api.list_vm_machine_types(mem_gib,
52
- vcpu_count,
53
- gpu=gpu_count,
54
- gpu_model=gpu_model)
55
- return types.to_dict()
56
- except cudo_compute.rest.ApiException as e:
57
- raise e
58
-
59
-
60
41
  def update_prices():
61
42
  rows = []
43
+
44
+ api = cudo_compute.cudo_api.virtual_machines()
45
+ all_types = api.list_vm_machine_types2()
46
+ all_machine_types = all_types.to_dict()['machine_types']
47
+
62
48
  for spec in utils.machine_specs:
63
- mts = machine_types('', spec['mem'], spec['vcpu'], spec['gpu'])
64
- for hc in mts['host_configs']:
65
- if not utils.gpu_exists(hc['gpu_model']):
66
- continue
67
- accelerator_name = utils.cudo_gpu_to_skypilot_gpu(hc['gpu_model'])
68
- row = {
69
- 'instance_type': get_instance_type(hc['machine_type'],
70
- spec['vcpu'], spec['mem'],
71
- spec['gpu']),
72
- 'accelerator_name': accelerator_name,
73
- 'accelerator_count': str(spec['gpu']) + '.0',
74
- 'vcpus': str(spec['vcpu']),
75
- 'memory_gib': str(spec['mem']),
76
- 'price': hc['total_price_hr']['value'],
77
- 'region': hc['data_center_id'],
78
- 'gpu_info': get_gpu_info(spec['gpu'], accelerator_name),
79
- }
80
- rows.append(row)
49
+ for machine_type in all_machine_types:
50
+ if (machine_type['min_vcpu'] <= spec['vcpu'] and
51
+ machine_type['min_memory_gib'] <= spec['mem'] and
52
+ utils.gpu_exists(machine_type['gpu_model'])):
53
+
54
+ accelerator_name = utils.cudo_gpu_to_skypilot_gpu(
55
+ machine_type['gpu_model'])
56
+
57
+ # Calculate total price per hour based on the given spec
58
+ vcpu_price = float(
59
+ machine_type['vcpu_price_hr']['value']) * spec['vcpu']
60
+ memory_price = float(
61
+ machine_type['memory_gib_price_hr']['value']) * spec['mem']
62
+ gpu_price = float(
63
+ machine_type['gpu_price_hr']['value']) * spec['gpu']
64
+ # Note: Not including storage and IPv4 prices
65
+ # for now as they may be optional
66
+ total_price = vcpu_price + memory_price + gpu_price
67
+
68
+ row = {
69
+ 'instance_type': get_instance_type(
70
+ machine_type['machine_type'], spec['vcpu'], spec['mem'],
71
+ spec['gpu']),
72
+ 'accelerator_name': accelerator_name,
73
+ 'accelerator_count': str(spec['gpu']) + '.0',
74
+ 'vcpus': str(spec['vcpu']),
75
+ 'memory_gib': str(spec['mem']),
76
+ 'price': str(total_price),
77
+ 'region': machine_type['data_center_id'],
78
+ 'gpu_info': get_gpu_info(spec['gpu'], accelerator_name),
79
+ }
80
+ rows.append(row)
81
81
  path = VMS_CSV
82
82
  with open(path, 'w', encoding='utf-8') as file:
83
83
  file.write(
@@ -179,9 +179,12 @@ TPU_V4_HOST_DF = pd.read_csv(
179
179
  # TODO(woosuk): Make this more robust.
180
180
  # Refer to: https://github.com/skypilot-org/skypilot/issues/1006
181
181
  # Unsupported Series: 'f1', 'm2'
182
- SERIES_TO_DISCRIPTION = {
182
+ SERIES_TO_DESCRIPTION = {
183
183
  'a2': 'A2 Instance',
184
184
  'a3': 'A3 Instance',
185
+ # TODO(zhwu): GCP does not have A4 instances in its SKUs API yet. We keep it here
186
+ # for completeness.
187
+ 'a4': 'A4 Instance',
185
188
  'c2': 'Compute optimized',
186
189
  'c2d': 'C2D AMD Instance',
187
190
  'c3': 'C3 Instance',
@@ -195,9 +198,11 @@ SERIES_TO_DISCRIPTION = {
195
198
  'n1': 'N1 Predefined Instance',
196
199
  'n2': 'N2 Instance',
197
200
  'n2d': 'N2D AMD Instance',
201
+ 'n4': 'N4 Instance',
198
202
  't2a': 'T2A Arm Instance',
199
203
  't2d': 'T2D AMD Instance',
200
204
  }
205
+
201
206
  creds, project_id = google.auth.default()
202
207
  gcp_client = discovery.build('compute', 'v1')
203
208
  tpu_client = discovery.build('tpu', 'v1')
@@ -334,7 +339,7 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
334
339
 
335
340
  # Drop the unsupported series.
336
341
  df = df[df['InstanceType'].str.startswith(
337
- tuple(f'{series}-' for series in SERIES_TO_DISCRIPTION))]
342
+ tuple(f'{series}-' for series in SERIES_TO_DESCRIPTION))]
338
343
  df = df[~df['AvailabilityZone'].str.startswith(tuple(TPU_V4_ZONES))]
339
344
 
340
345
  # TODO(woosuk): Make this more efficient.
@@ -352,7 +357,7 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
352
357
 
353
358
  # Check if the SKU is for the correct series.
354
359
  description = sku['description']
355
- if SERIES_TO_DISCRIPTION[series].lower() not in description.lower():
360
+ if SERIES_TO_DESCRIPTION[series].lower() not in description.lower():
356
361
  continue
357
362
  # Special check for M1 instances.
358
363
  if series == 'm1' and 'M3' in description:
@@ -434,10 +439,18 @@ def _get_gpus_for_zone(zone: str) -> 'pd.DataFrame':
434
439
  gpu_name = gpu_name.upper()
435
440
  if 'H100-80GB' in gpu_name:
436
441
  gpu_name = 'H100'
437
- if 'H100-MEGA-80GB' in gpu_name:
442
+
443
+ if 'H100-MEGA' in gpu_name:
438
444
  gpu_name = 'H100-MEGA'
439
445
  if count != 8:
440
- # H100-MEGA only has 8 cards.
446
+ continue
447
+ elif 'H200' in gpu_name:
448
+ gpu_name = 'H200'
449
+ if count != 8:
450
+ continue
451
+ elif 'B200' in gpu_name:
452
+ gpu_name = 'B200'
453
+ if count != 8:
441
454
  continue
442
455
  if 'VWS' in gpu_name:
443
456
  continue
@@ -468,6 +481,8 @@ def _gpu_info_from_name(name: str) -> Optional[Dict[str, List[Dict[str, Any]]]]:
468
481
  'A100': 40 * 1024,
469
482
  'H100': 80 * 1024,
470
483
  'H100-MEGA': 80 * 1024,
484
+ 'H200': 141 * 1024,
485
+ 'B200': 180 * 1024,
471
486
  'P4': 8 * 1024,
472
487
  'T4': 16 * 1024,
473
488
  'V100': 16 * 1024,
@@ -507,22 +522,30 @@ def get_gpu_df(skus: List[Dict[str, Any]],
507
522
  ondemand_or_spot = 'OnDemand' if not spot else 'Preemptible'
508
523
  gpu_price = None
509
524
  for sku in gpu_skus:
525
+ row_gpu_name = row['AcceleratorName']
510
526
  if row['Region'] not in sku['serviceRegions']:
511
527
  continue
512
528
  if sku['category']['usageType'] != ondemand_or_spot:
513
529
  continue
514
530
 
515
- gpu_names = [row['AcceleratorName']]
516
- if gpu_names[0] == 'A100-80GB':
517
- gpu_names = ['A100 80GB']
518
- if gpu_names[0] == 'H100':
519
- gpu_names = ['H100 80GB']
520
- if gpu_names[0] == 'H100-MEGA':
531
+ gpu_names = [f'{row_gpu_name} GPU']
532
+ if row_gpu_name == 'A100-80GB':
533
+ gpu_names = ['A100 80GB GPU']
534
+ elif row_gpu_name == 'H100':
535
+ gpu_names = ['H100 80GB GPU']
536
+ elif row_gpu_name == 'H100-MEGA':
521
537
  # Seems that H100-MEGA has two different descriptions in SKUs in
522
538
  # different regions: 'H100 80GB Mega' and 'H100 80GB Plus'.
523
- gpu_names = ['H100 80GB Mega', 'H100 80GB Plus']
524
- if not any(f'{gpu_name} GPU' in sku['description']
525
- for gpu_name in gpu_names):
539
+ gpu_names = [
540
+ 'H100 80GB Mega GPU', 'H100 Mega 80GB GPU',
541
+ 'H100 80GB Plus GPU'
542
+ ]
543
+ elif row_gpu_name == 'H200':
544
+ gpu_names = ['H200 141GB GPU']
545
+ elif row_gpu_name == 'B200':
546
+ gpu_names = ['Nvidia B200 (1 gpu slice)']
547
+ if not any(
548
+ gpu_name in sku['description'] for gpu_name in gpu_names):
526
549
  continue
527
550
 
528
551
  unit_price = _get_unit_price(sku)
@@ -554,7 +577,7 @@ def _get_tpu_response_for_zone(zone: str) -> list:
554
577
  # Sometimes the response is empty ({}) even for enabled zones. Here we
555
578
  # retry the request for a few times.
556
579
  backoff = common_utils.Backoff(initial_backoff=1)
557
- for _ in range(TPU_RETRY_CNT):
580
+ for retry_cnt in range(TPU_RETRY_CNT):
558
581
  tpus_request = (
559
582
  tpu_client.projects().locations().acceleratorTypes().list(
560
583
  parent=parent))
@@ -570,6 +593,10 @@ def _get_tpu_response_for_zone(zone: str) -> list:
570
593
  print(f' An error occurred: {error}')
571
594
  # If error happens, fail early.
572
595
  return []
596
+ except TimeoutError:
597
+ print(f' TimeoutError: Failed to fetch TPUs for zone {zone!r}, '
598
+ f'retry {retry_cnt + 1} of {TPU_RETRY_CNT}')
599
+
573
600
  time_to_sleep = backoff.current_backoff()
574
601
  print(f' Retry zone {zone!r} in {time_to_sleep} seconds...')
575
602
  time.sleep(time_to_sleep)
@@ -0,0 +1,136 @@
1
+ """Script to fetch Hyperbolic instance data and generate catalog."""
2
+ import argparse
3
+ import csv
4
+ import json
5
+ import os
6
+ import sys
7
+ from typing import Any, Dict
8
+
9
+ import requests
10
+
11
+ ENDPOINT = 'https://api.hyperbolic.xyz/v2/skypilot/catalog'
12
+ API_KEY_PATH = os.path.expanduser('~/.hyperbolic/api_key')
13
+
14
+ REQUIRED_FIELDS = [
15
+ 'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB',
16
+ 'StorageGiB', 'Price', 'Region', 'GpuInfo', 'SpotPrice'
17
+ ]
18
+
19
+
20
+ class HyperbolicCatalogError(Exception):
21
+ """Base exception for Hyperbolic catalog errors."""
22
+ pass
23
+
24
+
25
+ def get_api_key(api_key=None) -> str:
26
+ """Get API key from arg, env var, or file."""
27
+ if api_key:
28
+ return api_key
29
+ if api_key := os.environ.get('HYPERBOLIC_API_KEY'):
30
+ return api_key
31
+ try:
32
+ with open(API_KEY_PATH, 'r', encoding='utf-8') as f:
33
+ return f.read().strip()
34
+ except FileNotFoundError as exc:
35
+ raise HyperbolicCatalogError(
36
+ 'No API key found. Please either:\n'
37
+ '1. Pass --api-key\n'
38
+ '2. Set HYPERBOLIC_API_KEY environment variable\n'
39
+ '3. Create ~/.hyperbolic/api_key file') from exc
40
+
41
+
42
+ def get_output_path() -> str:
43
+ """Get output path for catalog file."""
44
+ current_dir = os.getcwd()
45
+ if os.path.basename(current_dir) == 'hyperbolic':
46
+ return 'vms.csv'
47
+ hyperbolic_dir = os.path.join(current_dir, 'hyperbolic')
48
+ os.makedirs(hyperbolic_dir, exist_ok=True)
49
+ return os.path.join(hyperbolic_dir, 'vms.csv')
50
+
51
+
52
+ def validate_instance_data(instance: Dict[str, Any]) -> None:
53
+ """Validate instance data has all required fields."""
54
+ missing_fields = [
55
+ field for field in REQUIRED_FIELDS if field not in instance
56
+ ]
57
+ if missing_fields:
58
+ raise HyperbolicCatalogError(
59
+ f'Instance data missing required fields: {missing_fields}')
60
+
61
+
62
+ def create_catalog(api_key=None) -> None:
63
+ """Generate Hyperbolic catalog CSV file."""
64
+ try:
65
+ response = requests.get(
66
+ ENDPOINT,
67
+ headers={'Authorization': f'Bearer {get_api_key(api_key)}'},
68
+ timeout=30)
69
+ response.raise_for_status()
70
+
71
+ try:
72
+ data = response.json()
73
+ except json.JSONDecodeError as e:
74
+ raise HyperbolicCatalogError(
75
+ f'Invalid JSON response from API: {response.text}') from e
76
+
77
+ if 'vms' not in data:
78
+ raise HyperbolicCatalogError(
79
+ f'Missing "vms" field in API response: {data}')
80
+
81
+ instances = data['vms']
82
+ if not isinstance(instances, list):
83
+ raise HyperbolicCatalogError(
84
+ f'Expected list of instances, got {type(instances)}')
85
+
86
+ if not instances:
87
+ raise HyperbolicCatalogError('No instances found in API response')
88
+
89
+ # Validate each instance
90
+ for instance in instances:
91
+ validate_instance_data(instance)
92
+
93
+ except requests.exceptions.RequestException as e:
94
+ raise HyperbolicCatalogError(
95
+ f'Failed to fetch instance data: {e}') from e
96
+
97
+ output_path = get_output_path()
98
+ try:
99
+ with open(output_path, 'w', newline='', encoding='utf-8') as f:
100
+ writer = csv.DictWriter(f, fieldnames=REQUIRED_FIELDS)
101
+ writer.writeheader()
102
+
103
+ for instance in instances:
104
+ entry = instance.copy()
105
+ # Convert GpuInfo to string format
106
+ entry['GpuInfo'] = json.dumps(entry['GpuInfo'],
107
+ ensure_ascii=False).replace(
108
+ '"', "'") # pylint: disable=invalid-string-quote
109
+ writer.writerow(entry)
110
+ except (IOError, OSError) as e:
111
+ raise HyperbolicCatalogError(
112
+ f'Failed to write catalog file to {output_path}: {e}') from e
113
+
114
+
115
+ def main() -> int:
116
+ """Main entry point."""
117
+ parser = argparse.ArgumentParser(
118
+ description='Fetch Hyperbolic instance data')
119
+ parser.add_argument('--api-key', help='Hyperbolic API key')
120
+ args = parser.parse_args()
121
+
122
+ try:
123
+ create_catalog(args.api_key)
124
+ print(f'Hyperbolic Service Catalog saved to {get_output_path()}')
125
+ return 0
126
+ except HyperbolicCatalogError as e:
127
+ print(f'Error: {e}', file=sys.stderr)
128
+ return 1
129
+ except (requests.exceptions.RequestException, json.JSONDecodeError, IOError,
130
+ OSError) as e:
131
+ print(f'Unexpected error: {e}', file=sys.stderr)
132
+ return 1
133
+
134
+
135
+ if __name__ == '__main__':
136
+ sys.exit(main())
@@ -49,6 +49,7 @@ GPU_TO_MEMORY = {
49
49
  'V100': 16384,
50
50
  'H100': 81920,
51
51
  'GH200': 98304,
52
+ 'B200': 184320, # 180 GB
52
53
  'GENERAL': None
53
54
  }
54
55