skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -46,7 +46,7 @@ ALL_REGIONS = [
46
46
  'eu-west-1',
47
47
  'eu-west-2',
48
48
  'eu-south-1',
49
- # 'eu-south-2', # no supported AMI
49
+ 'eu-south-2',
50
50
  'eu-west-3',
51
51
  'eu-north-1',
52
52
  'me-south-1',
@@ -60,6 +60,7 @@ ALL_REGIONS = [
60
60
  'ap-northeast-2',
61
61
  'ap-southeast-1',
62
62
  'ap-southeast-2',
63
+ 'ap-southeast-4',
63
64
  'ap-northeast-1',
64
65
  ]
65
66
  US_REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
@@ -67,17 +68,17 @@ US_REGIONS = ['us-east-1', 'us-east-2', 'us-west-1', 'us-west-2']
67
68
  # The following columns will be included in the final catalog.
68
69
  USEFUL_COLUMNS = [
69
70
  'InstanceType', 'AcceleratorName', 'AcceleratorCount', 'vCPUs', 'MemoryGiB',
70
- 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone'
71
+ 'GpuInfo', 'Price', 'SpotPrice', 'Region', 'AvailabilityZone', 'Arch'
71
72
  ]
72
73
 
73
74
  # NOTE: the hard-coded us-east-1 URL is not a typo. AWS pricing endpoint is
74
75
  # only available in this region, but it serves pricing information for all
75
76
  # regions.
76
77
  PRICING_TABLE_URL_FMT = 'https://pricing.us-east-1.amazonaws.com/offers/v1.0/aws/AmazonEC2/current/{region}/index.csv' # pylint: disable=line-too-long
77
- # Hardcode the regions that offer p4de.24xlarge as our credential does not have
78
- # the permission to query the offerings of the instance.
79
- # Ref: https://aws.amazon.com/ec2/instance-types/p4/
80
- P4DE_REGIONS = ['us-east-1', 'us-west-2']
78
+ # g6f instances have fractional GPUs, but the API returns Count: 1 under
79
+ # GpuInfo. However, the GPU memory is properly scaled. Taking the instance GPU
80
+ # divided by the total memory of an L4 will give us the fraction of the GPU.
81
+ L4_GPU_MEMORY = 22888
81
82
 
82
83
  regions_enabled: Optional[Set[str]] = None
83
84
 
@@ -210,35 +211,6 @@ def _get_spot_pricing_table(region: str) -> 'pd.DataFrame':
210
211
  return df
211
212
 
212
213
 
213
- def _patch_p4de(region: str, df: 'pd.DataFrame',
214
- pricing_df: 'pd.DataFrame') -> 'pd.DataFrame':
215
- # Hardcoded patch for p4de.24xlarge, as our credentials doesn't have access
216
- # to the instance type.
217
- # Columns:
218
- # InstanceType,AcceleratorName,AcceleratorCount,vCPUs,MemoryGiB,GpuInfo,
219
- # Price,SpotPrice,Region,AvailabilityZone
220
- records = []
221
- for zone in df[df['Region'] == region]['AvailabilityZone'].unique():
222
- records.append({
223
- 'InstanceType': 'p4de.24xlarge',
224
- 'AcceleratorName': 'A100-80GB',
225
- 'AcceleratorCount': 8,
226
- 'vCPUs': 96,
227
- 'MemoryGiB': 1152,
228
- 'GpuInfo':
229
- ('{\'Gpus\': [{\'Name\': \'A100-80GB\', \'Manufacturer\': '
230
- '\'NVIDIA\', \'Count\': 8, \'MemoryInfo\': {\'SizeInMiB\': '
231
- '81920}}], \'TotalGpuMemoryInMiB\': 655360}'),
232
- 'AvailabilityZone': zone,
233
- 'Region': region,
234
- 'Price': pricing_df[pricing_df['InstanceType'] == 'p4de.24xlarge']
235
- ['Price'].values[0],
236
- 'SpotPrice': np.nan,
237
- })
238
- df = pd.concat([df, pd.DataFrame.from_records(records)])
239
- return df
240
-
241
-
242
214
  def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
243
215
  try:
244
216
  # Fetch the zone info first to make sure the account has access to the
@@ -262,7 +234,7 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
262
234
  def get_acc_info(row) -> Tuple[Optional[str], float]:
263
235
  accelerator = None
264
236
  for col, info_key in [('GpuInfo', 'Gpus'),
265
- ('InferenceAcceleratorInfo', 'Accelerators'),
237
+ ('NeuronInfo', 'NeuronDevices'),
266
238
  ('FpgaInfo', 'Fpgas')]:
267
239
  info = row.get(col)
268
240
  if isinstance(info, dict):
@@ -271,13 +243,24 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
271
243
  return None, np.nan
272
244
  return accelerator['Name'], accelerator['Count']
273
245
 
246
def get_arch(row) -> Optional[str]:
    """Return the CPU architecture reported for an instance-type row.

    Args:
        row: A mapping-like record (e.g. a pandas Series or a dict) that
            may carry a 'ProcessorInfo' entry.

    Returns:
        ProcessorInfo['SupportedArchitectures'] itself when it is a string,
        its first element when it is a non-empty list, and None in every
        other case (missing keys, non-dict 'ProcessorInfo', empty list).
    """
    processor = row.get('ProcessorInfo')
    # The cell may hold a non-dict placeholder (e.g. NaN/None); a membership
    # test on a float would raise TypeError, so only dicts are inspected.
    if not isinstance(processor, dict):
        return None
    archs = processor.get('SupportedArchitectures')
    if isinstance(archs, str):
        return archs
    # An empty list has no first element; report the arch as unknown.
    if isinstance(archs, list) and archs:
        return archs[0]
    return None
256
+
274
257
  def get_vcpus(row) -> float:
275
258
  if not np.isnan(row['vCPU']):
276
259
  return float(row['vCPU'])
277
260
  try:
278
261
  return float(row['VCpuInfo']['DefaultVCpus'])
279
262
  except Exception as e: # pylint: disable=broad-except
280
- print('Error occured for row:', row)
263
+ print('Error occurred for row:', row)
281
264
  print('Error:', e)
282
265
  raise
283
266
 
@@ -295,29 +278,33 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
295
278
  if row['InstanceType'] == 'p4de.24xlarge':
296
279
  acc_name = 'A100-80GB'
297
280
  acc_count = 8
298
- if row['InstanceType'].startswith('trn1'):
299
- # Trainium instances does not have a field for information of
300
- # the accelerators. We need to infer the accelerator info from
301
- # the instance type name.
302
- # aws ec2 describe-instance-types --region us-east-1
303
- # https://aws.amazon.com/ec2/instance-types/trn1/
304
- acc_name = 'Trainium'
305
- find_num_in_name = re.search(r'(\d+)xlarge',
306
- row['InstanceType'])
307
- assert find_num_in_name is not None, row['InstanceType']
308
- num_in_name = find_num_in_name.group(1)
309
- acc_count = int(num_in_name) // 2
310
281
  if row['InstanceType'] == 'p5en.48xlarge':
311
282
  # TODO(andyl): Check if this workaround still needed after
312
283
  # v0.10.0 released. Currently, the acc_name returned by the
313
284
  # AWS API is 'NVIDIA', which is incorrect. See #4652.
314
285
  acc_name = 'H200'
315
286
  acc_count = 8
287
+ if (row['InstanceType'].startswith('g6f') or
288
+ row['InstanceType'].startswith('gr6f')):
289
+ # These instances actually have only fractional GPUs, but the API
290
+ # returns Count: 1 or Count: 0 under GpuInfo. We need to
291
+ # directly check the GPU memory to get the actual fraction of
292
+ # the GPU. Note that TotalGpuMemoryInMiB seems unreliable here -
293
+ # sometimes it is unexpectedly 0.
294
+ # See also Standard_NV{vcpu}ads_A10_v5 support on Azure.
295
+ assert len(row['GpuInfo']['Gpus']) == 1
296
+ assert row['GpuInfo']['Gpus'][0]['Name'] == 'L4'
297
+ fraction = row['GpuInfo']['Gpus'][0]['MemoryInfo'][
298
+ 'SizeInMiB'] / L4_GPU_MEMORY
299
+ acc_count = round(fraction, 3)
300
+ if row['InstanceType'] == 'p5.4xlarge':
301
+ acc_count = 1
316
302
  return pd.Series({
317
303
  'AcceleratorName': acc_name,
318
304
  'AcceleratorCount': acc_count,
319
305
  'vCPUs': get_vcpus(row),
320
306
  'MemoryGiB': get_memory_gib(row),
307
+ 'Arch': get_arch(row),
321
308
  })
322
309
 
323
310
  # The AWS API may not have all the instance types in the pricing table,
@@ -341,11 +328,21 @@ def _get_instance_types_df(region: str) -> Union[str, 'pd.DataFrame']:
341
328
  df = pd.concat(
342
329
  [df, df.apply(get_additional_columns, axis='columns')],
343
330
  axis='columns')
344
- # patch the df for p4de.24xlarge
345
- if region in P4DE_REGIONS:
346
- df = _patch_p4de(region, df, pricing_df)
347
331
  if 'GpuInfo' not in df.columns:
348
332
  df['GpuInfo'] = np.nan
333
+ if 'NeuronInfo' in df.columns:
334
+ # The AWS Neuron API uses 'NeuronDevices' instead of 'Gpus'
335
+ # in its dict; for consistency with GPU handling, rename key.
336
def map_neuroninfo(neuroninfo):
    """Return `neuroninfo` with its 'NeuronDevices' key renamed to 'Gpus'.

    Anything that is not a dict, and any dict lacking 'NeuronDevices', is
    handed back as the same object. Otherwise a new dict is returned and
    the input dict is left unmodified.
    """
    if not isinstance(neuroninfo, dict):
        return neuroninfo
    if 'NeuronDevices' not in neuroninfo:
        return neuroninfo
    # Shallow-copy every other entry in order, then store the device list
    # under 'Gpus' (appended last unless a 'Gpus' key already exists).
    renamed = {
        key: value
        for key, value in neuroninfo.items()
        if key != 'NeuronDevices'
    }
    renamed['Gpus'] = neuroninfo['NeuronDevices']
    return renamed
343
+
344
+ df['NeuronInfo'] = df['NeuronInfo'].apply(map_neuroninfo)
345
+ df['GpuInfo'] = df['GpuInfo'].fillna(df['NeuronInfo'])
349
346
  df = df[USEFUL_COLUMNS]
350
347
  except Exception as e: # pylint: disable=broad-except
351
348
  print(traceback.format_exc())
@@ -393,44 +390,70 @@ def get_all_regions_instance_types_df(regions: Set[str]) -> 'pd.DataFrame':
393
390
  # TODO(tian): find out the driver version.
394
391
  # Neuron driver:
395
392
  _GPU_DESC_UBUNTU_DATE = [
396
- ('gpu', 'AMI GPU PyTorch 2.1.0', '20.04', '20231103'),
397
- ('gpu', 'AMI GPU PyTorch 1.10.0', '18.04', '20221114'),
398
- ('k80', 'AMI GPU PyTorch 1.10.0', '20.04', '20211208'),
399
- ('k80', 'AMI GPU PyTorch 1.10.0', '18.04', '20211208'),
400
- ('neuron', 'Base Neuron AMI', '22.04', '20240923'),
393
+ ('neuron', '/aws/service/neuron/dlami/multi-framework', '22.04'),
401
394
  ]
402
395
 
403
396
 
404
- def _fetch_image_id(region: str, description: str, ubuntu_version: str,
405
- creation_date: str) -> Optional[str]:
397
+ def _fetch_image_creation_date(region: str,
398
+ image_id: Optional[str]) -> Optional[str]:
399
+ if image_id is None:
400
+ return None
406
401
  try:
407
402
  image = subprocess.check_output(f"""\
408
- aws ec2 describe-images --region {region} --owners amazon \\
409
- --filters 'Name=name,Values="Deep Learning {description} (Ubuntu {ubuntu_version}) {creation_date}"' \\
410
- 'Name=state,Values=available' --query 'Images[:1].ImageId' --output text
403
+ aws ec2 describe-images --region {region} --image-ids {image_id} \\
404
+ --query 'Images[0].Name' --output text
411
405
  """,
412
406
  shell=True)
413
407
  except subprocess.CalledProcessError as e:
414
- print(f'Failed {region}, {description}, {ubuntu_version}, '
415
- f'{creation_date}. Trying next date.')
408
+ print(f'Failed to fetch image creation date for {region}, {image_id}')
416
409
  print(f'{type(e)}: {e}')
417
410
  image_id = None
411
+ else:
412
+ assert image is not None
413
+ image_name = image.decode('utf-8').strip()
414
+ match = re.search(r'(\d+)$', image_name)
415
+ if match:
416
+ return match.group(1)
417
+ return None
418
+
419
+
420
def _fetch_image_id_from_ssm_param(
        region: str,
        ssm_prefix: str,
        ubuntu_version: str = '22.04') -> Optional[str]:
    """Look up an image ID through an SSM parameter using the AWS CLI.

    Runs `aws ssm get-parameter` in `region` for the parameter named
    `{ssm_prefix}/ubuntu-{ubuntu_version}/latest/image_id` and queries its
    value as text.

    Returns:
        The CLI output decoded as UTF-8 and stripped of surrounding
        whitespace, or None when the CLI exits with a non-zero status (the
        failure is printed).
    """
    command = f"""\
        aws ssm get-parameter --region {region} --name "{ssm_prefix}/ubuntu-{ubuntu_version}/latest/image_id" \\
        --query 'Parameter.Value' --output text
        """
    try:
        raw_output = subprocess.check_output(command, shell=True)
    except subprocess.CalledProcessError as e:
        print(
            f'Failed to fetch image ID from SSM parameter for {region}, {ssm_prefix}, {ubuntu_version}'
        )
        print(f'{type(e)}: {e}')
        return None
    return raw_output.decode('utf-8').strip()
422
440
 
423
441
 
424
- def _get_image_row(region: str, gpu: str, description: str, ubuntu_version: str,
425
- date: str) -> Tuple[str, str, str, str, Optional[str], str]:
426
- print(f'Getting image for {region}, {description}, {ubuntu_version}, {gpu}')
427
- image_id = _fetch_image_id(region, description, ubuntu_version, date)
428
- if image_id is None:
429
- # not found
430
- print(f'Failed to find image for {region}, {description}, '
431
- f'{ubuntu_version}, {gpu}')
442
+ def _get_image_row(
443
+ region: str,
444
+ gpu: str,
445
+ ssm_prefix: str,
446
+ ubuntu_version: str = '22.04'
447
+ ) -> Tuple[str, str, str, str, Optional[str], Optional[str]]:
448
+ print(f'Getting image for {region}, {ssm_prefix}, {ubuntu_version}, {gpu}')
449
+ image_id = _fetch_image_id_from_ssm_param(region, ssm_prefix,
450
+ ubuntu_version)
451
+ if image_id is not None:
452
+ creation_date = _fetch_image_creation_date(region, image_id)
453
+ else:
454
+ creation_date = None
432
455
  tag = f'skypilot:{gpu}-ubuntu-{ubuntu_version.replace(".", "")}'
433
- return tag, region, 'ubuntu', ubuntu_version, image_id, date
456
+ return tag, region, 'ubuntu', ubuntu_version, image_id, creation_date
434
457
 
435
458
 
436
459
  def get_all_regions_images_df(regions: Set[str]) -> 'pd.DataFrame':
@@ -545,13 +568,26 @@ if __name__ == '__main__':
545
568
  instance_df.to_csv('aws/vms.csv', index=False)
546
569
  print('AWS Service Catalog saved to aws/vms.csv')
547
570
 
548
- # Disable refreshing images.csv as we are using skypilot custom AMIs
549
- # See sky/clouds/service_catalog/images/README.md for more details.
550
- # image_df = get_all_regions_images_df(user_regions)
551
- # _check_regions_integrity(image_df, 'images')
552
-
553
- # image_df.to_csv('aws/images.csv', index=False)
554
- # print('AWS Images saved to aws/images.csv')
571
+ # Do not refresh the images.csv entries for SkyPilot custom AMIs;
572
+ # refresh only the Neuron-based images.
573
+ # See sky/clouds/catalog/images/README.md for more details.
574
+ image_df = get_all_regions_images_df(user_regions)
575
+ _check_regions_integrity(image_df, 'images')
576
+ # filter out rows where ImageId is None
577
+ image_df = image_df[image_df['ImageId'].notna()]
578
+
579
+ # check if aws/images.csv exists
580
+ if os.path.exists('aws/images.csv'):
581
+ # load the data from aws/images.csv
582
+ existing_image_df = pd.read_csv('aws/images.csv')
583
+ # filter out the neuron based images
584
+ existing_image_df = existing_image_df[~existing_image_df['Tag'].
585
+ eq('skypilot:neuron-ubuntu-2204')]
586
+ # concat the new neuron based images with the existing images
587
+ image_df = pd.concat([existing_image_df, image_df])
588
+
589
+ image_df.to_csv('aws/images.csv', index=False)
590
+ print('AWS Images saved to aws/images.csv')
555
591
 
556
592
  if args.az_mappings:
557
593
  az_mappings_df = fetch_availability_zone_mappings()
@@ -9,18 +9,11 @@ import os
9
9
 
10
10
  import cudo_compute
11
11
 
12
- import sky.provision.cudo.cudo_utils as utils
12
+ from sky.provision.cudo import cudo_utils as utils
13
13
 
14
14
  VMS_CSV = 'cudo/vms.csv'
15
15
 
16
16
 
17
- def cudo_api():
18
- configuration = cudo_compute.Configuration()
19
- configuration.host = 'https://rest.compute.cudo.org'
20
- client = cudo_compute.ApiClient(configuration)
21
- return cudo_compute.VirtualMachinesApi(client)
22
-
23
-
24
17
  def get_gpu_info(count, model):
25
18
  mem = utils.cudo_gpu_mem[model]
26
19
  # pylint: disable=line-too-long
@@ -45,39 +38,46 @@ def get_instance_type(machine_type, vcpu, mem, gpu):
45
38
  mem) + 'gb'
46
39
 
47
40
 
48
- def machine_types(gpu_model, mem_gib, vcpu_count, gpu_count):
49
- try:
50
- api = cudo_api()
51
- types = api.list_vm_machine_types(mem_gib,
52
- vcpu_count,
53
- gpu=gpu_count,
54
- gpu_model=gpu_model)
55
- return types.to_dict()
56
- except cudo_compute.rest.ApiException as e:
57
- raise e
58
-
59
-
60
41
  def update_prices():
61
42
  rows = []
43
+
44
+ api = cudo_compute.cudo_api.virtual_machines()
45
+ all_types = api.list_vm_machine_types2()
46
+ all_machine_types = all_types.to_dict()['machine_types']
47
+
62
48
  for spec in utils.machine_specs:
63
- mts = machine_types('', spec['mem'], spec['vcpu'], spec['gpu'])
64
- for hc in mts['host_configs']:
65
- if not utils.gpu_exists(hc['gpu_model']):
66
- continue
67
- accelerator_name = utils.cudo_gpu_to_skypilot_gpu(hc['gpu_model'])
68
- row = {
69
- 'instance_type': get_instance_type(hc['machine_type'],
70
- spec['vcpu'], spec['mem'],
71
- spec['gpu']),
72
- 'accelerator_name': accelerator_name,
73
- 'accelerator_count': str(spec['gpu']) + '.0',
74
- 'vcpus': str(spec['vcpu']),
75
- 'memory_gib': str(spec['mem']),
76
- 'price': hc['total_price_hr']['value'],
77
- 'region': hc['data_center_id'],
78
- 'gpu_info': get_gpu_info(spec['gpu'], accelerator_name),
79
- }
80
- rows.append(row)
49
+ for machine_type in all_machine_types:
50
+ if (machine_type['min_vcpu'] <= spec['vcpu'] and
51
+ machine_type['min_memory_gib'] <= spec['mem'] and
52
+ utils.gpu_exists(machine_type['gpu_model'])):
53
+
54
+ accelerator_name = utils.cudo_gpu_to_skypilot_gpu(
55
+ machine_type['gpu_model'])
56
+
57
+ # Calculate total price per hour based on the given spec
58
+ vcpu_price = float(
59
+ machine_type['vcpu_price_hr']['value']) * spec['vcpu']
60
+ memory_price = float(
61
+ machine_type['memory_gib_price_hr']['value']) * spec['mem']
62
+ gpu_price = float(
63
+ machine_type['gpu_price_hr']['value']) * spec['gpu']
64
+ # Note: Not including storage and IPv4 prices
65
+ # for now as they may be optional
66
+ total_price = vcpu_price + memory_price + gpu_price
67
+
68
+ row = {
69
+ 'instance_type': get_instance_type(
70
+ machine_type['machine_type'], spec['vcpu'], spec['mem'],
71
+ spec['gpu']),
72
+ 'accelerator_name': accelerator_name,
73
+ 'accelerator_count': str(spec['gpu']) + '.0',
74
+ 'vcpus': str(spec['vcpu']),
75
+ 'memory_gib': str(spec['mem']),
76
+ 'price': str(total_price),
77
+ 'region': machine_type['data_center_id'],
78
+ 'gpu_info': get_gpu_info(spec['gpu'], accelerator_name),
79
+ }
80
+ rows.append(row)
81
81
  path = VMS_CSV
82
82
  with open(path, 'w', encoding='utf-8') as file:
83
83
  file.write(
@@ -179,9 +179,13 @@ TPU_V4_HOST_DF = pd.read_csv(
179
179
  # TODO(woosuk): Make this more robust.
180
180
  # Refer to: https://github.com/skypilot-org/skypilot/issues/1006
181
181
  # Unsupported Series: 'f1', 'm2'
182
- SERIES_TO_DISCRIPTION = {
182
+ SERIES_TO_DESCRIPTION = {
183
183
  'a2': 'A2 Instance',
184
184
  'a3': 'A3 Instance',
185
+ # NOTE: GCP does not provide separate CPU/RAM pricing for A4 instances.
186
+ # The B200 GPU pricing includes the full VM cost. See special handling in
187
+ # get_vm_price() which sets A4 VM price to 0.
188
+ 'a4': 'A4 Instance',
185
189
  'c2': 'Compute optimized',
186
190
  'c2d': 'C2D AMD Instance',
187
191
  'c3': 'C3 Instance',
@@ -195,9 +199,11 @@ SERIES_TO_DISCRIPTION = {
195
199
  'n1': 'N1 Predefined Instance',
196
200
  'n2': 'N2 Instance',
197
201
  'n2d': 'N2D AMD Instance',
202
+ 'n4': 'N4 Instance',
198
203
  't2a': 'T2A Arm Instance',
199
204
  't2d': 'T2D AMD Instance',
200
205
  }
206
+
201
207
  creds, project_id = google.auth.default()
202
208
  gcp_client = discovery.build('compute', 'v1')
203
209
  tpu_client = discovery.build('tpu', 'v1')
@@ -334,7 +340,7 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
334
340
 
335
341
  # Drop the unsupported series.
336
342
  df = df[df['InstanceType'].str.startswith(
337
- tuple(f'{series}-' for series in SERIES_TO_DISCRIPTION))]
343
+ tuple(f'{series}-' for series in SERIES_TO_DESCRIPTION))]
338
344
  df = df[~df['AvailabilityZone'].str.startswith(tuple(TPU_V4_ZONES))]
339
345
 
340
346
  # TODO(woosuk): Make this more efficient.
@@ -352,7 +358,7 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
352
358
 
353
359
  # Check if the SKU is for the correct series.
354
360
  description = sku['description']
355
- if SERIES_TO_DISCRIPTION[series].lower() not in description.lower():
361
+ if SERIES_TO_DESCRIPTION[series].lower() not in description.lower():
356
362
  continue
357
363
  # Special check for M1 instances.
358
364
  if series == 'm1' and 'M3' in description:
@@ -389,6 +395,15 @@ def get_vm_df(skus: List[Dict[str, Any]], region_prefix: str) -> 'pd.DataFrame':
389
395
  if series in ['f1', 'g1']:
390
396
  memory_price = 0.0
391
397
 
398
+ # Special case for A4 instances.
399
+ # GCP does not provide separate CPU/RAM pricing for A4 instances in the
400
+ # SKUs API. The GPU pricing (B200) includes the full VM cost.
401
+ # We set the VM price to 0 so the entry is not dropped, and the GPU
402
+ # pricing will provide the total cost.
403
+ if series == 'a4':
404
+ cpu_price = 0.0
405
+ memory_price = 0.0
406
+
392
407
  # TODO(tian): (2024/11/10) Some SKUs are missing in the SKUs API. We
393
408
  # skip them in the catalog for now. We should investigate why they are
394
409
  # missing and add them back.
@@ -434,10 +449,18 @@ def _get_gpus_for_zone(zone: str) -> 'pd.DataFrame':
434
449
  gpu_name = gpu_name.upper()
435
450
  if 'H100-80GB' in gpu_name:
436
451
  gpu_name = 'H100'
437
- if 'H100-MEGA-80GB' in gpu_name:
452
+
453
+ if 'H100-MEGA' in gpu_name:
438
454
  gpu_name = 'H100-MEGA'
439
455
  if count != 8:
440
- # H100-MEGA only has 8 cards.
456
+ continue
457
+ elif 'H200' in gpu_name:
458
+ gpu_name = 'H200'
459
+ if count != 8:
460
+ continue
461
+ elif 'B200' in gpu_name:
462
+ gpu_name = 'B200'
463
+ if count != 8:
441
464
  continue
442
465
  if 'VWS' in gpu_name:
443
466
  continue
@@ -468,6 +491,8 @@ def _gpu_info_from_name(name: str) -> Optional[Dict[str, List[Dict[str, Any]]]]:
468
491
  'A100': 40 * 1024,
469
492
  'H100': 80 * 1024,
470
493
  'H100-MEGA': 80 * 1024,
494
+ 'H200': 141 * 1024,
495
+ 'B200': 180 * 1024,
471
496
  'P4': 8 * 1024,
472
497
  'T4': 16 * 1024,
473
498
  'V100': 16 * 1024,
@@ -507,22 +532,47 @@ def get_gpu_df(skus: List[Dict[str, Any]],
507
532
  ondemand_or_spot = 'OnDemand' if not spot else 'Preemptible'
508
533
  gpu_price = None
509
534
  for sku in gpu_skus:
535
+ row_gpu_name = row['AcceleratorName']
510
536
  if row['Region'] not in sku['serviceRegions']:
511
537
  continue
512
- if sku['category']['usageType'] != ondemand_or_spot:
538
+
539
+ # Check usageType matches, with special handling for B200 spot.
540
+ # GCP has a bug where some B200 spot SKUs have usageType='OnDemand'
541
+ # but the description contains 'Spot Preemptible'.
542
+ usage_type = sku['category']['usageType']
543
+ description = sku['description']
544
+ is_spot_description = 'spot preemptible' in description.lower()
545
+
546
+ if usage_type != ondemand_or_spot:
547
+ # For B200 spot pricing, also accept SKUs where description
548
+ # says "Spot Preemptible" even if usageType is wrong.
549
+ if not (spot and row_gpu_name == 'B200' and
550
+ is_spot_description):
551
+ continue
552
+
553
+ # For B200 on-demand, skip SKUs that are actually spot (description
554
+ # says "Spot Preemptible" but usageType is incorrectly 'OnDemand').
555
+ if not spot and row_gpu_name == 'B200' and is_spot_description:
513
556
  continue
514
557
 
515
- gpu_names = [row['AcceleratorName']]
516
- if gpu_names[0] == 'A100-80GB':
517
- gpu_names = ['A100 80GB']
518
- if gpu_names[0] == 'H100':
519
- gpu_names = ['H100 80GB']
520
- if gpu_names[0] == 'H100-MEGA':
558
+ gpu_names = [f'{row_gpu_name} GPU']
559
+ if row_gpu_name == 'A100-80GB':
560
+ gpu_names = ['A100 80GB GPU']
561
+ elif row_gpu_name == 'H100':
562
+ gpu_names = ['H100 80GB GPU']
563
+ elif row_gpu_name == 'H100-MEGA':
521
564
  # Seems that H100-MEGA has two different descriptions in SKUs in
522
565
  # different regions: 'H100 80GB Mega' and 'H100 80GB Plus'.
523
- gpu_names = ['H100 80GB Mega', 'H100 80GB Plus']
524
- if not any(f'{gpu_name} GPU' in sku['description']
525
- for gpu_name in gpu_names):
566
+ gpu_names = [
567
+ 'H100 80GB Mega GPU', 'H100 Mega 80GB GPU',
568
+ 'H100 80GB Plus GPU'
569
+ ]
570
+ elif row_gpu_name == 'H200':
571
+ gpu_names = ['H200 141GB GPU']
572
+ elif row_gpu_name == 'B200':
573
+ gpu_names = ['Nvidia B200 (1 gpu slice)']
574
+ if not any(
575
+ gpu_name in sku['description'] for gpu_name in gpu_names):
526
576
  continue
527
577
 
528
578
  unit_price = _get_unit_price(sku)
@@ -554,7 +604,7 @@ def _get_tpu_response_for_zone(zone: str) -> list:
554
604
  # Sometimes the response is empty ({}) even for enabled zones. Here we
555
605
  # retry the request for a few times.
556
606
  backoff = common_utils.Backoff(initial_backoff=1)
557
- for _ in range(TPU_RETRY_CNT):
607
+ for retry_cnt in range(TPU_RETRY_CNT):
558
608
  tpus_request = (
559
609
  tpu_client.projects().locations().acceleratorTypes().list(
560
610
  parent=parent))
@@ -570,6 +620,10 @@ def _get_tpu_response_for_zone(zone: str) -> list:
570
620
  print(f' An error occurred: {error}')
571
621
  # If error happens, fail early.
572
622
  return []
623
+ except TimeoutError:
624
+ print(f' TimeoutError: Failed to fetch TPUs for zone {zone!r}, '
625
+ f'retry {retry_cnt + 1} of {TPU_RETRY_CNT}')
626
+
573
627
  time_to_sleep = backoff.current_backoff()
574
628
  print(f' Retry zone {zone!r} in {time_to_sleep} seconds...')
575
629
  time.sleep(time_to_sleep)