skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,136 @@
1
+ """Hyperbolic Cloud service catalog.
2
+
3
+ This module loads and queries the service catalog for Hyperbolic Cloud.
4
+ """
5
+ from typing import Dict, List, Optional, Tuple, Union
6
+
7
+ from sky.catalog import common
8
+ from sky.clouds import cloud # Import cloud here for Region
9
+ from sky.utils import ux_utils
10
+
11
+ # Initialize cloud variable at module level
12
+ CLOUD = 'hyperbolic'
13
+
14
+ _df = common.read_catalog('hyperbolic/vms.csv')
15
+
16
+
17
+ def instance_type_exists(instance_type: str) -> bool:
18
+ return common.instance_type_exists_impl(_df, instance_type)
19
+
20
+
21
+ def validate_region_zone(
22
+ region: Optional[str],
23
+ zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
24
+ if zone is not None:
25
+ with ux_utils.print_exception_no_traceback():
26
+ raise ValueError('Hyperbolic Cloud does not support zones.')
27
+ return common.validate_region_zone_impl('hyperbolic', _df, region, zone)
28
+
29
+
30
+ def get_hourly_cost(
31
+ instance_type: str,
32
+ use_spot: bool = False,
33
+ region: Optional[str] = None,
34
+ zone: Optional[str] = None,
35
+ ) -> float:
36
+ if zone is not None:
37
+ with ux_utils.print_exception_no_traceback():
38
+ raise ValueError('Hyperbolic Cloud does not support zones.')
39
+ return common.get_hourly_cost_impl(_df, instance_type, use_spot, region,
40
+ zone)
41
+
42
+
43
+ def get_vcpus_mem_from_instance_type(
44
+ instance_type: str,) -> Tuple[Optional[float], Optional[float]]:
45
+ return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
46
+
47
+
48
+ def get_accelerators_from_instance_type(
49
+ instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
50
+ return common.get_accelerators_from_instance_type_impl(_df, instance_type)
51
+
52
+
53
+ def get_vcpus_from_instance_type(instance_type: str) -> Optional[float]:
54
+ vcpus, _ = get_vcpus_mem_from_instance_type(instance_type)
55
+ return vcpus
56
+
57
+
58
+ def get_memory_from_instance_type(instance_type: str) -> Optional[float]:
59
+ _, mem = get_vcpus_mem_from_instance_type(instance_type)
60
+ return mem
61
+
62
+
63
+ def get_zone_shell_cmd() -> Optional[str]:
64
+ """Returns the shell command to obtain the zone."""
65
+ return None
66
+
67
+
68
+ def get_default_instance_type(cpus: Optional[str] = None,
69
+ memory: Optional[str] = None,
70
+ disk_tier: Optional[str] = None,
71
+ region: Optional[str] = None,
72
+ zone: Optional[str] = None) -> Optional[str]:
73
+ del disk_tier # Unused
74
+ return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
75
+ zone)
76
+
77
+
78
+ def get_instance_type_for_accelerator(
79
+ acc_name: str,
80
+ acc_count: int,
81
+ cpus: Optional[str] = None,
82
+ memory: Optional[str] = None,
83
+ use_spot: bool = False,
84
+ region: Optional[str] = None,
85
+ zone: Optional[str] = None,
86
+ ) -> Tuple[Optional[List[str]], List[str]]:
87
+ if zone is not None:
88
+ with ux_utils.print_exception_no_traceback():
89
+ raise ValueError('Hyperbolic Cloud does not support zones.')
90
+ return common.get_instance_type_for_accelerator_impl(df=_df,
91
+ acc_name=acc_name,
92
+ acc_count=acc_count,
93
+ cpus=cpus,
94
+ memory=memory,
95
+ use_spot=use_spot,
96
+ region=region,
97
+ zone=zone)
98
+
99
+
100
+ def get_region_zones_for_instance_type(instance_type: str,
101
+ use_spot: bool) -> List[cloud.Region]:
102
+ df = _df[_df['InstanceType'] == instance_type]
103
+ return common.get_region_zones(df, use_spot)
104
+
105
+
106
+ def get_gen_version(instance_type: str) -> Optional[str]:
107
+ """Returns the generation version of the instance type."""
108
+ del instance_type # Unused
109
+ # TODO: Implement generation version detection
110
+ return None
111
+
112
+
113
+ def list_accelerators(
114
+ gpus_only: bool = True,
115
+ name_filter: Optional[str] = None,
116
+ region_filter: Optional[str] = None,
117
+ quantity_filter: Optional[int] = None,
118
+ case_sensitive: bool = True,
119
+ all_regions: bool = False,
120
+ require_price: bool = True,
121
+ ) -> Dict[str, List[common.InstanceTypeInfo]]:
122
+ """Returns all instance types in Hyperbolic Cloud offering accelerators."""
123
+ del require_price # Unused
124
+ return common.list_accelerators_impl('Hyperbolic', _df, gpus_only,
125
+ name_filter, region_filter,
126
+ quantity_filter, case_sensitive,
127
+ all_regions)
128
+
129
+
130
+ def get_instance_type_from_catalog() -> dict:
131
+ # TODO: Implement this function
132
+ return {}
133
+
134
+
135
+ def regions() -> List[cloud.Region]:
136
+ return [cloud.Region('default')]
@@ -8,8 +8,8 @@ from typing import Dict, List, Optional, Tuple, Union
8
8
 
9
9
  from sky import sky_logging
10
10
  from sky.adaptors import ibm
11
+ from sky.catalog import common
11
12
  from sky.clouds import cloud
12
- from sky.clouds.service_catalog import common
13
13
  from sky.utils import resources_utils
14
14
 
15
15
  logger = sky_logging.init_logger(__name__)
@@ -92,10 +92,12 @@ def list_accelerators(
92
92
  case_sensitive, all_regions)
93
93
 
94
94
 
95
- def get_default_instance_type(
96
- cpus: Optional[str] = None,
97
- memory: Optional[str] = None,
98
- disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
95
+ def get_default_instance_type(cpus: Optional[str] = None,
96
+ memory: Optional[str] = None,
97
+ disk_tier: Optional[
98
+ resources_utils.DiskTier] = None,
99
+ region: Optional[str] = None,
100
+ zone: Optional[str] = None) -> Optional[str]:
99
101
  del disk_tier # unused
100
102
  if cpus is None and memory is None:
101
103
  cpus = f'{_DEFAULT_NUM_VCPUS}+'
@@ -107,7 +109,8 @@ def get_default_instance_type(
107
109
  instance_type_prefix = f'{_DEFAULT_INSTANCE_FAMILY}-'
108
110
  df = _df[_df['InstanceType'].str.startswith(instance_type_prefix)]
109
111
  return common.get_instance_type_for_cpus_mem_impl(df, cpus,
110
- memory_gb_or_ratio)
112
+ memory_gb_or_ratio,
113
+ region, zone)
111
114
 
112
115
 
113
116
  def is_image_tag_valid(tag: str, region: Optional[str]) -> bool:
@@ -3,6 +3,7 @@
3
3
  Kubernetes does not require a catalog of instances, but we need an image catalog
4
4
  mapping SkyPilot image tags to corresponding container image tags.
5
5
  """
6
+ import collections
6
7
  import re
7
8
  import typing
8
9
  from typing import Dict, List, Optional, Set, Tuple
@@ -12,9 +13,9 @@ from sky import clouds as sky_clouds
12
13
  from sky import sky_logging
13
14
  from sky.adaptors import common as adaptors_common
14
15
  from sky.adaptors import kubernetes
16
+ from sky.catalog import CloudFilter
17
+ from sky.catalog import common
15
18
  from sky.clouds import cloud
16
- from sky.clouds.service_catalog import CloudFilter
17
- from sky.clouds.service_catalog import common
18
19
  from sky.provision.kubernetes import utils as kubernetes_utils
19
20
 
20
21
  logger = sky_logging.init_logger(__name__)
@@ -167,12 +168,25 @@ def _list_accelerators(
167
168
  accelerators_qtys: Set[Tuple[str, int]] = set()
168
169
  keys = lf.get_label_keys()
169
170
  nodes = kubernetes_utils.get_kubernetes_nodes(context=context)
170
- pods = None
171
- if realtime:
172
- # Get the pods to get the real-time GPU usage
171
+
172
+ # Check if any nodes have accelerators before fetching pods
173
+ has_accelerator_nodes = False
174
+ for node in nodes:
175
+ for key in keys:
176
+ if key in node.metadata.labels:
177
+ has_accelerator_nodes = True
178
+ break
179
+ if has_accelerator_nodes:
180
+ break
181
+
182
+ # Only fetch pods if we have accelerator nodes and realtime is requested
183
+ allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
184
+ error_on_get_allocated_gpu_qty_by_node = False
185
+ if realtime and has_accelerator_nodes:
186
+ # Get the allocated GPU quantity by each node
173
187
  try:
174
- pods = kubernetes_utils.get_all_pods_in_kubernetes_cluster(
175
- context=context)
188
+ allocated_qty_by_node = (
189
+ kubernetes_utils.get_allocated_gpu_qty_by_node(context=context))
176
190
  except kubernetes.api_exception() as e:
177
191
  if e.status == 403:
178
192
  logger.warning(
@@ -180,6 +194,7 @@ def _list_accelerators(
180
194
  '(forbidden). Please check if your account has '
181
195
  'necessary permissions to list pods. Realtime GPU '
182
196
  'availability information may be incorrect.')
197
+ error_on_get_allocated_gpu_qty_by_node = True
183
198
  else:
184
199
  raise
185
200
  # Total number of GPUs in the cluster
@@ -191,10 +206,13 @@ def _list_accelerators(
191
206
  for node in nodes:
192
207
  for key in keys:
193
208
  if key in node.metadata.labels:
194
- allocated_qty = 0
195
209
  accelerator_name = lf.get_accelerator_from_label_value(
196
210
  node.metadata.labels.get(key))
197
211
 
212
+ # Heterogenous cluster may have some nodes with empty labels.
213
+ if not accelerator_name:
214
+ continue
215
+
198
216
  # Exclude multi-host TPUs from being processed.
199
217
  # TODO(Doyoung): Remove the logic when adding support for
200
218
  # multi-host TPUs.
@@ -210,9 +228,9 @@ def _list_accelerators(
210
228
  # Generate the accelerator quantities
211
229
  accelerator_count = (
212
230
  kubernetes_utils.get_node_accelerator_count(
213
- node.status.allocatable))
231
+ context, node.status.allocatable))
214
232
 
215
- if accelerator_name and accelerator_count > 0:
233
+ if accelerator_count > 0:
216
234
  # TPUs are counted in a different way compared to GPUs.
217
235
  # Multi-node GPUs can be split into smaller units and be
218
236
  # provisioned, but TPUs are considered as an atomic unit.
@@ -242,24 +260,18 @@ def _list_accelerators(
242
260
  total_accelerators_capacity[
243
261
  accelerator_name] += quantized_count
244
262
 
245
- if pods is None:
246
- # If we can't get the pods, we can't get the GPU usage
263
+ if error_on_get_allocated_gpu_qty_by_node:
264
+ # If we can't get the allocated GPU quantity by each node,
265
+ # we can't get the GPU usage.
247
266
  total_accelerators_available[accelerator_name] = -1
248
267
  continue
249
268
 
250
- for pod in pods:
251
- # Get all the pods running on the node
252
- if (pod.spec.node_name == node.metadata.name and
253
- pod.status.phase in ['Running', 'Pending']):
254
- # Iterate over all the containers in the pod and sum
255
- # the GPU requests
256
- for container in pod.spec.containers:
257
- if container.resources.requests:
258
- allocated_qty += (
259
- kubernetes_utils.get_node_accelerator_count(
260
- container.resources.requests))
261
-
269
+ allocated_qty = allocated_qty_by_node[node.metadata.name]
262
270
  accelerators_available = accelerator_count - allocated_qty
271
+ # Initialize the total_accelerators_available to make sure the
272
+ # key exists in the dictionary.
273
+ total_accelerators_available[accelerator_name] = (
274
+ total_accelerators_available.get(accelerator_name, 0))
263
275
 
264
276
  if accelerators_available >= min_quantity_filter:
265
277
  quantized_availability = min_quantity_filter * (
@@ -6,7 +6,7 @@ instance types and pricing information for Lambda.
6
6
  import typing
7
7
  from typing import Dict, List, Optional, Tuple, Union
8
8
 
9
- from sky.clouds.service_catalog import common
9
+ from sky.catalog import common
10
10
  from sky.utils import resources_utils
11
11
  from sky.utils import ux_utils
12
12
 
@@ -56,10 +56,12 @@ def get_vcpus_mem_from_instance_type(
56
56
  return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
57
57
 
58
58
 
59
- def get_default_instance_type(
60
- cpus: Optional[str] = None,
61
- memory: Optional[str] = None,
62
- disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
59
+ def get_default_instance_type(cpus: Optional[str] = None,
60
+ memory: Optional[str] = None,
61
+ disk_tier: Optional[
62
+ resources_utils.DiskTier] = None,
63
+ region: Optional[str] = None,
64
+ zone: Optional[str] = None) -> Optional[str]:
63
65
  del disk_tier # unused
64
66
  if cpus is None and memory is None:
65
67
  cpus = f'{_DEFAULT_NUM_VCPUS}+'
@@ -68,7 +70,8 @@ def get_default_instance_type(
68
70
  else:
69
71
  memory_gb_or_ratio = memory
70
72
  return common.get_instance_type_for_cpus_mem_impl(_df, cpus,
71
- memory_gb_or_ratio)
73
+ memory_gb_or_ratio,
74
+ region, zone)
72
75
 
73
76
 
74
77
  def get_accelerators_from_instance_type(
@@ -6,7 +6,7 @@ instance types and pricing information for Nebius.
6
6
  import typing
7
7
  from typing import Dict, List, Optional, Tuple, Union
8
8
 
9
- from sky.clouds.service_catalog import common
9
+ from sky.catalog import common
10
10
  from sky.utils import resources_utils
11
11
  from sky.utils import ux_utils
12
12
 
@@ -38,7 +38,6 @@ def get_hourly_cost(instance_type: str,
38
38
  region: Optional[str] = None,
39
39
  zone: Optional[str] = None) -> float:
40
40
  """Returns the cost, or the cheapest cost among all zones for spot."""
41
- assert not use_spot, 'Nebius does not support spot.'
42
41
  if zone is not None:
43
42
  with ux_utils.print_exception_no_traceback():
44
43
  raise ValueError('Nebius does not support zones.')
@@ -51,12 +50,15 @@ def get_vcpus_mem_from_instance_type(
51
50
  return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
52
51
 
53
52
 
54
- def get_default_instance_type(
55
- cpus: Optional[str] = None,
56
- memory: Optional[str] = None,
57
- disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
53
+ def get_default_instance_type(cpus: Optional[str] = None,
54
+ memory: Optional[str] = None,
55
+ disk_tier: Optional[
56
+ resources_utils.DiskTier] = None,
57
+ region: Optional[str] = None,
58
+ zone: Optional[str] = None) -> Optional[str]:
58
59
  del disk_tier # unused
59
- return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory)
60
+ return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
61
+ zone)
60
62
 
61
63
 
62
64
  def get_accelerators_from_instance_type(
@@ -17,8 +17,8 @@ import typing
17
17
  from typing import Dict, List, Optional, Tuple, Union
18
18
 
19
19
  from sky.adaptors import oci as oci_adaptor
20
+ from sky.catalog import common
20
21
  from sky.clouds import OCI
21
- from sky.clouds.service_catalog import common
22
22
  from sky.clouds.utils import oci_utils
23
23
  from sky.utils import resources_utils
24
24
 
@@ -101,10 +101,12 @@ def get_hourly_cost(instance_type: str,
101
101
  region, zone)
102
102
 
103
103
 
104
- def get_default_instance_type(
105
- cpus: Optional[str] = None,
106
- memory: Optional[str] = None,
107
- disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
104
+ def get_default_instance_type(cpus: Optional[str] = None,
105
+ memory: Optional[str] = None,
106
+ disk_tier: Optional[
107
+ resources_utils.DiskTier] = None,
108
+ region: Optional[str] = None,
109
+ zone: Optional[str] = None) -> Optional[str]:
108
110
  if cpus is None:
109
111
  cpus = f'{oci_utils.oci_config.DEFAULT_NUM_VCPUS}+'
110
112
 
@@ -127,7 +129,8 @@ def get_default_instance_type(
127
129
 
128
130
  logger.debug(f'# get_default_instance_type: {df}')
129
131
  return common.get_instance_type_for_cpus_mem_impl(df, cpus,
130
- memory_gb_or_ratio)
132
+ memory_gb_or_ratio,
133
+ region, zone)
131
134
 
132
135
 
133
136
  def get_accelerators_from_instance_type(
@@ -7,7 +7,7 @@ query instance types and pricing information for Paperspace.
7
7
  import typing
8
8
  from typing import Dict, List, Optional, Tuple, Union
9
9
 
10
- from sky.clouds.service_catalog import common
10
+ from sky.catalog import common
11
11
  from sky.utils import ux_utils
12
12
 
13
13
  if typing.TYPE_CHECKING:
@@ -52,11 +52,14 @@ def get_default_instance_type(
52
52
  cpus: Optional[str] = None,
53
53
  memory: Optional[str] = None,
54
54
  disk_tier: Optional[str] = None,
55
+ region: Optional[str] = None,
56
+ zone: Optional[str] = None,
55
57
  ) -> Optional[str]:
56
58
  # NOTE: After expanding catalog to multiple entries, you may
57
59
  # want to specify a default instance type or family.
58
60
  del disk_tier # unused
59
- return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory)
61
+ return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
62
+ zone)
60
63
 
61
64
 
62
65
  def get_accelerators_from_instance_type(
@@ -0,0 +1,95 @@
1
+ """PrimeIntellect service catalog.
2
+
3
+ This module loads the service catalog file and can be used to
4
+ query instance types and pricing information for PrimeIntellect.
5
+ """
6
+
7
+ import typing
8
+ from typing import Dict, List, Optional, Tuple, Union
9
+
10
+ from sky.catalog import common
11
+
12
+ if typing.TYPE_CHECKING:
13
+ from sky.clouds import cloud
14
+
15
+ _df = common.read_catalog('primeintellect/vms.csv')
16
+
17
+
18
def instance_type_exists(instance_type: str) -> bool:
    """Reports whether `instance_type` is found in the catalog.

    The check itself is performed by `common.instance_type_exists_impl`.
    """
    found = common.instance_type_exists_impl(_df, instance_type)
    return found
20
+
21
+
22
def validate_region_zone(
        region: Optional[str],
        zone: Optional[str]) -> Tuple[Optional[str], Optional[str]]:
    """Validates a (region, zone) pair against the PrimeIntellect catalog.

    Delegates to `common.validate_region_zone_impl`, identifying this cloud
    as 'primeintellect', and returns that helper's (region, zone) result.
    """
    validated = common.validate_region_zone_impl('primeintellect', _df,
                                                 region, zone)
    return validated
26
+
27
+
28
def get_hourly_cost(instance_type: str,
                    use_spot: bool = False,
                    region: Optional[str] = None,
                    zone: Optional[str] = None) -> float:
    """Returns the hourly cost of `instance_type` from the catalog.

    For spot requests, the cheapest cost among all zones is returned. The
    lookup is carried out by `common.get_hourly_cost_impl`.
    """
    hourly_cost = common.get_hourly_cost_impl(_df, instance_type, use_spot,
                                              region, zone)
    return hourly_cost
35
+
36
+
37
def get_vcpus_mem_from_instance_type(
        instance_type: str) -> Tuple[Optional[float], Optional[float]]:
    """Returns the (vCPUs, memory) pair recorded for `instance_type`.

    Either element may be None, per the return annotation. The lookup is
    done by `common.get_vcpus_mem_from_instance_type_impl`.
    """
    vcpus_and_mem = common.get_vcpus_mem_from_instance_type_impl(
        _df, instance_type)
    return vcpus_and_mem
40
+
41
+
42
def get_default_instance_type(cpus: Optional[str] = None,
                              memory: Optional[str] = None,
                              disk_tier: Optional[str] = None,
                              region: Optional[str] = None,
                              zone: Optional[str] = None) -> Optional[str]:
    """Picks an instance type satisfying the CPU/memory request.

    `disk_tier` is accepted for interface compatibility and ignored. The
    remaining arguments go to `common.get_instance_type_for_cpus_mem_impl`
    along with the full catalog.

    Returns:
        The helper's result: an instance type name, or None.
    """
    del disk_tier  # no disk tiers
    requirements = (cpus, memory, region, zone)
    return common.get_instance_type_for_cpus_mem_impl(_df, *requirements)
50
+
51
+
52
def get_accelerators_from_instance_type(
        instance_type: str) -> Optional[Dict[str, Union[int, float]]]:
    """Returns the accelerators recorded for `instance_type`, if any.

    The result comes straight from
    `common.get_accelerators_from_instance_type_impl`; per the annotation
    it is either None or a mapping from str to a numeric value.
    """
    accelerators = common.get_accelerators_from_instance_type_impl(
        _df, instance_type)
    return accelerators
55
+
56
+
57
def get_instance_type_for_accelerator(
        acc_name: str,
        acc_count: int,
        cpus: Optional[str] = None,
        memory: Optional[str] = None,
        use_spot: bool = False,
        region: Optional[str] = None,
        zone: Optional[str] = None) -> Tuple[Optional[List[str]], List[str]]:
    """Returns a list of instance types that have the given accelerator.

    All arguments are forwarded by keyword, unchanged, to
    `common.get_instance_type_for_accelerator_impl` together with this
    module's catalog table; that helper's result is returned as-is.
    """
    lookup_kwargs = {
        'df': _df,
        'acc_name': acc_name,
        'acc_count': acc_count,
        'cpus': cpus,
        'memory': memory,
        'use_spot': use_spot,
        'region': region,
        'zone': zone,
    }
    return common.get_instance_type_for_accelerator_impl(**lookup_kwargs)
74
+
75
+
76
def get_region_zones_for_instance_type(instance_type: str,
                                       use_spot: bool) -> List['cloud.Region']:
    """Returns the regions in which `instance_type` appears in the catalog.

    Only catalog rows whose 'InstanceType' equals `instance_type` are
    handed to `common.get_region_zones`, which builds the region list.
    """
    matches_instance_type = _df['InstanceType'] == instance_type
    return common.get_region_zones(_df[matches_instance_type], use_spot)
80
+
81
+
82
def list_accelerators(
        gpus_only: bool,
        name_filter: Optional[str],
        region_filter: Optional[str],
        quantity_filter: Optional[int],
        case_sensitive: bool = True,
        all_regions: bool = False,
        require_price: bool = True) -> Dict[str, List[common.InstanceTypeInfo]]:
    """Returns all instance types in Prime Intellect offering GPUs.

    `require_price` is accepted for interface compatibility but is not
    used. The other arguments are passed positionally, in declaration
    order, to `common.list_accelerators_impl`.
    """
    del require_price
    filters = (
        gpus_only,
        name_filter,
        region_filter,
        quantity_filter,
        case_sensitive,
        all_regions,
    )
    return common.list_accelerators_impl('PrimeIntellect', _df, *filters)
@@ -7,12 +7,16 @@ query instance types and pricing information for RunPod.
7
7
  import typing
8
8
  from typing import Dict, List, Optional, Tuple, Union
9
9
 
10
- from sky.clouds.service_catalog import common
10
+ from sky.catalog import common
11
11
 
12
12
  if typing.TYPE_CHECKING:
13
13
  from sky.clouds import cloud
14
14
 
15
- _df = common.read_catalog('runpod/vms.csv')
15
+ # RunPod has no set update schedule for its catalog. We pull the catalog
16
+ # every 7 hours to make sure we have the latest information.
17
+ _PULL_FREQUENCY_HOURS = 7
18
+ _df = common.read_catalog('runpod/vms.csv',
19
+ pull_frequency_hours=_PULL_FREQUENCY_HOURS)
16
20
 
17
21
 
18
22
  def instance_type_exists(instance_type: str) -> bool:
@@ -41,11 +45,14 @@ def get_vcpus_mem_from_instance_type(
41
45
 
42
46
  def get_default_instance_type(cpus: Optional[str] = None,
43
47
  memory: Optional[str] = None,
44
- disk_tier: Optional[str] = None) -> Optional[str]:
48
+ disk_tier: Optional[str] = None,
49
+ region: Optional[str] = None,
50
+ zone: Optional[str] = None) -> Optional[str]:
45
51
  del disk_tier # RunPod does not support disk tiers.
46
52
  # NOTE: After expanding catalog to multiple entries, you may
47
53
  # want to specify a default instance type or family.
48
- return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory)
54
+ return common.get_instance_type_for_cpus_mem_impl(_df, cpus, memory, region,
55
+ zone)
49
56
 
50
57
 
51
58
  def get_accelerators_from_instance_type(
@@ -7,7 +7,7 @@ instance types and pricing information for SCP.
7
7
  import typing
8
8
  from typing import Dict, List, Optional, Tuple, Union
9
9
 
10
- from sky.clouds.service_catalog import common
10
+ from sky.catalog import common
11
11
  from sky.utils import resources_utils
12
12
  from sky.utils import ux_utils
13
13
 
@@ -51,10 +51,12 @@ def get_vcpus_mem_from_instance_type(
51
51
  return common.get_vcpus_mem_from_instance_type_impl(_df, instance_type)
52
52
 
53
53
 
54
- def get_default_instance_type(
55
- cpus: Optional[str] = None,
56
- memory: Optional[str] = None,
57
- disk_tier: Optional[resources_utils.DiskTier] = None) -> Optional[str]:
54
+ def get_default_instance_type(cpus: Optional[str] = None,
55
+ memory: Optional[str] = None,
56
+ disk_tier: Optional[
57
+ resources_utils.DiskTier] = None,
58
+ region: Optional[str] = None,
59
+ zone: Optional[str] = None) -> Optional[str]:
58
60
  del disk_tier # unused
59
61
  if cpus is None and memory is None:
60
62
  cpus = str(_DEFAULT_NUM_VCPUS)
@@ -63,7 +65,8 @@ def get_default_instance_type(
63
65
  else:
64
66
  memory_gb_or_ratio = memory
65
67
  return common.get_instance_type_for_cpus_mem_impl(_df, cpus,
66
- memory_gb_or_ratio)
68
+ memory_gb_or_ratio,
69
+ region, zone)
67
70
 
68
71
 
69
72
  def get_accelerators_from_instance_type(