skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,343 @@
1
+ """Kubernetes pvc provisioning."""
2
+ from typing import Any, Dict, List, Optional, Set, Tuple
3
+
4
+ from sky import global_user_state
5
+ from sky import models
6
+ from sky import sky_logging
7
+ from sky.adaptors import kubernetes
8
+ from sky.provision import constants
9
+ from sky.provision.kubernetes import config as config_lib
10
+ from sky.provision.kubernetes import constants as k8s_constants
11
+ from sky.provision.kubernetes import utils as kubernetes_utils
12
+ from sky.utils import resources_utils
13
+ from sky.utils import volume as volume_lib
14
+
15
+ logger = sky_logging.init_logger(__name__)
16
+
17
+
18
def _get_context_namespace(config: models.VolumeConfig) -> Tuple[str, str]:
    """Resolves the Kubernetes context and namespace of a volume.

    Values missing from ``config`` are looked up and written back to it:
    ``config.region`` holds the context and ``config.config['namespace']``
    holds the namespace.

    Args:
        config: Volume config to read from (and fill in).

    Returns:
        A ``(context, namespace)`` tuple.
    """
    context = config.region
    if context is None:
        # No region given: fall back to the current kubeconfig context.
        context = kubernetes_utils.get_current_kube_config_context_name()
        config.region = context

    namespace = config.config.get('namespace')
    if namespace is None:
        # No namespace given: use the one configured for the context.
        namespace = kubernetes_utils.get_kube_config_context_namespace(context)
        config.config['namespace'] = namespace

    return context, namespace
30
+
31
+
32
def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
                            pod_spec: Dict[str, Any]) -> None:
    """Verifies that exclusive-access PVCs of a pod spec are not in use.

    Each PVC referenced under ``pod_spec['spec']['volumes']`` is read from
    the cluster. If its first access mode is ReadWriteOnce or
    ReadWriteOncePod and some pod already uses it, an error is raised.

    Args:
        context: Kubernetes context.
        namespace: Kubernetes namespace holding the PVCs.
        pod_spec: Pod manifest as a dict.

    Raises:
        config_lib.KubernetesError: If such a PVC is already used by pods.
    """
    pod_volumes = pod_spec.get('spec', {}).get('volumes', [])
    if not pod_volumes:
        return

    exclusive_modes = (
        volume_lib.VolumeAccessMode.READ_WRITE_ONCE.value,
        volume_lib.VolumeAccessMode.READ_WRITE_ONCE_POD.value,
    )
    # Lazily extract the claim name of each volume; volumes that are not
    # backed by a PVC yield None and are skipped below.
    claim_names = (entry.get('persistentVolumeClaim', {}).get('claimName')
                   for entry in pod_volumes)
    for pvc_name in claim_names:
        if not pvc_name:
            continue
        pvc = kubernetes.core_api(
            context).read_namespaced_persistent_volume_claim(
                name=pvc_name, namespace=namespace)
        access_mode = pvc.spec.access_modes[0]
        if access_mode in exclusive_modes:
            usedby_pods, _ = _get_volume_usedby(context, namespace, pvc_name)
            if usedby_pods:
                raise config_lib.KubernetesError(
                    f'Volume {pvc_name} with access '
                    f'mode {access_mode} is already '
                    f'in use by Pods {usedby_pods}.')
57
+
58
+
59
def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
    """Creates or registers a volume.

    Builds the PVC spec from ``config``, verifies that the requested storage
    class (if any) can be read, and then creates the PVC or registers the
    existing one.

    Args:
        config: Volume config. It may be updated in place by the helpers
            called here.

    Returns:
        The same ``config`` object.

    Raises:
        config_lib.KubernetesError: If reading the storage class fails.
    """
    context, namespace = _get_context_namespace(config)
    pvc_spec = _get_pvc_spec(namespace, config)
    # Check if the storage class exists
    storage_class_name = pvc_spec['spec'].get('storageClassName')
    if storage_class_name is not None:
        try:
            kubernetes.storage_api(context).read_storage_class(
                name=storage_class_name)
        except kubernetes.api_exception() as e:
            # Chain the API error so the root cause stays in the traceback.
            raise config_lib.KubernetesError(
                f'Check storage class {storage_class_name} error: {e}') from e
    create_persistent_volume_claim(namespace, context, pvc_spec, config)
    return config
74
+
75
+
76
def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
    """Deletes the PVC backing a volume.

    Args:
        config: Volume config; ``name_on_cloud`` is the PVC name.

    Returns:
        The same ``config`` object.
    """
    context, namespace = _get_context_namespace(config)
    pvc_name = config.name_on_cloud

    def _delete_pvc(pvc_name=pvc_name):
        # The PVC name is bound as a default so the callable can be invoked
        # without arguments by the retry helper.
        return kubernetes.core_api(
            context).delete_namespaced_persistent_volume_claim(
                name=pvc_name,
                namespace=namespace,
                _request_timeout=config_lib.DELETION_TIMEOUT)

    kubernetes_utils.delete_k8s_resource_with_retry(delete_func=_delete_pvc,
                                                    resource_type='pvc',
                                                    resource_name=pvc_name)
    logger.info(f'Deleted PVC {pvc_name} in namespace {namespace}')
    return config
90
+
91
+
92
def _get_volume_usedby(
    context: Optional[str],
    namespace: str,
    pvc_name: str,
) -> Tuple[List[str], List[str]]:
    """Gets the usedby resources of a volume.

    This function returns the pods and clusters that are using the volume.
    The usedby_pods is accurate, which also includes the Pods that are not
    managed by SkyPilot.

    Args:
        context: Kubernetes context
        namespace: Kubernetes namespace
        pvc_name: PVC name

    Returns:
        usedby_pods: List of pods using the volume. These may include pods
          not created by SkyPilot.
        usedby_clusters: List of clusters using the volume.
    """
    usedby_pods: List[str] = []
    usedby_clusters: List[str] = []
    # Skip pods whose phase is listed in PVC_NOT_HOLD_POD_PHASES.
    field_selector = ','.join([
        f'status.phase!={phase}'
        for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES
    ])
    cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
    # Get all pods in the namespace
    pods = kubernetes.core_api(context).list_namespaced_pod(
        namespace=namespace, field_selector=field_selector)
    for pod in pods.items:
        if pod.spec.volumes is None:
            continue
        # Pods that are not managed by SkyPilot may carry no labels at all,
        # in which case the client reports `labels` as None.
        labels = pod.metadata.labels or {}
        for volume in pod.spec.volumes:
            if volume.persistent_volume_claim is None:
                continue
            if volume.persistent_volume_claim.claim_name == pvc_name:
                usedby_pods.append(pod.metadata.name)
                # Get the real cluster name
                cluster_name_on_cloud = labels.get(
                    constants.TAG_SKYPILOT_CLUSTER_NAME)
                if cluster_name_on_cloud is None:
                    continue
                cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
                if cluster_name is not None:
                    usedby_clusters.append(cluster_name)
    if usedby_pods:
        logger.debug(f'Volume {pvc_name} is used by Pods {usedby_pods}'
                     f' and clusters {usedby_clusters}')
    return usedby_pods, usedby_clusters
143
+
144
+
145
def _get_cluster_name_on_cloud_to_cluster_name_map() -> Dict[str, str]:
    """Builds a lookup from a cluster's name on the cloud to its name.

    Cluster records whose ``handle`` is None are left out.
    """
    return {
        record['handle'].cluster_name_on_cloud: record['name']
        for record in global_user_state.get_clusters()
        if record['handle'] is not None
    }
155
+
156
+
157
def get_volume_usedby(
    config: models.VolumeConfig,) -> Tuple[List[str], List[str]]:
    """Returns the pods and clusters that use the given volume.

    Args:
        config: Volume config; ``name_on_cloud`` is the PVC name.

    Returns:
        A ``(usedby_pods, usedby_clusters)`` tuple.
    """
    context, namespace = _get_context_namespace(config)
    return _get_volume_usedby(context, namespace, config.name_on_cloud)
163
+
164
+
165
def get_all_volumes_usedby(
    configs: List[models.VolumeConfig],
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Gets the usedby resources of all volumes.

    Pods are listed once per (context, namespace) pair that appears in
    ``configs``. Only pods labeled ``parent=skypilot`` are considered, and
    pods whose phase is in PVC_NOT_HOLD_POD_PHASES are excluded.

    Args:
        configs: Volume configs to collect usage for.

    Returns:
        used_by_pods: ``context -> namespace -> PVC name -> pod names``.
        used_by_clusters: ``context -> namespace -> PVC name -> cluster
          names``. Both mappings are keyed by PVC name so that
          ``map_all_volumes_usedby`` can look them up the same way.
    """
    field_selector = ','.join([
        f'status.phase!={phase}'
        for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES
    ])
    label_selector = 'parent=skypilot'
    context_to_namespaces: Dict[str, Set[str]] = {}
    pvc_names = set()
    for config in configs:
        context, namespace = _get_context_namespace(config)
        context_to_namespaces.setdefault(context, set()).add(namespace)
        pvc_names.add(config.name_on_cloud)
    cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
    used_by_pods: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
    used_by_clusters: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
    for context, namespaces in context_to_namespaces.items():
        used_by_pods[context] = {}
        used_by_clusters[context] = {}
        for namespace in namespaces:
            pods_by_volume: Dict[str, List[str]] = {}
            clusters_by_volume: Dict[str, List[str]] = {}
            used_by_pods[context][namespace] = pods_by_volume
            used_by_clusters[context][namespace] = clusters_by_volume
            # Get all pods in the namespace
            pods = kubernetes.core_api(context).list_namespaced_pod(
                namespace=namespace,
                field_selector=field_selector,
                label_selector=label_selector)
            for pod in pods.items:
                if pod.spec.volumes is None:
                    continue
                for volume in pod.spec.volumes:
                    if volume.persistent_volume_claim is None:
                        continue
                    volume_name = volume.persistent_volume_claim.claim_name
                    if volume_name not in pvc_names:
                        continue
                    pods_by_volume.setdefault(volume_name,
                                              []).append(pod.metadata.name)
                    cluster_name_on_cloud = pod.metadata.labels.get(
                        constants.TAG_SKYPILOT_CLUSTER_NAME)
                    if cluster_name_on_cloud is None:
                        continue
                    cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
                    if cluster_name is None:
                        continue
                    # Key by the volume (PVC) name, not the cluster name:
                    # lookups are done per volume.
                    clusters_by_volume.setdefault(volume_name,
                                                  []).append(cluster_name)
    return used_by_pods, used_by_clusters
221
+
222
+
223
def map_all_volumes_usedby(
        used_by_pods: Dict[str, Any], used_by_clusters: Dict[str, Any],
        config: models.VolumeConfig) -> Tuple[List[str], List[str]]:
    """Picks the usage entries of one volume out of the bulk usage maps.

    Args:
        used_by_pods: Nested mapping indexed by context, namespace, PVC name.
        used_by_clusters: Nested mapping indexed the same way.
        config: Volume config identifying the context, namespace and PVC.

    Returns:
        A ``(usedby_pods, usedby_clusters)`` tuple; a list is empty when the
        corresponding mapping has no entry for the volume.
    """
    context, namespace = _get_context_namespace(config)
    pvc_name = config.name_on_cloud

    def _lookup(usage: Dict[str, Any]) -> List[str]:
        # Walk context -> namespace -> PVC name, tolerating missing levels.
        return usage.get(context, {}).get(namespace, {}).get(pvc_name, [])

    return _lookup(used_by_pods), _lookup(used_by_clusters)
233
+
234
+
235
def _populate_config_from_pvc(config: models.VolumeConfig,
                              pvc_obj: Any) -> None:
    """Fills in fields of ``config`` from a PVC object.

    The storage class name is copied only when the config does not have one.
    The size is taken from the PVC whenever it can be parsed, replacing a
    differing config size (a warning is logged in that case).

    Args:
        config: VolumeConfig to populate.
        pvc_obj: V1PersistentVolumeClaim object from the kubernetes client,
            or None (in which case nothing happens).
    """
    if pvc_obj is None:
        return
    pvc_name = pvc_obj.metadata.name

    # Storage class: only fill it in when the config leaves it unset.
    if config.config.get('storage_class_name') is None:
        existing_class = getattr(pvc_obj.spec, 'storage_class_name', None)
        if existing_class:
            config.config['storage_class_name'] = existing_class

    # Size: prefer the bound capacity (status.capacity) ...
    raw_size = None
    pvc_status = getattr(pvc_obj, 'status', None)
    bound_capacity = getattr(pvc_status, 'capacity', None)
    if isinstance(bound_capacity, dict) and 'storage' in bound_capacity:
        raw_size = bound_capacity['storage']
    # ... and fall back to the requested size (spec.resources.requests).
    if raw_size is None:
        pvc_resources = getattr(pvc_obj.spec, 'resources', None)
        requested = getattr(pvc_resources, 'requests', None)
        if isinstance(requested, dict):
            raw_size = requested.get('storage')
    if not raw_size:
        return

    try:
        # Normalize to GB string (e.g., '20')
        parsed_size = resources_utils.parse_memory_resource(
            raw_size, 'size', allow_rounding=True)
    except ValueError as e:
        # Not critical: log and leave the config size untouched.
        logger.warning(f'Failed to parse PVC size {raw_size!r} '
                       f'for PVC {pvc_name}: {e}')
        return
    if parsed_size is None:
        return

    if config.size is not None and config.size != parsed_size:
        logger.warning(f'PVC {pvc_name} has size {parsed_size} but config '
                       f'size is {config.size}, overriding the config size'
                       f' with the PVC size.')
    config.size = parsed_size
282
+
283
+
284
def create_persistent_volume_claim(
    namespace: str,
    context: Optional[str],
    pvc_spec: Dict[str, Any],
    config: Optional[models.VolumeConfig] = None,
) -> None:
    """Creates a persistent volume claim unless it already exists.

    If a PVC with the name in ``pvc_spec`` exists it is reused. Otherwise it
    is created from ``pvc_spec``. In both cases, when ``config`` is given,
    it is populated from the PVC object.

    Args:
        namespace: Kubernetes namespace of the PVC.
        context: Kubernetes context.
        pvc_spec: PVC manifest; ``pvc_spec['metadata']['name']`` is the name.
        config: Optional volume config to populate from the PVC.

    Raises:
        ValueError: If the PVC does not exist while ``use_existing`` is set
            in ``config.config``.
    """
    pvc_name = pvc_spec['metadata']['name']
    try:
        existing_pvc = kubernetes.core_api(
            context).read_namespaced_persistent_volume_claim(
                name=pvc_name, namespace=namespace)
    except kubernetes.api_exception() as e:
        if e.status != 404:  # Anything but "not found" is unexpected.
            raise
        must_exist = config is not None and config.config.get('use_existing')
        if must_exist:
            raise ValueError(
                f'PVC {pvc_name} does not exist while use_existing is True.')
    else:
        if config is not None:
            _populate_config_from_pvc(config, existing_pvc)
        logger.debug(f'PVC {pvc_name} already exists')
        return

    # Only reached when the read above reported 404.
    created_pvc = kubernetes.core_api(
        context).create_namespaced_persistent_volume_claim(namespace=namespace,
                                                           body=pvc_spec)
    logger.info(f'Created PVC {pvc_name} in namespace {namespace}')
    if config is not None:
        _populate_config_from_pvc(config, created_pvc)
313
+
314
+
315
def _get_pvc_spec(namespace: str,
                  config: models.VolumeConfig) -> Dict[str, Any]:
    """Builds the PVC manifest dict for a volume config.

    Args:
        namespace: Namespace written into the manifest metadata.
        config: Volume config providing name, labels, access mode, size and
            storage class.

    Returns:
        A dict with ``metadata`` and ``spec`` entries. ``resources`` and
        ``storageClassName`` are only present when the config provides a
        size and a storage class name, respectively.
    """
    access_mode = config.config.get('access_mode')
    # The previous code assumes that the access_mode and size are always set.
    assert access_mode is not None, (
        f'access_mode is None for volume {config.name_on_cloud}')

    # Default labels first; user-provided labels may override them.
    labels = {
        'parent': 'skypilot',
        'skypilot-name': config.name,
    }
    if config.labels:
        labels.update(config.labels)

    spec: Dict[str, Any] = {'accessModes': [access_mode]}
    if config.size is not None:
        spec['resources'] = {'requests': {'storage': f'{config.size}Gi'}}
    storage_class = config.config.get('storage_class_name')
    if storage_class is not None:
        spec['storageClassName'] = storage_class

    return {
        'metadata': {
            'name': config.name_on_cloud,
            'namespace': namespace,
            'labels': labels,
        },
        'spec': spec,
    }
@@ -1,7 +1,7 @@
1
1
  """Lambda Cloud instance provisioning."""
2
2
 
3
3
  import time
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
6
  from sky import sky_logging
7
7
  from sky.provision import common
@@ -68,9 +68,10 @@ def _get_private_ip(instance_info: Dict[str, Any], single_node: bool) -> str:
68
68
  return private_ip
69
69
 
70
70
 
71
- def run_instances(region: str, cluster_name_on_cloud: str,
71
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
72
72
  config: common.ProvisionConfig) -> common.ProvisionRecord:
73
73
  """Runs instances for the given cluster"""
74
+ del cluster_name # unused
74
75
  lambda_client = _get_lambda_client()
75
76
  pending_status = ['booting']
76
77
  while True:
@@ -106,34 +107,35 @@ def run_instances(region: str, cluster_name_on_cloud: str,
106
107
  created_instance_ids = []
107
108
  remote_ssh_key_name = config.authentication_config['remote_key_name']
108
109
 
109
- def launch_nodes(node_type: str, quantity: int) -> List[str]:
110
+ def launch_node(node_type: str) -> str:
110
111
  try:
111
112
  instance_ids = lambda_client.create_instances(
112
113
  instance_type=config.node_config['InstanceType'],
113
114
  region=region,
114
115
  name=f'{cluster_name_on_cloud}-{node_type}',
115
- quantity=quantity,
116
+ # Quantity cannot actually be greater than 1; see:
117
+ # https://github.com/skypilot-org/skypilot/issues/7084
118
+ quantity=1,
116
119
  ssh_key_name=remote_ssh_key_name,
117
120
  )
118
- logger.info(f'Launched {len(instance_ids)} {node_type} node(s), '
119
- f'instance_ids: {instance_ids}')
120
- return instance_ids
121
+ logger.info(f'Launched {node_type} node, '
122
+ f'instance_id: {instance_ids[0]}')
123
+ return instance_ids[0]
121
124
  except Exception as e:
122
125
  logger.warning(f'run_instances error: {e}')
123
126
  raise
124
127
 
125
128
  if head_instance_id is None:
126
- instance_ids = launch_nodes('head', 1)
127
- assert len(instance_ids) == 1
128
- created_instance_ids.append(instance_ids[0])
129
- head_instance_id = instance_ids[0]
129
+ head_instance_id = launch_node('head')
130
+ created_instance_ids.append(head_instance_id)
130
131
 
131
132
  assert head_instance_id is not None, 'head_instance_id should not be None'
132
133
 
133
134
  worker_node_count = to_start_count - 1
134
135
  if worker_node_count > 0:
135
- instance_ids = launch_nodes('worker', worker_node_count)
136
- created_instance_ids.extend(instance_ids)
136
+ for _ in range(worker_node_count):
137
+ worker_instance_id = launch_node('worker')
138
+ created_instance_ids.append(worker_instance_id)
137
139
 
138
140
  while True:
139
141
  instances = _filter_instances(cluster_name_on_cloud, ['active'])
@@ -226,11 +228,14 @@ def get_cluster_info(
226
228
 
227
229
 
228
230
  def query_instances(
231
+ cluster_name: str,
229
232
  cluster_name_on_cloud: str,
230
233
  provider_config: Optional[Dict[str, Any]] = None,
231
234
  non_terminated_only: bool = True,
232
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
235
+ retry_if_missing: bool = False,
236
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
233
237
  """See sky/provision/__init__.py"""
238
+ del cluster_name, retry_if_missing # unused
234
239
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
235
240
  instances = _filter_instances(cluster_name_on_cloud, None)
236
241
 
@@ -240,12 +245,13 @@ def query_instances(
240
245
  'unhealthy': status_lib.ClusterStatus.INIT,
241
246
  'terminating': None,
242
247
  }
243
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
248
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
249
+ Optional[str]]] = {}
244
250
  for instance_id, instance in instances.items():
245
251
  status = status_map.get(instance['status'])
246
252
  if non_terminated_only and status is None:
247
253
  continue
248
- statuses[instance_id] = status
254
+ statuses[instance_id] = (status, None)
249
255
  return statuses
250
256
 
251
257
 
@@ -0,0 +1,50 @@
1
+ """Constants used by the Nebius provisioner."""
2
+
3
VERSION = 'v1'

# Instance platforms that support InfiniBand.
INFINIBAND_INSTANCE_PLATFORMS = ['gpu-h100-sxm', 'gpu-h200-sxm']

# Environment variables for NCCL and UCX when InfiniBand is used.
INFINIBAND_ENV_VARS = {
    'NCCL_IB_HCA': 'mlx5',
    # Port 1 of the devices mlx5_0 through mlx5_7, comma separated.
    'UCX_NET_DEVICES': ','.join(f'mlx5_{index}:1' for index in range(8)),
}

# Image ID associated with InfiniBand; the literal is split across lines
# only to stay within the line-length limit.
INFINIBAND_IMAGE_ID = ('docker:cr.eu-north1.nebius.cloud/nebius-benchmarks/'
                       'nccl-tests:2.23.4-ubu22.04-cu12.4')

# Docker run options for InfiniBand support.
INFINIBAND_DOCKER_OPTIONS = [
    '--device=/dev/infiniband',
    '--cap-add=IPC_LOCK',
]

# InfiniBand fabrics keyed by (platform, region), based on Nebius
# documentation. The first fabric of each list is treated as the default.
INFINIBAND_FABRIC_MAPPING = {
    # H100 platforms
    ('gpu-h100-sxm', 'eu-north1'): [
        'fabric-2',
        'fabric-3',
        'fabric-4',
        'fabric-6',
    ],
    # H200 platforms
    ('gpu-h200-sxm', 'eu-north1'): ['fabric-7'],
    ('gpu-h200-sxm', 'eu-west1'): ['fabric-5'],
    ('gpu-h200-sxm', 'us-central1'): ['us-central1-a'],
}
37
+
38
+
39
def get_default_fabric(platform: str, region: str) -> str:
    """Returns the default (first) fabric for a platform and region.

    When the (platform, region) pair has no fabrics in
    INFINIBAND_FABRIC_MAPPING, the fabrics of ('gpu-h100-sxm', 'eu-north1')
    are used instead.

    Args:
        platform: Instance platform name.
        region: Region name.

    Returns:
        The first fabric of the selected list.

    Raises:
        ValueError: If neither the requested pair nor the fallback pair has
            any fabric.
    """
    candidates = INFINIBAND_FABRIC_MAPPING.get((platform, region), [])
    if candidates:
        return candidates[0]

    # Select north europe region as default
    fallback = INFINIBAND_FABRIC_MAPPING.get(('gpu-h100-sxm', 'eu-north1'), [])
    if fallback:
        return fallback[0]

    raise ValueError(f'No InfiniBand fabric available for platform {platform} '
                     f'in region {region}')
@@ -1,6 +1,6 @@
1
1
  """Nebius instance provisioning."""
2
2
  import time
3
- from typing import Any, Dict, List, Optional
3
+ from typing import Any, Dict, List, Optional, Tuple
4
4
 
5
5
  from sky import sky_logging
6
6
  from sky.provision import common
@@ -65,9 +65,10 @@ def _wait_until_no_pending(region: str, cluster_name_on_cloud: str) -> None:
65
65
  f' to be ready.')
66
66
 
67
67
 
68
- def run_instances(region: str, cluster_name_on_cloud: str,
68
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
69
69
  config: common.ProvisionConfig) -> common.ProvisionRecord:
70
70
  """Runs instances for the given cluster."""
71
+ del cluster_name # unused
71
72
  _wait_until_no_pending(region, cluster_name_on_cloud)
72
73
  running_instances = _filter_instances(region, cluster_name_on_cloud,
73
74
  ['RUNNING'])
@@ -124,6 +125,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
124
125
  node_type = 'head' if head_instance_id is None else 'worker'
125
126
  try:
126
127
  platform, preset = config.node_config['InstanceType'].split('_')
128
+
127
129
  instance_id = utils.launch(
128
130
  cluster_name_on_cloud=cluster_name_on_cloud,
129
131
  node_type=node_type,
@@ -132,7 +134,14 @@ def run_instances(region: str, cluster_name_on_cloud: str,
132
134
  region=region,
133
135
  image_family=config.node_config['ImageId'],
134
136
  disk_size=config.node_config['DiskSize'],
135
- user_data=config.node_config['UserData'])
137
+ user_data=config.node_config['UserData'],
138
+ use_spot=config.node_config['use_spot'],
139
+ associate_public_ip_address=(
140
+ not config.provider_config['use_internal_ips']),
141
+ use_static_ip_address=config.provider_config.get(
142
+ 'use_static_ip_address', False),
143
+ filesystems=config.node_config.get('filesystems', []),
144
+ network_tier=config.node_config.get('network_tier'))
136
145
  except Exception as e: # pylint: disable=broad-except
137
146
  logger.warning(f'run_instances error: {e}')
138
147
  raise
@@ -241,11 +250,14 @@ def get_cluster_info(
241
250
 
242
251
 
243
252
  def query_instances(
253
+ cluster_name: str,
244
254
  cluster_name_on_cloud: str,
245
255
  provider_config: Optional[Dict[str, Any]] = None,
246
256
  non_terminated_only: bool = True,
247
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
257
+ retry_if_missing: bool = False,
258
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
248
259
  """See sky/provision/__init__.py"""
260
+ del cluster_name, retry_if_missing # unused
249
261
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
250
262
  instances = _filter_instances(provider_config['region'],
251
263
  cluster_name_on_cloud, None)
@@ -257,12 +269,13 @@ def query_instances(
257
269
  'STOPPING': status_lib.ClusterStatus.STOPPED,
258
270
  'DELETING': status_lib.ClusterStatus.STOPPED,
259
271
  }
260
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
272
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
273
+ Optional[str]]] = {}
261
274
  for inst_id, inst in instances.items():
262
275
  status = status_map[inst['status']]
263
276
  if non_terminated_only and status is None:
264
277
  continue
265
- statuses[inst_id] = status
278
+ statuses[inst_id] = (status, None)
266
279
  return statuses
267
280
 
268
281