skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/clouds/nebius.py CHANGED
@@ -1,25 +1,22 @@
1
1
  """ Nebius Cloud. """
2
+ import json
2
3
  import os
3
4
  import typing
4
- from typing import Dict, Iterator, List, Optional, Tuple, Union
5
+ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
5
6
 
7
+ from sky import catalog
6
8
  from sky import clouds
9
+ from sky import exceptions
10
+ from sky import skypilot_config
7
11
  from sky.adaptors import nebius
8
- from sky.clouds import service_catalog
12
+ from sky.provision.nebius import constants as nebius_constants
9
13
  from sky.utils import annotations
10
14
  from sky.utils import registry
11
15
  from sky.utils import resources_utils
12
16
 
13
17
  if typing.TYPE_CHECKING:
14
18
  from sky import resources as resources_lib
15
-
16
- _CREDENTIAL_FILES = [
17
- # credential files for Nebius
18
- nebius.NEBIUS_TENANT_ID_FILENAME,
19
- nebius.NEBIUS_IAM_TOKEN_FILENAME,
20
- nebius.NEBIUS_PROJECT_ID_FILENAME,
21
- nebius.NEBIUS_CREDENTIALS_FILENAME
22
- ]
19
+ from sky.utils import volume as volume_lib
23
20
 
24
21
  _INDENT_PREFIX = ' '
25
22
 
@@ -55,14 +52,18 @@ class Nebius(clouds.Cloud):
55
52
  _CLOUD_UNSUPPORTED_FEATURES = {
56
53
  clouds.CloudImplementationFeatures.AUTODOWN:
57
54
  ('Autodown not supported. Can\'t delete OS disk.'),
58
- clouds.CloudImplementationFeatures.SPOT_INSTANCE:
59
- ('Spot is not supported, as Nebius API does not implement spot.'),
60
55
  clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
61
56
  (f'Migrating disk is currently not supported on {_REPR}.'),
62
57
  clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
63
58
  (f'Custom disk tier is currently not supported on {_REPR}.'),
59
+ clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
60
+ ('Custom network tier is currently only supported for '
61
+ 'H100:8 and H200:8 on Nebius.'),
64
62
  clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
65
63
  ('High availability controllers are not supported on Nebius.'),
64
+ clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK:
65
+ ('Customized multiple network interfaces are not supported on '
66
+ f'{_REPR}.'),
66
67
  }
67
68
  # Nebius maximum instance name length defined as <= 63 as a hostname length
68
69
  # 63 - 8 - 5 = 50 characters since
@@ -77,25 +78,43 @@ class Nebius(clouds.Cloud):
77
78
 
78
79
  @classmethod
79
80
  def _unsupported_features_for_resources(
80
- cls, resources: 'resources_lib.Resources'
81
+ cls,
82
+ resources: 'resources_lib.Resources',
83
+ region: Optional[str] = None,
81
84
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
82
- del resources # unused
83
- return cls._CLOUD_UNSUPPORTED_FEATURES
85
+ unsupported = cls._CLOUD_UNSUPPORTED_FEATURES.copy()
86
+
87
+ # Check if the accelerators support InfiniBand (H100 or H200) and 8 GPUs
88
+ if resources.accelerators is not None:
89
+ for acc_name, acc_count in resources.accelerators.items():
90
+ if acc_name.lower() in ('h100', 'h200') and acc_count == 8:
91
+ # Remove CUSTOM_NETWORK_TIER from unsupported features for
92
+ # InfiniBand-capable accelerators. Refer to:
93
+ # https://docs.nebius.com/compute/clusters/gpu#fabrics
94
+ unsupported.pop(
95
+ clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER,
96
+ None)
97
+ break
98
+
99
+ return unsupported
84
100
 
85
101
  @classmethod
86
102
  def _max_cluster_name_length(cls) -> Optional[int]:
87
103
  return cls._MAX_CLUSTER_NAME_LEN_LIMIT
88
104
 
89
105
  @classmethod
90
- def regions_with_offering(cls, instance_type: str,
91
- accelerators: Optional[Dict[str, int]],
92
- use_spot: bool, region: Optional[str],
93
- zone: Optional[str]) -> List[clouds.Region]:
106
+ def regions_with_offering(
107
+ cls,
108
+ instance_type: str,
109
+ accelerators: Optional[Dict[str, int]],
110
+ use_spot: bool,
111
+ region: Optional[str],
112
+ zone: Optional[str],
113
+ resources: Optional['resources_lib.Resources'] = None,
114
+ ) -> List[clouds.Region]:
94
115
  assert zone is None, 'Nebius does not support zones.'
95
116
  del accelerators, zone # unused
96
- if use_spot:
97
- return []
98
- regions = service_catalog.get_region_zones_for_instance_type(
117
+ regions = catalog.get_region_zones_for_instance_type(
99
118
  instance_type, use_spot, 'nebius')
100
119
 
101
120
  if region is not None:
@@ -107,8 +126,8 @@ class Nebius(clouds.Cloud):
107
126
  cls,
108
127
  instance_type: str,
109
128
  ) -> Tuple[Optional[float], Optional[float]]:
110
- return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
111
- clouds='nebius')
129
+ return catalog.get_vcpus_mem_from_instance_type(instance_type,
130
+ clouds='nebius')
112
131
 
113
132
  @classmethod
114
133
  def zones_provision_loop(
@@ -135,11 +154,11 @@ class Nebius(clouds.Cloud):
135
154
  use_spot: bool,
136
155
  region: Optional[str] = None,
137
156
  zone: Optional[str] = None) -> float:
138
- return service_catalog.get_hourly_cost(instance_type,
139
- use_spot=use_spot,
140
- region=region,
141
- zone=zone,
142
- clouds='nebius')
157
+ return catalog.get_hourly_cost(instance_type,
158
+ use_spot=use_spot,
159
+ region=region,
160
+ zone=zone,
161
+ clouds='nebius')
143
162
 
144
163
  def accelerators_to_hourly_cost(self,
145
164
  accelerators: Dict[str, int],
@@ -161,69 +180,124 @@ class Nebius(clouds.Cloud):
161
180
  return isinstance(other, Nebius)
162
181
 
163
182
  @classmethod
164
- def get_default_instance_type(
165
- cls,
166
- cpus: Optional[str] = None,
167
- memory: Optional[str] = None,
168
- disk_tier: Optional[resources_utils.DiskTier] = None
169
- ) -> Optional[str]:
183
+ def get_default_instance_type(cls,
184
+ cpus: Optional[str] = None,
185
+ memory: Optional[str] = None,
186
+ disk_tier: Optional[
187
+ resources_utils.DiskTier] = None,
188
+ region: Optional[str] = None,
189
+ zone: Optional[str] = None) -> Optional[str]:
170
190
  """Returns the default instance type for Nebius."""
171
- return service_catalog.get_default_instance_type(cpus=cpus,
172
- memory=memory,
173
- disk_tier=disk_tier,
174
- clouds='nebius')
191
+ return catalog.get_default_instance_type(cpus=cpus,
192
+ memory=memory,
193
+ disk_tier=disk_tier,
194
+ region=region,
195
+ zone=zone,
196
+ clouds='nebius')
175
197
 
176
198
  @classmethod
177
199
  def get_accelerators_from_instance_type(
178
200
  cls,
179
201
  instance_type: str,
180
202
  ) -> Optional[Dict[str, Union[int, float]]]:
181
- return service_catalog.get_accelerators_from_instance_type(
182
- instance_type, clouds='nebius')
203
+ return catalog.get_accelerators_from_instance_type(instance_type,
204
+ clouds='nebius')
183
205
 
184
206
  @classmethod
185
207
  def get_zone_shell_cmd(cls) -> Optional[str]:
186
208
  return None
187
209
 
188
210
  def make_deploy_resources_variables(
189
- self,
190
- resources: 'resources_lib.Resources',
191
- cluster_name: resources_utils.ClusterName,
192
- region: 'clouds.Region',
193
- zones: Optional[List['clouds.Zone']],
194
- num_nodes: int,
195
- dryrun: bool = False) -> Dict[str, Optional[str]]:
211
+ self,
212
+ resources: 'resources_lib.Resources',
213
+ cluster_name: resources_utils.ClusterName,
214
+ region: 'clouds.Region',
215
+ zones: Optional[List['clouds.Zone']],
216
+ num_nodes: int,
217
+ dryrun: bool = False,
218
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
219
+ ) -> Dict[str, Any]:
196
220
  del dryrun, cluster_name
197
221
  assert zones is None, ('Nebius does not support zones', zones)
198
222
 
199
- r = resources
200
- acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
223
+ resources = resources.assert_launchable()
224
+ acc_dict = self.get_accelerators_from_instance_type(
225
+ resources.instance_type)
201
226
  custom_resources = resources_utils.make_ray_custom_resources_str(
202
227
  acc_dict)
203
228
  platform, _ = resources.instance_type.split('_')
204
229
 
205
- if platform in ('cpu-d3', 'cpu-e2'):
206
- image_family = 'ubuntu22.04-driverless'
207
- elif platform in ('gpu-h100-sxm', 'gpu-h200-sxm', 'gpu-l40s-a'):
208
- image_family = 'ubuntu22.04-cuda12'
230
+ # Selecting image_family by platform
231
+ # https://docs.nebius.com/compute/storage/boot-disk-images
232
+ if platform.startswith('cpu'):
233
+ image_family = 'ubuntu24.04-driverless'
234
+ elif platform.startswith('gpu'):
235
+ image_family = 'ubuntu24.04-cuda12'
209
236
  else:
210
237
  raise RuntimeError('Unsupported instance type for Nebius cloud:'
211
238
  f' {resources.instance_type}')
212
239
 
213
- resources_vars = {
240
+ config_fs = skypilot_config.get_effective_region_config(
241
+ cloud='nebius',
242
+ region=region.name,
243
+ keys=('filesystems',),
244
+ default_value=[])
245
+ resources_vars_fs = []
246
+ for i, fs in enumerate(config_fs):
247
+ resources_vars_fs.append({
248
+ 'filesystem_id': fs['filesystem_id'],
249
+ 'filesystem_attach_mode': fs.get('attach_mode', 'READ_WRITE'),
250
+ 'filesystem_mount_path': fs.get(
251
+ 'mount_path', f'/mnt/filesystem-skypilot-{i+1}'),
252
+ 'filesystem_mount_tag': f'filesystem-skypilot-{i+1}'
253
+ })
254
+
255
+ use_static_ip_address = skypilot_config.get_nested(
256
+ ('nebius', 'use_static_ip_address'), default_value=False)
257
+ resources_vars: Dict[str, Any] = {
214
258
  'instance_type': resources.instance_type,
215
259
  'custom_resources': custom_resources,
260
+ 'use_static_ip_address': use_static_ip_address,
216
261
  'region': region.name,
217
262
  'image_id': image_family,
218
263
  # Nebius does not support specific zones.
219
264
  'zones': None,
265
+ 'use_spot': resources.use_spot,
266
+ 'filesystems': resources_vars_fs,
267
+ 'network_tier': resources.network_tier
220
268
  }
221
269
 
270
+ docker_run_options = []
271
+
222
272
  if acc_dict is not None:
223
273
  # Nebius cloud's docker runtime information does not contain
224
274
  # 'nvidia-container-runtime', causing no GPU option to be added to
225
275
  # the docker run command. We patch this by adding it here.
226
- resources_vars['docker_run_options'] = ['--gpus all']
276
+ docker_run_options.append('--gpus all')
277
+
278
+ # Check for InfiniBand support with network_tier: best
279
+ is_infiniband_capable = (
280
+ platform in nebius_constants.INFINIBAND_INSTANCE_PLATFORMS)
281
+ if (is_infiniband_capable and
282
+ resources.network_tier == resources_utils.NetworkTier.BEST):
283
+ # For Docker containers, add InfiniBand device access and
284
+ # IPC_LOCK capability
285
+ if resources.extract_docker_image() is not None:
286
+ docker_run_options.extend(
287
+ nebius_constants.INFINIBAND_DOCKER_OPTIONS)
288
+
289
+ # Add InfiniBand environment variables to docker run options
290
+ for env_var, env_value in (
291
+ nebius_constants.INFINIBAND_ENV_VARS.items()):
292
+ docker_run_options.extend(
293
+ ['-e', f'{env_var}={env_value}'])
294
+
295
+ # For all InfiniBand-capable instances, add env variables
296
+ resources_vars[
297
+ 'env_vars'] = nebius_constants.INFINIBAND_ENV_VARS
298
+
299
+ if docker_run_options:
300
+ resources_vars['docker_run_options'] = docker_run_options
227
301
 
228
302
  return resources_vars
229
303
 
@@ -255,7 +329,9 @@ class Nebius(clouds.Cloud):
255
329
  default_instance_type = Nebius.get_default_instance_type(
256
330
  cpus=resources.cpus,
257
331
  memory=resources.memory,
258
- disk_tier=resources.disk_tier)
332
+ disk_tier=resources.disk_tier,
333
+ region=resources.region,
334
+ zone=resources.zone)
259
335
  if default_instance_type is None:
260
336
  # TODO: Add hints to all return values in this method to help
261
337
  # users understand why the resources are not launchable.
@@ -266,15 +342,16 @@ class Nebius(clouds.Cloud):
266
342
 
267
343
  assert len(accelerators) == 1, resources
268
344
  acc, acc_count = list(accelerators.items())[0]
269
- (instance_list, fuzzy_candidate_list
270
- ) = service_catalog.get_instance_type_for_accelerator(
271
- acc,
272
- acc_count,
273
- use_spot=resources.use_spot,
274
- cpus=resources.cpus,
275
- region=resources.region,
276
- zone=resources.zone,
277
- clouds='nebius')
345
+ (instance_list,
346
+ fuzzy_candidate_list) = catalog.get_instance_type_for_accelerator(
347
+ acc,
348
+ acc_count,
349
+ use_spot=resources.use_spot,
350
+ cpus=resources.cpus,
351
+ memory=resources.memory,
352
+ region=resources.region,
353
+ zone=resources.zone,
354
+ clouds='nebius')
278
355
  if instance_list is None:
279
356
  return resources_utils.FeasibleResources([], fuzzy_candidate_list,
280
357
  None)
@@ -282,25 +359,25 @@ class Nebius(clouds.Cloud):
282
359
  fuzzy_candidate_list, None)
283
360
 
284
361
  @classmethod
285
- @annotations.lru_cache(scope='request')
286
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
362
+ def _check_compute_credentials(
363
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
287
364
  """Checks if the user has access credentials to
288
365
  Nebius's compute service."""
289
366
  token_cred_msg = (
290
367
  f'{_INDENT_PREFIX}Credentials can be set up by running: \n'
291
- f'{_INDENT_PREFIX} $ nebius iam get-access-token > {nebius.NEBIUS_IAM_TOKEN_PATH} \n' # pylint: disable=line-too-long
292
- f'{_INDENT_PREFIX} or generate ~/.nebius/credentials.json \n')
368
+ f'{_INDENT_PREFIX} $ nebius iam get-access-token > {nebius.iam_token_path()} \n' # pylint: disable=line-too-long
369
+ f'{_INDENT_PREFIX} or generate {nebius.credentials_path()} \n')
293
370
 
294
- tenant_msg = (f'{_INDENT_PREFIX} Copy your tenat ID from the web console and save it to file \n' # pylint: disable=line-too-long
295
- f'{_INDENT_PREFIX} $ echo $NEBIUS_TENANT_ID_PATH > {nebius.NEBIUS_TENANT_ID_PATH} \n' # pylint: disable=line-too-long
371
+ tenant_msg = (f'{_INDENT_PREFIX} Copy your tenant ID from the web console and save it to file \n' # pylint: disable=line-too-long
372
+ f'{_INDENT_PREFIX} $ echo $NEBIUS_TENANT_ID_PATH > {nebius.tenant_id_path()} \n' # pylint: disable=line-too-long
296
373
  f'{_INDENT_PREFIX} Or if you have 1 tenant you can run:\n' # pylint: disable=line-too-long
297
- f'{_INDENT_PREFIX} $ nebius --format json iam whoami|jq -r \'.user_profile.tenants[0].tenant_id\' > {nebius.NEBIUS_TENANT_ID_PATH} \n') # pylint: disable=line-too-long
374
+ f'{_INDENT_PREFIX} $ nebius --format json iam whoami|jq -r \'.user_profile.tenants[0].tenant_id\' > {nebius.tenant_id_path()} \n') # pylint: disable=line-too-long
298
375
  if not nebius.is_token_or_cred_file_exist():
299
376
  return False, f'{token_cred_msg}'
300
- sdk = nebius.sdk()
301
377
  tenant_id = nebius.get_tenant_id()
302
378
  if tenant_id is None:
303
379
  return False, f'{tenant_msg}'
380
+ sdk = nebius.sdk()
304
381
  try:
305
382
  service = nebius.iam().ProjectServiceClient(sdk)
306
383
  service.list(
@@ -314,7 +391,8 @@ class Nebius(clouds.Cloud):
314
391
 
315
392
  @classmethod
316
393
  @annotations.lru_cache(scope='request')
317
- def _check_storage_credentials(cls) -> Tuple[bool, Optional[str]]:
394
+ def _check_storage_credentials(
395
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
318
396
  """Checks if the user has access credentials to Nebius Object Storage.
319
397
 
320
398
  Returns:
@@ -341,8 +419,8 @@ class Nebius(clouds.Cloud):
341
419
 
342
420
  def get_credential_file_mounts(self) -> Dict[str, str]:
343
421
  credential_file_mounts = {
344
- f'~/.nebius/{filename}': f'~/.nebius/{filename}'
345
- for filename in _CREDENTIAL_FILES
422
+ filepath: filepath
423
+ for filepath in nebius.get_credential_file_paths()
346
424
  }
347
425
  if nebius_profile_in_aws_cred_and_config():
348
426
  credential_file_mounts['~/.aws/credentials'] = '~/.aws/credentials'
@@ -356,9 +434,60 @@ class Nebius(clouds.Cloud):
356
434
  return None
357
435
 
358
436
  def instance_type_exists(self, instance_type: str) -> bool:
359
- return service_catalog.instance_type_exists(instance_type, 'nebius')
437
+ return catalog.instance_type_exists(instance_type, 'nebius')
360
438
 
361
439
  def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
362
- return service_catalog.validate_region_zone(region,
363
- zone,
364
- clouds='nebius')
440
+ return catalog.validate_region_zone(region, zone, clouds='nebius')
441
+
442
+ @classmethod
443
+ def get_user_identities(cls) -> Optional[List[List[str]]]:
444
+ """Returns the email address + project id of the active user."""
445
+ nebius_workspace_config = json.dumps(
446
+ skypilot_config.get_workspace_cloud('nebius'), sort_keys=True)
447
+ return cls._get_user_identities(nebius_workspace_config)
448
+
449
+ @classmethod
450
+ @annotations.lru_cache(scope='request', maxsize=5)
451
+ def _get_user_identities(
452
+ cls, workspace_config: Optional[str]) -> Optional[List[List[str]]]:
453
+ # We add workspace_config in args to avoid caching the identity for when
454
+ # different workspace configs are used.
455
+ del workspace_config # Unused
456
+ sdk = nebius.sdk()
457
+ profile_client = nebius.iam().ProfileServiceClient(sdk)
458
+ try:
459
+ profile = nebius.sync_call(
460
+ profile_client.get(nebius.iam().GetProfileRequest(),
461
+ timeout=nebius.READ_TIMEOUT))
462
+ except Exception as e:
463
+ raise exceptions.CloudUserIdentityError(
464
+ f'Error getting Nebius profile: {e}')
465
+ if profile.user_profile is not None:
466
+ if profile.user_profile.attributes is None:
467
+ raise exceptions.CloudUserIdentityError(
468
+ 'Nebius profile is a UserProfile, but has no attributes: '
469
+ f'{profile.user_profile}')
470
+ if profile.user_profile.attributes.email is None:
471
+ raise exceptions.CloudUserIdentityError(
472
+ 'Nebius profile is a UserProfile, but has no email: '
473
+ f'{profile.user_profile}')
474
+ return [[profile.user_profile.attributes.email]]
475
+ if profile.service_account_profile is not None:
476
+ if profile.service_account_profile.info is None:
477
+ raise exceptions.CloudUserIdentityError(
478
+ 'Nebius profile is a ServiceAccountProfile, but has no '
479
+ f'info: {profile.service_account_profile}')
480
+ if profile.service_account_profile.info.metadata is None:
481
+ raise exceptions.CloudUserIdentityError(
482
+ 'Nebius profile is a ServiceAccountProfile, but has no '
483
+ f'metadata: {profile.service_account_profile}')
484
+ if profile.service_account_profile.info.metadata.name is None:
485
+ raise exceptions.CloudUserIdentityError(
486
+ 'Nebius profile is a ServiceAccountProfile, but has no '
487
+ f'name: {profile.service_account_profile}')
488
+ return [[profile.service_account_profile.info.metadata.name]]
489
+ if profile.anonymous_profile is not None:
490
+ return None
491
+ unknown_profile_type = profile.which_field_in_oneof('profile')
492
+ raise exceptions.CloudUserIdentityError(
493
+ f'Nebius profile is of an unknown type - {unknown_profile_type}')