skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/clouds/nebius.py CHANGED
@@ -1,24 +1,22 @@
1
1
  """ Nebius Cloud. """
2
+ import json
2
3
  import os
3
4
  import typing
4
- from typing import Dict, Iterator, List, Optional, Tuple, Union
5
+ from typing import Any, Dict, Iterator, List, Optional, Tuple, Union
5
6
 
7
+ from sky import catalog
6
8
  from sky import clouds
9
+ from sky import exceptions
10
+ from sky import skypilot_config
7
11
  from sky.adaptors import nebius
8
- from sky.clouds import service_catalog
12
+ from sky.provision.nebius import constants as nebius_constants
9
13
  from sky.utils import annotations
10
14
  from sky.utils import registry
11
15
  from sky.utils import resources_utils
12
16
 
13
17
  if typing.TYPE_CHECKING:
14
18
  from sky import resources as resources_lib
15
-
16
- _CREDENTIAL_FILES = [
17
- # credential files for Nebius
18
- nebius.NEBIUS_TENANT_ID_FILENAME,
19
- nebius.NEBIUS_IAM_TOKEN_FILENAME,
20
- nebius.NEBIUS_CREDENTIALS_FILENAME
21
- ]
19
+ from sky.utils import volume as volume_lib
22
20
 
23
21
  _INDENT_PREFIX = ' '
24
22
 
@@ -54,14 +52,18 @@ class Nebius(clouds.Cloud):
54
52
  _CLOUD_UNSUPPORTED_FEATURES = {
55
53
  clouds.CloudImplementationFeatures.AUTODOWN:
56
54
  ('Autodown not supported. Can\'t delete OS disk.'),
57
- clouds.CloudImplementationFeatures.SPOT_INSTANCE:
58
- ('Spot is not supported, as Nebius API does not implement spot.'),
59
55
  clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
60
56
  (f'Migrating disk is currently not supported on {_REPR}.'),
61
57
  clouds.CloudImplementationFeatures.CUSTOM_DISK_TIER:
62
58
  (f'Custom disk tier is currently not supported on {_REPR}.'),
59
+ clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
60
+ ('Custom network tier is currently only supported for '
61
+ 'H100:8 and H200:8 on Nebius.'),
63
62
  clouds.CloudImplementationFeatures.HIGH_AVAILABILITY_CONTROLLERS:
64
63
  ('High availability controllers are not supported on Nebius.'),
64
+ clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK:
65
+ ('Customized multiple network interfaces are not supported on '
66
+ f'{_REPR}.'),
65
67
  }
66
68
  # Nebius maximum instance name length defined as <= 63 as a hostname length
67
69
  # 63 - 8 - 5 = 50 characters since
@@ -76,25 +78,43 @@ class Nebius(clouds.Cloud):
76
78
 
77
79
  @classmethod
78
80
  def _unsupported_features_for_resources(
79
- cls, resources: 'resources_lib.Resources'
81
+ cls,
82
+ resources: 'resources_lib.Resources',
83
+ region: Optional[str] = None,
80
84
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
81
- del resources # unused
82
- return cls._CLOUD_UNSUPPORTED_FEATURES
85
+ unsupported = cls._CLOUD_UNSUPPORTED_FEATURES.copy()
86
+
87
+ # Check if the accelerators support InfiniBand (H100 or H200) and 8 GPUs
88
+ if resources.accelerators is not None:
89
+ for acc_name, acc_count in resources.accelerators.items():
90
+ if acc_name.lower() in ('h100', 'h200') and acc_count == 8:
91
+ # Remove CUSTOM_NETWORK_TIER from unsupported features for
92
+ # InfiniBand-capable accelerators. Refer to:
93
+ # https://docs.nebius.com/compute/clusters/gpu#fabrics
94
+ unsupported.pop(
95
+ clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER,
96
+ None)
97
+ break
98
+
99
+ return unsupported
83
100
 
84
101
  @classmethod
85
102
  def _max_cluster_name_length(cls) -> Optional[int]:
86
103
  return cls._MAX_CLUSTER_NAME_LEN_LIMIT
87
104
 
88
105
  @classmethod
89
- def regions_with_offering(cls, instance_type: str,
90
- accelerators: Optional[Dict[str, int]],
91
- use_spot: bool, region: Optional[str],
92
- zone: Optional[str]) -> List[clouds.Region]:
106
+ def regions_with_offering(
107
+ cls,
108
+ instance_type: str,
109
+ accelerators: Optional[Dict[str, int]],
110
+ use_spot: bool,
111
+ region: Optional[str],
112
+ zone: Optional[str],
113
+ resources: Optional['resources_lib.Resources'] = None,
114
+ ) -> List[clouds.Region]:
93
115
  assert zone is None, 'Nebius does not support zones.'
94
116
  del accelerators, zone # unused
95
- if use_spot:
96
- return []
97
- regions = service_catalog.get_region_zones_for_instance_type(
117
+ regions = catalog.get_region_zones_for_instance_type(
98
118
  instance_type, use_spot, 'nebius')
99
119
 
100
120
  if region is not None:
@@ -106,8 +126,8 @@ class Nebius(clouds.Cloud):
106
126
  cls,
107
127
  instance_type: str,
108
128
  ) -> Tuple[Optional[float], Optional[float]]:
109
- return service_catalog.get_vcpus_mem_from_instance_type(instance_type,
110
- clouds='nebius')
129
+ return catalog.get_vcpus_mem_from_instance_type(instance_type,
130
+ clouds='nebius')
111
131
 
112
132
  @classmethod
113
133
  def zones_provision_loop(
@@ -134,11 +154,11 @@ class Nebius(clouds.Cloud):
134
154
  use_spot: bool,
135
155
  region: Optional[str] = None,
136
156
  zone: Optional[str] = None) -> float:
137
- return service_catalog.get_hourly_cost(instance_type,
138
- use_spot=use_spot,
139
- region=region,
140
- zone=zone,
141
- clouds='nebius')
157
+ return catalog.get_hourly_cost(instance_type,
158
+ use_spot=use_spot,
159
+ region=region,
160
+ zone=zone,
161
+ clouds='nebius')
142
162
 
143
163
  def accelerators_to_hourly_cost(self,
144
164
  accelerators: Dict[str, int],
@@ -160,69 +180,124 @@ class Nebius(clouds.Cloud):
160
180
  return isinstance(other, Nebius)
161
181
 
162
182
  @classmethod
163
- def get_default_instance_type(
164
- cls,
165
- cpus: Optional[str] = None,
166
- memory: Optional[str] = None,
167
- disk_tier: Optional[resources_utils.DiskTier] = None
168
- ) -> Optional[str]:
183
+ def get_default_instance_type(cls,
184
+ cpus: Optional[str] = None,
185
+ memory: Optional[str] = None,
186
+ disk_tier: Optional[
187
+ resources_utils.DiskTier] = None,
188
+ region: Optional[str] = None,
189
+ zone: Optional[str] = None) -> Optional[str]:
169
190
  """Returns the default instance type for Nebius."""
170
- return service_catalog.get_default_instance_type(cpus=cpus,
171
- memory=memory,
172
- disk_tier=disk_tier,
173
- clouds='nebius')
191
+ return catalog.get_default_instance_type(cpus=cpus,
192
+ memory=memory,
193
+ disk_tier=disk_tier,
194
+ region=region,
195
+ zone=zone,
196
+ clouds='nebius')
174
197
 
175
198
  @classmethod
176
199
  def get_accelerators_from_instance_type(
177
200
  cls,
178
201
  instance_type: str,
179
202
  ) -> Optional[Dict[str, Union[int, float]]]:
180
- return service_catalog.get_accelerators_from_instance_type(
181
- instance_type, clouds='nebius')
203
+ return catalog.get_accelerators_from_instance_type(instance_type,
204
+ clouds='nebius')
182
205
 
183
206
  @classmethod
184
207
  def get_zone_shell_cmd(cls) -> Optional[str]:
185
208
  return None
186
209
 
187
210
  def make_deploy_resources_variables(
188
- self,
189
- resources: 'resources_lib.Resources',
190
- cluster_name: resources_utils.ClusterName,
191
- region: 'clouds.Region',
192
- zones: Optional[List['clouds.Zone']],
193
- num_nodes: int,
194
- dryrun: bool = False) -> Dict[str, Optional[str]]:
211
+ self,
212
+ resources: 'resources_lib.Resources',
213
+ cluster_name: resources_utils.ClusterName,
214
+ region: 'clouds.Region',
215
+ zones: Optional[List['clouds.Zone']],
216
+ num_nodes: int,
217
+ dryrun: bool = False,
218
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
219
+ ) -> Dict[str, Any]:
195
220
  del dryrun, cluster_name
196
221
  assert zones is None, ('Nebius does not support zones', zones)
197
222
 
198
- r = resources
199
- acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
223
+ resources = resources.assert_launchable()
224
+ acc_dict = self.get_accelerators_from_instance_type(
225
+ resources.instance_type)
200
226
  custom_resources = resources_utils.make_ray_custom_resources_str(
201
227
  acc_dict)
202
228
  platform, _ = resources.instance_type.split('_')
203
229
 
204
- if platform in ('cpu-d3', 'cpu-e2'):
205
- image_family = 'ubuntu22.04-driverless'
206
- elif platform in ('gpu-h100-sxm', 'gpu-h200-sxm', 'gpu-l40s-a'):
207
- image_family = 'ubuntu22.04-cuda12'
230
+ # Selecting image_family by platform
231
+ # https://docs.nebius.com/compute/storage/boot-disk-images
232
+ if platform.startswith('cpu'):
233
+ image_family = 'ubuntu24.04-driverless'
234
+ elif platform.startswith('gpu'):
235
+ image_family = 'ubuntu24.04-cuda12'
208
236
  else:
209
237
  raise RuntimeError('Unsupported instance type for Nebius cloud:'
210
238
  f' {resources.instance_type}')
211
239
 
212
- resources_vars = {
240
+ config_fs = skypilot_config.get_effective_region_config(
241
+ cloud='nebius',
242
+ region=region.name,
243
+ keys=('filesystems',),
244
+ default_value=[])
245
+ resources_vars_fs = []
246
+ for i, fs in enumerate(config_fs):
247
+ resources_vars_fs.append({
248
+ 'filesystem_id': fs['filesystem_id'],
249
+ 'filesystem_attach_mode': fs.get('attach_mode', 'READ_WRITE'),
250
+ 'filesystem_mount_path': fs.get(
251
+ 'mount_path', f'/mnt/filesystem-skypilot-{i+1}'),
252
+ 'filesystem_mount_tag': f'filesystem-skypilot-{i+1}'
253
+ })
254
+
255
+ use_static_ip_address = skypilot_config.get_nested(
256
+ ('nebius', 'use_static_ip_address'), default_value=False)
257
+ resources_vars: Dict[str, Any] = {
213
258
  'instance_type': resources.instance_type,
214
259
  'custom_resources': custom_resources,
260
+ 'use_static_ip_address': use_static_ip_address,
215
261
  'region': region.name,
216
262
  'image_id': image_family,
217
263
  # Nebius does not support specific zones.
218
264
  'zones': None,
265
+ 'use_spot': resources.use_spot,
266
+ 'filesystems': resources_vars_fs,
267
+ 'network_tier': resources.network_tier
219
268
  }
220
269
 
270
+ docker_run_options = []
271
+
221
272
  if acc_dict is not None:
222
273
  # Nebius cloud's docker runtime information does not contain
223
274
  # 'nvidia-container-runtime', causing no GPU option to be added to
224
275
  # the docker run command. We patch this by adding it here.
225
- resources_vars['docker_run_options'] = ['--gpus all']
276
+ docker_run_options.append('--gpus all')
277
+
278
+ # Check for InfiniBand support with network_tier: best
279
+ is_infiniband_capable = (
280
+ platform in nebius_constants.INFINIBAND_INSTANCE_PLATFORMS)
281
+ if (is_infiniband_capable and
282
+ resources.network_tier == resources_utils.NetworkTier.BEST):
283
+ # For Docker containers, add InfiniBand device access and
284
+ # IPC_LOCK capability
285
+ if resources.extract_docker_image() is not None:
286
+ docker_run_options.extend(
287
+ nebius_constants.INFINIBAND_DOCKER_OPTIONS)
288
+
289
+ # Add InfiniBand environment variables to docker run options
290
+ for env_var, env_value in (
291
+ nebius_constants.INFINIBAND_ENV_VARS.items()):
292
+ docker_run_options.extend(
293
+ ['-e', f'{env_var}={env_value}'])
294
+
295
+ # For all InfiniBand-capable instances, add env variables
296
+ resources_vars[
297
+ 'env_vars'] = nebius_constants.INFINIBAND_ENV_VARS
298
+
299
+ if docker_run_options:
300
+ resources_vars['docker_run_options'] = docker_run_options
226
301
 
227
302
  return resources_vars
228
303
 
@@ -254,7 +329,9 @@ class Nebius(clouds.Cloud):
254
329
  default_instance_type = Nebius.get_default_instance_type(
255
330
  cpus=resources.cpus,
256
331
  memory=resources.memory,
257
- disk_tier=resources.disk_tier)
332
+ disk_tier=resources.disk_tier,
333
+ region=resources.region,
334
+ zone=resources.zone)
258
335
  if default_instance_type is None:
259
336
  # TODO: Add hints to all return values in this method to help
260
337
  # users understand why the resources are not launchable.
@@ -265,15 +342,16 @@ class Nebius(clouds.Cloud):
265
342
 
266
343
  assert len(accelerators) == 1, resources
267
344
  acc, acc_count = list(accelerators.items())[0]
268
- (instance_list, fuzzy_candidate_list
269
- ) = service_catalog.get_instance_type_for_accelerator(
270
- acc,
271
- acc_count,
272
- use_spot=resources.use_spot,
273
- cpus=resources.cpus,
274
- region=resources.region,
275
- zone=resources.zone,
276
- clouds='nebius')
345
+ (instance_list,
346
+ fuzzy_candidate_list) = catalog.get_instance_type_for_accelerator(
347
+ acc,
348
+ acc_count,
349
+ use_spot=resources.use_spot,
350
+ cpus=resources.cpus,
351
+ memory=resources.memory,
352
+ region=resources.region,
353
+ zone=resources.zone,
354
+ clouds='nebius')
277
355
  if instance_list is None:
278
356
  return resources_utils.FeasibleResources([], fuzzy_candidate_list,
279
357
  None)
@@ -281,25 +359,25 @@ class Nebius(clouds.Cloud):
281
359
  fuzzy_candidate_list, None)
282
360
 
283
361
  @classmethod
284
- @annotations.lru_cache(scope='request')
285
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
362
+ def _check_compute_credentials(
363
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
286
364
  """Checks if the user has access credentials to
287
365
  Nebius's compute service."""
288
366
  token_cred_msg = (
289
367
  f'{_INDENT_PREFIX}Credentials can be set up by running: \n'
290
- f'{_INDENT_PREFIX} $ nebius iam get-access-token > {nebius.NEBIUS_IAM_TOKEN_PATH} \n' # pylint: disable=line-too-long
291
- f'{_INDENT_PREFIX} or generate ~/.nebius/credentials.json \n')
368
+ f'{_INDENT_PREFIX} $ nebius iam get-access-token > {nebius.iam_token_path()} \n' # pylint: disable=line-too-long
369
+ f'{_INDENT_PREFIX} or generate {nebius.credentials_path()} \n')
292
370
 
293
- tenant_msg = (f'{_INDENT_PREFIX} Copy your tenat ID from the web console and save it to file \n' # pylint: disable=line-too-long
294
- f'{_INDENT_PREFIX} $ echo $NEBIUS_TENANT_ID_PATH > {nebius.NEBIUS_TENANT_ID_PATH} \n' # pylint: disable=line-too-long
371
+ tenant_msg = (f'{_INDENT_PREFIX} Copy your tenant ID from the web console and save it to file \n' # pylint: disable=line-too-long
372
+ f'{_INDENT_PREFIX} $ echo $NEBIUS_TENANT_ID_PATH > {nebius.tenant_id_path()} \n' # pylint: disable=line-too-long
295
373
  f'{_INDENT_PREFIX} Or if you have 1 tenant you can run:\n' # pylint: disable=line-too-long
296
- f'{_INDENT_PREFIX} $ nebius --format json iam whoami|jq -r \'.user_profile.tenants[0].tenant_id\' > {nebius.NEBIUS_TENANT_ID_PATH} \n') # pylint: disable=line-too-long
374
+ f'{_INDENT_PREFIX} $ nebius --format json iam whoami|jq -r \'.user_profile.tenants[0].tenant_id\' > {nebius.tenant_id_path()} \n') # pylint: disable=line-too-long
297
375
  if not nebius.is_token_or_cred_file_exist():
298
376
  return False, f'{token_cred_msg}'
299
- sdk = nebius.sdk()
300
377
  tenant_id = nebius.get_tenant_id()
301
378
  if tenant_id is None:
302
379
  return False, f'{tenant_msg}'
380
+ sdk = nebius.sdk()
303
381
  try:
304
382
  service = nebius.iam().ProjectServiceClient(sdk)
305
383
  service.list(
@@ -313,7 +391,8 @@ class Nebius(clouds.Cloud):
313
391
 
314
392
  @classmethod
315
393
  @annotations.lru_cache(scope='request')
316
- def _check_storage_credentials(cls) -> Tuple[bool, Optional[str]]:
394
+ def _check_storage_credentials(
395
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
317
396
  """Checks if the user has access credentials to Nebius Object Storage.
318
397
 
319
398
  Returns:
@@ -340,8 +419,8 @@ class Nebius(clouds.Cloud):
340
419
 
341
420
  def get_credential_file_mounts(self) -> Dict[str, str]:
342
421
  credential_file_mounts = {
343
- f'~/.nebius/{filename}': f'~/.nebius/{filename}'
344
- for filename in _CREDENTIAL_FILES
422
+ filepath: filepath
423
+ for filepath in nebius.get_credential_file_paths()
345
424
  }
346
425
  if nebius_profile_in_aws_cred_and_config():
347
426
  credential_file_mounts['~/.aws/credentials'] = '~/.aws/credentials'
@@ -355,9 +434,56 @@ class Nebius(clouds.Cloud):
355
434
  return None
356
435
 
357
436
  def instance_type_exists(self, instance_type: str) -> bool:
358
- return service_catalog.instance_type_exists(instance_type, 'nebius')
437
+ return catalog.instance_type_exists(instance_type, 'nebius')
359
438
 
360
439
  def validate_region_zone(self, region: Optional[str], zone: Optional[str]):
361
- return service_catalog.validate_region_zone(region,
362
- zone,
363
- clouds='nebius')
440
+ return catalog.validate_region_zone(region, zone, clouds='nebius')
441
+
442
+ @classmethod
443
+ def get_user_identities(cls) -> Optional[List[List[str]]]:
444
+ """Returns the email address + project id of the active user."""
445
+ nebius_workspace_config = json.dumps(
446
+ skypilot_config.get_workspace_cloud('nebius'), sort_keys=True)
447
+ return cls._get_user_identities(nebius_workspace_config)
448
+
449
+ @classmethod
450
+ @annotations.lru_cache(scope='request', maxsize=5)
451
+ def _get_user_identities(
452
+ cls, workspace_config: Optional[str]) -> Optional[List[List[str]]]:
453
+ # We add workspace_config in args to avoid caching the identity for when
454
+ # different workspace configs are used.
455
+ del workspace_config # Unused
456
+ sdk = nebius.sdk()
457
+ profile_client = nebius.iam().ProfileServiceClient(sdk)
458
+ profile = nebius.sync_call(
459
+ profile_client.get(nebius.iam().GetProfileRequest(),
460
+ timeout=nebius.READ_TIMEOUT))
461
+ if profile.user_profile is not None:
462
+ if profile.user_profile.attributes is None:
463
+ raise exceptions.CloudUserIdentityError(
464
+ 'Nebius profile is a UserProfile, but has no attributes: '
465
+ f'{profile.user_profile}')
466
+ if profile.user_profile.attributes.email is None:
467
+ raise exceptions.CloudUserIdentityError(
468
+ 'Nebius profile is a UserProfile, but has no email: '
469
+ f'{profile.user_profile}')
470
+ return [[profile.user_profile.attributes.email]]
471
+ if profile.service_account_profile is not None:
472
+ if profile.service_account_profile.info is None:
473
+ raise exceptions.CloudUserIdentityError(
474
+ 'Nebius profile is a ServiceAccountProfile, but has no '
475
+ f'info: {profile.service_account_profile}')
476
+ if profile.service_account_profile.info.metadata is None:
477
+ raise exceptions.CloudUserIdentityError(
478
+ 'Nebius profile is a ServiceAccountProfile, but has no '
479
+ f'metadata: {profile.service_account_profile}')
480
+ if profile.service_account_profile.info.metadata.name is None:
481
+ raise exceptions.CloudUserIdentityError(
482
+ 'Nebius profile is a ServiceAccountProfile, but has no '
483
+ f'name: {profile.service_account_profile}')
484
+ return [[profile.service_account_profile.info.metadata.name]]
485
+ if profile.anonymous_profile is not None:
486
+ return None
487
+ unknown_profile_type = profile.which_field_in_oneof('profile')
488
+ raise exceptions.CloudUserIdentityError(
489
+ f'Nebius profile is of an unknown type - {unknown_profile_type}')