skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -1,16 +1,21 @@
1
1
  """Nebius library wrapper for SkyPilot."""
2
2
  import time
3
- from typing import Any, Dict
3
+ from typing import Any, Dict, List, Optional
4
4
  import uuid
5
5
 
6
6
  from sky import sky_logging
7
+ from sky import skypilot_config
7
8
  from sky.adaptors import nebius
9
+ from sky.provision.nebius import constants as nebius_constants
8
10
  from sky.utils import common_utils
11
+ from sky.utils import resources_utils
9
12
 
10
13
  logger = sky_logging.init_logger(__name__)
11
14
 
12
15
  POLL_INTERVAL = 5
13
16
 
17
+ _MAX_OPERATIONS_TO_FETCH = 1000
18
+
14
19
 
15
20
  def retry(func):
16
21
  """Decorator to retry a function."""
@@ -33,68 +38,43 @@ def retry(func):
33
38
 
34
39
  def get_project_by_region(region: str) -> str:
35
40
  service = nebius.iam().ProjectServiceClient(nebius.sdk())
36
- projects = service.list(nebius.iam().ListProjectsRequest(
37
- parent_id=nebius.get_tenant_id())).wait()
38
- # To find a project in a specific region, we rely on the project ID to
39
- # deduce the region, since there is currently no method to retrieve region
40
- # information directly from the project. Additionally, there is only one
41
- # project per region, and projects cannot be created at this time.
42
- # The region is determined from the project ID using a region-specific
43
- # identifier embedded in it.
44
- # Project id looks like project-e00xxxxxxxxxxxxxx where
45
- # e00 - id of region 'eu-north1'
46
- # e01 - id of region 'eu-west1'
47
- region_ids = {'eu-north1': 'e00', 'eu-west1': 'e01'}
48
- # TODO(SalikovAlex): fix when info about region will be in projects list
49
- # Currently, Nebius cloud supports 2 regions. We manually enumerate
50
- # them here. Reference: https://docs.nebius.com/overview/regions
41
+ projects = nebius.sync_call(
42
+ service.list(
43
+ nebius.iam().ListProjectsRequest(parent_id=nebius.get_tenant_id()),
44
+ timeout=nebius.READ_TIMEOUT))
51
45
 
52
46
  # Check is there project if in config
53
- preferable_project_id = nebius.get_project_id()
54
- if preferable_project_id is not None:
55
- if preferable_project_id[8:11] == region_ids[region]:
56
- return preferable_project_id
57
- logger.warning(
58
- f'Can\'t use customized NEBIUS_PROJECT_ID ({preferable_project_id})'
59
- f' for region {region}. Please check if the project ID is correct.')
47
+ project_id = skypilot_config.get_effective_region_config(
48
+ cloud='nebius', region=region, keys=('project_id',), default_value=None)
49
+ if project_id is not None:
50
+ return project_id
60
51
  for project in projects.items:
61
- if project.metadata.id[8:11] == region_ids[region]:
52
+ if project.status.region == region:
62
53
  return project.metadata.id
63
54
  raise Exception(f'No project found for region "{region}".')
64
55
 
65
56
 
66
- def get_or_create_gpu_cluster(name: str, region: str) -> str:
57
+ def get_or_create_gpu_cluster(name: str, project_id: str, fabric: str) -> str:
67
58
  """Creates a GPU cluster.
68
- When creating a GPU cluster, select an InfiniBand fabric for it:
69
-
70
- fabric-2, fabric-3 or fabric-4 for projects in the eu-north1 region.
71
- fabric-5 for projects in the eu-west1 region.
72
-
73
59
  https://docs.nebius.com/compute/clusters/gpu
74
60
  """
75
- project_id = get_project_by_region(region)
76
61
  service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
77
62
  try:
78
- cluster = service.get_by_name(nebius.nebius_common().GetByNameRequest(
79
- parent_id=project_id,
80
- name=name,
81
- )).wait()
82
- cluster_id = cluster.metadata.id
83
- except nebius.request_error() as no_cluster_found_error:
84
- if region == 'eu-north1':
85
- fabric = 'fabric-4'
86
- elif region == 'eu-west1':
87
- fabric = 'fabric-5'
88
- else:
89
- raise RuntimeError(
90
- f'Unsupported region {region}.') from no_cluster_found_error
91
- cluster = service.create(nebius.compute().CreateGpuClusterRequest(
92
- metadata=nebius.nebius_common().ResourceMetadata(
63
+ cluster = nebius.sync_call(
64
+ service.get_by_name(nebius.nebius_common().GetByNameRequest(
93
65
  parent_id=project_id,
94
66
  name=name,
95
- ),
96
- spec=nebius.compute().GpuClusterSpec(
97
- infiniband_fabric=fabric))).wait()
67
+ )))
68
+ cluster_id = cluster.metadata.id
69
+ except nebius.request_error():
70
+ cluster = nebius.sync_call(
71
+ service.create(nebius.compute().CreateGpuClusterRequest(
72
+ metadata=nebius.nebius_common().ResourceMetadata(
73
+ parent_id=project_id,
74
+ name=name,
75
+ ),
76
+ spec=nebius.compute().GpuClusterSpec(
77
+ infiniband_fabric=fabric))))
98
78
  cluster_id = cluster.resource_id
99
79
  return cluster_id
100
80
 
@@ -104,14 +84,16 @@ def delete_cluster(name: str, region: str) -> None:
104
84
  project_id = get_project_by_region(region)
105
85
  service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
106
86
  try:
107
- cluster = service.get_by_name(nebius.nebius_common().GetByNameRequest(
108
- parent_id=project_id,
109
- name=name,
110
- )).wait()
87
+ cluster = nebius.sync_call(
88
+ service.get_by_name(nebius.nebius_common().GetByNameRequest(
89
+ parent_id=project_id,
90
+ name=name,
91
+ )))
111
92
  cluster_id = cluster.metadata.id
112
93
  logger.debug(f'Found GPU Cluster : {cluster_id}.')
113
- service.delete(
114
- nebius.compute().DeleteGpuClusterRequest(id=cluster_id)).wait()
94
+ nebius.sync_call(
95
+ service.delete(
96
+ nebius.compute().DeleteGpuClusterRequest(id=cluster_id)))
115
97
  logger.debug(f'Deleted GPU Cluster : {cluster_id}.')
116
98
  except nebius.request_error():
117
99
  logger.debug('GPU Cluster does not exist.')
@@ -120,13 +102,23 @@ def delete_cluster(name: str, region: str) -> None:
120
102
  def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
121
103
  """Lists instances associated with API key."""
122
104
  service = nebius.compute().InstanceServiceClient(nebius.sdk())
123
- result = service.list(
124
- nebius.compute().ListInstancesRequest(parent_id=project_id)).wait()
125
-
126
- instances = result
105
+ page_token = ''
106
+ instances = []
107
+ while True:
108
+ result = nebius.sync_call(
109
+ service.list(nebius.compute().ListInstancesRequest(
110
+ parent_id=project_id,
111
+ page_size=100,
112
+ page_token=page_token,
113
+ ),
114
+ timeout=nebius.READ_TIMEOUT))
115
+ instances.extend(result.items)
116
+ if not result.next_page_token: # "" means no more pages
117
+ break
118
+ page_token = result.next_page_token
127
119
 
128
120
  instance_dict: Dict[str, Dict[str, Any]] = {}
129
- for instance in instances.items:
121
+ for instance in instances:
130
122
  info = {}
131
123
  info['status'] = instance.status.state.name
132
124
  info['name'] = instance.metadata.name
@@ -142,12 +134,13 @@ def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
142
134
 
143
135
  def stop(instance_id: str) -> None:
144
136
  service = nebius.compute().InstanceServiceClient(nebius.sdk())
145
- service.stop(nebius.compute().StopInstanceRequest(id=instance_id)).wait()
137
+ nebius.sync_call(
138
+ service.stop(nebius.compute().StopInstanceRequest(id=instance_id)))
146
139
  retry_count = 0
147
140
  while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_STOP:
148
141
  service = nebius.compute().InstanceServiceClient(nebius.sdk())
149
- instance = service.get(nebius.compute().GetInstanceRequest(
150
- id=instance_id,)).wait()
142
+ instance = nebius.sync_call(
143
+ service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
151
144
  if instance.status.state.name == 'STOPPED':
152
145
  break
153
146
  time.sleep(POLL_INTERVAL)
@@ -164,12 +157,13 @@ def stop(instance_id: str) -> None:
164
157
 
165
158
  def start(instance_id: str) -> None:
166
159
  service = nebius.compute().InstanceServiceClient(nebius.sdk())
167
- service.start(nebius.compute().StartInstanceRequest(id=instance_id)).wait()
160
+ nebius.sync_call(
161
+ service.start(nebius.compute().StartInstanceRequest(id=instance_id)))
168
162
  retry_count = 0
169
163
  while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_START:
170
164
  service = nebius.compute().InstanceServiceClient(nebius.sdk())
171
- instance = service.get(nebius.compute().GetInstanceRequest(
172
- id=instance_id,)).wait()
165
+ instance = nebius.sync_call(
166
+ service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
173
167
  if instance.status.state.name == 'RUNNING':
174
168
  break
175
169
  time.sleep(POLL_INTERVAL)
@@ -184,9 +178,19 @@ def start(instance_id: str) -> None:
184
178
  f' to be ready.')
185
179
 
186
180
 
187
- def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
188
- preset: str, region: str, image_family: str, disk_size: int,
189
- user_data: str) -> str:
181
+ def launch(cluster_name_on_cloud: str,
182
+ node_type: str,
183
+ platform: str,
184
+ preset: str,
185
+ region: str,
186
+ image_family: str,
187
+ disk_size: int,
188
+ user_data: str,
189
+ associate_public_ip_address: bool,
190
+ filesystems: List[Dict[str, Any]],
191
+ use_static_ip_address: bool = False,
192
+ use_spot: bool = False,
193
+ network_tier: Optional[resources_utils.NetworkTier] = None) -> str:
190
194
  # Each node must have a unique name to avoid conflicts between
191
195
  # multiple worker VMs. To ensure uniqueness,a UUID is appended
192
196
  # to the node name.
@@ -196,34 +200,59 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
196
200
 
197
201
  disk_name = 'disk-' + instance_name
198
202
  cluster_id = None
203
+ project_id = get_project_by_region(region)
199
204
  # 8 GPU virtual machines can be grouped into a GPU cluster.
200
205
  # The GPU clusters are built with InfiniBand secure high-speed networking.
201
206
  # https://docs.nebius.com/compute/clusters/gpu
202
- if platform in ('gpu-h100-sxm', 'gpu-h200-sxm'):
207
+ if platform in nebius_constants.INFINIBAND_INSTANCE_PLATFORMS:
203
208
  if preset == '8gpu-128vcpu-1600gb':
204
- cluster_id = get_or_create_gpu_cluster(cluster_name_on_cloud,
205
- region)
209
+ fabric = skypilot_config.get_effective_region_config(
210
+ cloud='nebius',
211
+ region=region,
212
+ keys=('fabric',),
213
+ default_value=None)
214
+
215
+ # Auto-select fabric if network_tier=best and no fabric configured
216
+ if (fabric is None and
217
+ str(network_tier) == str(resources_utils.NetworkTier.BEST)):
218
+ try:
219
+ fabric = nebius_constants.get_default_fabric(
220
+ platform, region)
221
+ logger.info(f'Auto-selected InfiniBand fabric {fabric} '
222
+ f'for {platform} in {region}')
223
+ except ValueError as e:
224
+ logger.warning(
225
+ f'InfiniBand fabric auto-selection failed: {e}')
226
+
227
+ if fabric is None:
228
+ logger.warning(
229
+ f'Set up fabric for region {region} in ~/.sky/config.yaml '
230
+ 'to use GPU clusters.')
231
+ else:
232
+ cluster_id = get_or_create_gpu_cluster(cluster_name_on_cloud,
233
+ project_id, fabric)
206
234
 
207
- project_id = get_project_by_region(region)
208
235
  service = nebius.compute().DiskServiceClient(nebius.sdk())
209
- disk = service.create(nebius.compute().CreateDiskRequest(
210
- metadata=nebius.nebius_common().ResourceMetadata(
211
- parent_id=project_id,
212
- name=disk_name,
213
- ),
214
- spec=nebius.compute().DiskSpec(
215
- source_image_family=nebius.compute().SourceImageFamily(
216
- image_family=image_family),
217
- size_gibibytes=disk_size,
218
- type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
219
- ))).wait()
236
+ disk = nebius.sync_call(
237
+ service.create(nebius.compute().CreateDiskRequest(
238
+ metadata=nebius.nebius_common().ResourceMetadata(
239
+ parent_id=project_id,
240
+ name=disk_name,
241
+ ),
242
+ spec=nebius.compute().DiskSpec(
243
+ source_image_family=nebius.compute().SourceImageFamily(
244
+ image_family=image_family),
245
+ size_gibibytes=disk_size,
246
+ type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
247
+ ))))
220
248
  disk_id = disk.resource_id
221
249
  retry_count = 0
222
250
  while retry_count < nebius.MAX_RETRIES_TO_DISK_CREATE:
223
- disk = service.get_by_name(nebius.nebius_common().GetByNameRequest(
224
- parent_id=project_id,
225
- name=disk_name,
226
- )).wait()
251
+ disk = nebius.sync_call(
252
+ service.get_by_name(nebius.nebius_common().GetByNameRequest(
253
+ parent_id=project_id,
254
+ name=disk_name,
255
+ )))
227
256
  if disk.status.state.name == 'READY':
228
257
  break
229
258
  logger.debug(f'Waiting for disk {disk_name} to be ready.')
@@ -237,73 +266,144 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
237
266
  f' seconds) while waiting for disk {disk_name}'
238
267
  f' to be ready.')
239
268
 
269
+ filesystems_spec = []
270
+ if filesystems:
271
+ for fs in filesystems:
272
+ filesystems_spec.append(nebius.compute().AttachedFilesystemSpec(
273
+ mount_tag=fs['filesystem_mount_tag'],
274
+ attach_mode=nebius.compute().AttachedFilesystemSpec.AttachMode[
275
+ fs['filesystem_attach_mode']],
276
+ existing_filesystem=nebius.compute().ExistingFilesystem(
277
+ id=fs['filesystem_id'])))
278
+
240
279
  service = nebius.vpc().SubnetServiceClient(nebius.sdk())
241
- sub_net = service.list(nebius.vpc().ListSubnetsRequest(
242
- parent_id=project_id,)).wait()
280
+ sub_net = nebius.sync_call(
281
+ service.list(nebius.vpc().ListSubnetsRequest(parent_id=project_id,)))
243
282
 
244
283
  service = nebius.compute().InstanceServiceClient(nebius.sdk())
245
- service.create(nebius.compute().CreateInstanceRequest(
246
- metadata=nebius.nebius_common().ResourceMetadata(
247
- parent_id=project_id,
248
- name=instance_name,
249
- ),
250
- spec=nebius.compute().InstanceSpec(
251
- gpu_cluster=nebius.compute().InstanceGpuClusterSpec(id=cluster_id,)
252
- if cluster_id is not None else None,
253
- boot_disk=nebius.compute().AttachedDiskSpec(
254
- attach_mode=nebius.compute(
255
- ).AttachedDiskSpec.AttachMode.READ_WRITE,
256
- existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
257
- cloud_init_user_data=user_data,
258
- resources=nebius.compute().ResourcesSpec(platform=platform,
259
- preset=preset),
260
- network_interfaces=[
261
- nebius.compute().NetworkInterfaceSpec(
262
- subnet_id=sub_net.items[0].metadata.id,
263
- ip_address=nebius.compute().IPAddress(),
264
- name='network-interface-0',
265
- public_ip_address=nebius.compute().PublicIPAddress())
266
- ]))).wait()
267
- instance_id = ''
268
- retry_count = 0
269
- while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
270
- service = nebius.compute().InstanceServiceClient(nebius.sdk())
271
- instance = service.get_by_name(nebius.nebius_common().GetByNameRequest(
272
- parent_id=project_id,
273
- name=instance_name,
274
- )).wait()
275
- if instance.status.state.name == 'STARTING':
284
+ logger.debug(f'Creating instance {instance_name} in project {project_id}.')
285
+ try:
286
+ nebius.sync_call(
287
+ service.create(nebius.compute().CreateInstanceRequest(
288
+ metadata=nebius.nebius_common().ResourceMetadata(
289
+ parent_id=project_id,
290
+ name=instance_name,
291
+ ),
292
+ spec=nebius.compute().InstanceSpec(
293
+ gpu_cluster=nebius.compute().InstanceGpuClusterSpec(
294
+ id=cluster_id,) if cluster_id is not None else None,
295
+ boot_disk=nebius.compute().AttachedDiskSpec(
296
+ attach_mode=nebius.compute(
297
+ ).AttachedDiskSpec.AttachMode.READ_WRITE,
298
+ existing_disk=nebius.compute().ExistingDisk(
299
+ id=disk_id)),
300
+ cloud_init_user_data=user_data,
301
+ resources=nebius.compute().ResourcesSpec(platform=platform,
302
+ preset=preset),
303
+ filesystems=filesystems_spec if filesystems_spec else None,
304
+ network_interfaces=[
305
+ nebius.compute().NetworkInterfaceSpec(
306
+ subnet_id=sub_net.items[0].metadata.id,
307
+ ip_address=nebius.compute().IPAddress(),
308
+ name='network-interface-0',
309
+ public_ip_address=nebius.compute().PublicIPAddress(
310
+ static=use_static_ip_address)
311
+ if associate_public_ip_address else None,
312
+ )
313
+ ],
314
+ recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
315
+ if use_spot else None,
316
+ preemptible=nebius.compute().PreemptibleSpec(
317
+ priority=1,
318
+ on_preemption=nebius.compute().PreemptibleSpec.
319
+ PreemptionPolicy.STOP) if use_spot else None,
320
+ ))))
321
+ instance_id = ''
322
+ retry_count = 0
323
+ while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
324
+ service = nebius.compute().InstanceServiceClient(nebius.sdk())
325
+ instance = nebius.sync_call(
326
+ service.get_by_name(nebius.nebius_common().GetByNameRequest(
327
+ parent_id=project_id,
328
+ name=instance_name,
329
+ )))
276
330
  instance_id = instance.metadata.id
277
- break
278
- time.sleep(POLL_INTERVAL)
279
- logger.debug(f'Waiting for instance {instance_name} start running.')
280
- retry_count += 1
331
+ if instance.status.state.name == 'STARTING':
332
+ break
333
+
334
+ # All Instances initially have state=STOPPED and reconciling=True,
335
+ # so we need to wait until reconciling is False.
336
+ if instance.status.state.name == 'STOPPED' and \
337
+ not instance.status.reconciling:
338
+ next_token = ''
339
+ total_operations = 0
340
+ while True:
341
+ operations_response = nebius.sync_call(
342
+ service.list_operations_by_parent(
343
+ nebius.compute().ListOperationsByParentRequest(
344
+ parent_id=project_id,
345
+ page_size=100,
346
+ page_token=next_token,
347
+ )))
348
+ total_operations += len(operations_response.operations)
349
+ for operation in operations_response.operations:
350
+ # Find the most recent operation for the instance.
351
+ if operation.resource_id == instance_id:
352
+ error_msg = operation.description
353
+ if operation.status:
354
+ error_msg += f' {operation.status.message}'
355
+ raise RuntimeError(error_msg)
356
+ # If we've fetched too many operations, or there are no more
357
+ # operations to fetch, just raise a generic error.
358
+ if total_operations > _MAX_OPERATIONS_TO_FETCH or \
359
+ not operations_response.next_page_token:
360
+ raise RuntimeError(
361
+ f'Instance {instance_name} failed to start.')
362
+ next_token = operations_response.next_page_token
363
+ time.sleep(POLL_INTERVAL)
364
+ logger.debug(
365
+ f'Waiting for instance {instance_name} to start running. '
366
+ f'State: {instance.status.state.name}, '
367
+ f'Reconciling: {instance.status.reconciling}')
368
+ retry_count += 1
281
369
 
282
- if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
283
- raise TimeoutError(
284
- f'Exceeded maximum retries '
285
- f'({nebius.MAX_RETRIES_TO_INSTANCE_READY * POLL_INTERVAL}'
286
- f' seconds) while waiting for instance {instance_name}'
287
- f' to be ready.')
370
+ if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
371
+ raise TimeoutError(
372
+ f'Exceeded maximum retries '
373
+ f'({nebius.MAX_RETRIES_TO_INSTANCE_READY * POLL_INTERVAL}'
374
+ f' seconds) while waiting for instance {instance_name}'
375
+ f' to be ready.')
376
+ except nebius.request_error() as e:
377
+ # Handle ResourceExhausted quota limit error. In this case, we need to
378
+ # clean up the disk as VM creation failed and we can't proceed.
379
+ # It cannot be handled by the caller (provisioner)'s teardown logic,
380
+ # as we cannot retrieve the disk id, after the instance creation
381
+ # fails
382
+ logger.warning(f'Failed to launch instance {instance_name}: {e}')
383
+ service = nebius.compute().DiskServiceClient(nebius.sdk())
384
+ nebius.sync_call(
385
+ service.delete(nebius.compute().DeleteDiskRequest(id=disk_id)))
386
+ logger.debug(f'Disk {disk_id} deleted.')
387
+ raise e
288
388
  return instance_id
289
389
 
290
390
 
291
391
  def remove(instance_id: str) -> None:
292
392
  """Terminates the given instance."""
293
393
  service = nebius.compute().InstanceServiceClient(nebius.sdk())
294
- result = service.get(
295
- nebius.compute().GetInstanceRequest(id=instance_id)).wait()
394
+ result = nebius.sync_call(
395
+ service.get(nebius.compute().GetInstanceRequest(id=instance_id)))
296
396
  disk_id = result.spec.boot_disk.existing_disk.id
297
- service.delete(
298
- nebius.compute().DeleteInstanceRequest(id=instance_id)).wait()
397
+ nebius.sync_call(
398
+ service.delete(nebius.compute().DeleteInstanceRequest(id=instance_id)))
299
399
  retry_count = 0
300
400
  # The instance begins deleting and attempts to delete the disk.
301
401
  # Must wait until the disk is unlocked and becomes deletable.
302
402
  while retry_count < nebius.MAX_RETRIES_TO_DISK_DELETE:
303
403
  try:
304
404
  service = nebius.compute().DiskServiceClient(nebius.sdk())
305
- service.delete(
306
- nebius.compute().DeleteDiskRequest(id=disk_id)).wait()
405
+ nebius.sync_call(
406
+ service.delete(nebius.compute().DeleteDiskRequest(id=disk_id)))
307
407
  break
308
408
  except nebius.request_error():
309
409
  logger.debug('Waiting for disk deletion.')
@@ -10,7 +10,7 @@ import copy
10
10
  from datetime import datetime
11
11
  import time
12
12
  import typing
13
- from typing import Any, Dict, List, Optional
13
+ from typing import Any, Dict, List, Optional, Tuple
14
14
 
15
15
  from sky import exceptions
16
16
  from sky import sky_logging
@@ -32,10 +32,12 @@ logger = sky_logging.init_logger(__name__)
32
32
  @query_utils.debug_enabled(logger)
33
33
  @common_utils.retry
34
34
  def query_instances(
35
+ cluster_name: str,
35
36
  cluster_name_on_cloud: str,
36
37
  provider_config: Optional[Dict[str, Any]] = None,
37
38
  non_terminated_only: bool = True,
38
- ) -> Dict[str, Optional['status_lib.ClusterStatus']]:
39
+ retry_if_missing: bool = False,
40
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
39
41
  """Query instances.
40
42
 
41
43
  Returns a dictionary of instance IDs and status.
@@ -43,11 +45,13 @@ def query_instances(
43
45
  A None status means the instance is marked as "terminated"
44
46
  or "terminating".
45
47
  """
48
+ del cluster_name, retry_if_missing # unused
46
49
  assert provider_config is not None, cluster_name_on_cloud
47
50
  region = provider_config['region']
48
51
 
49
52
  status_map = oci_utils.oci_config.STATE_MAPPING_OCI_TO_SKY
50
- statuses: Dict[str, Optional['status_lib.ClusterStatus']] = {}
53
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
54
+ Optional[str]]] = {}
51
55
  filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
52
56
 
53
57
  instances = _get_filtered_nodes(region, filters)
@@ -56,15 +60,16 @@ def query_instances(
56
60
  sky_status = status_map[vm_status]
57
61
  if non_terminated_only and sky_status is None:
58
62
  continue
59
- statuses[node['inst_id']] = sky_status
63
+ statuses[node['inst_id']] = (sky_status, None)
60
64
 
61
65
  return statuses
62
66
 
63
67
 
64
68
  @query_utils.debug_enabled(logger)
65
- def run_instances(region: str, cluster_name_on_cloud: str,
69
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
66
70
  config: common.ProvisionConfig) -> common.ProvisionRecord:
67
71
  """Start instances with bootstrapped configuration."""
72
+ del cluster_name # unused
68
73
  tags = dict(sorted(copy.deepcopy(config.tags).items()))
69
74
 
70
75
  start_time = round(time.time() * 1000)
@@ -1,7 +1,7 @@
1
1
  """Paperspace instance provisioning."""
2
2
 
3
3
  import time
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
6
  from sky import sky_logging
7
7
  from sky.provision import common
@@ -48,10 +48,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
48
48
  return head_instance_id
49
49
 
50
50
 
51
- def run_instances(region: str, cluster_name_on_cloud: str,
51
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
52
52
  config: common.ProvisionConfig) -> common.ProvisionRecord:
53
53
  """Runs instances for the given cluster."""
54
-
54
+ del cluster_name # unused
55
55
  pending_status = [
56
56
  'starting', 'restarting', 'upgrading', 'provisioning', 'stopping'
57
57
  ]
@@ -277,12 +277,14 @@ def get_cluster_info(
277
277
 
278
278
 
279
279
  def query_instances(
280
+ cluster_name: str,
280
281
  cluster_name_on_cloud: str,
281
282
  provider_config: Optional[Dict[str, Any]] = None,
282
283
  non_terminated_only: bool = True,
283
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
284
+ retry_if_missing: bool = False,
285
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
284
286
  """See sky/provision/__init__.py"""
285
- del non_terminated_only
287
+ del cluster_name, non_terminated_only, retry_if_missing #unused
286
288
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
287
289
  instances = _filter_instances(cluster_name_on_cloud, None)
288
290
 
@@ -297,10 +299,11 @@ def query_instances(
297
299
  'ready': status_lib.ClusterStatus.UP,
298
300
  'off': status_lib.ClusterStatus.STOPPED,
299
301
  }
300
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
302
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
303
+ Optional[str]]] = {}
301
304
  for inst_id, inst in instances.items():
302
305
  status = status_map[inst['state']]
303
- statuses[inst_id] = status
306
+ statuses[inst_id] = (status, None)
304
307
  return statuses
305
308
 
306
309
 
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Union
8
8
 
9
9
  from sky import sky_logging
10
10
  from sky.adaptors import common as adaptors_common
11
- import sky.provision.paperspace.constants as constants
11
+ from sky.provision.paperspace import constants
12
12
  from sky.utils import common_utils
13
13
 
14
14
  if typing.TYPE_CHECKING:
@@ -0,0 +1,10 @@
1
+ """Prime Intellect provisioner for SkyPilot."""
2
+
3
+ from sky.provision.primeintellect.config import bootstrap_instances
4
+ from sky.provision.primeintellect.instance import cleanup_ports
5
+ from sky.provision.primeintellect.instance import get_cluster_info
6
+ from sky.provision.primeintellect.instance import query_instances
7
+ from sky.provision.primeintellect.instance import run_instances
8
+ from sky.provision.primeintellect.instance import stop_instances
9
+ from sky.provision.primeintellect.instance import terminate_instances
10
+ from sky.provision.primeintellect.instance import wait_instances
@@ -0,0 +1,11 @@
1
+ """Prime Intellect configuration bootstrapping."""
2
+
3
+ from sky.provision import common
4
+
5
+
6
+ def bootstrap_instances(
7
+ region: str, cluster_name: str,
8
+ config: common.ProvisionConfig) -> common.ProvisionConfig:
9
+ """Bootstraps instances for the given cluster."""
10
+ del region, cluster_name # unused
11
+ return config