skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of skypilot-nightly might be problematic. See the advisory linked from the original page for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -1,17 +1,21 @@
1
1
  """Nebius library wrapper for SkyPilot."""
2
2
  import time
3
- from typing import Any, Dict
3
+ from typing import Any, Dict, List, Optional
4
4
  import uuid
5
5
 
6
6
  from sky import sky_logging
7
7
  from sky import skypilot_config
8
8
  from sky.adaptors import nebius
9
+ from sky.provision.nebius import constants as nebius_constants
9
10
  from sky.utils import common_utils
11
+ from sky.utils import resources_utils
10
12
 
11
13
  logger = sky_logging.init_logger(__name__)
12
14
 
13
15
  POLL_INTERVAL = 5
14
16
 
17
+ _MAX_OPERATIONS_TO_FETCH = 1000
18
+
15
19
 
16
20
  def retry(func):
17
21
  """Decorator to retry a function."""
@@ -34,12 +38,14 @@ def retry(func):
34
38
 
35
39
  def get_project_by_region(region: str) -> str:
36
40
  service = nebius.iam().ProjectServiceClient(nebius.sdk())
37
- projects = service.list(nebius.iam().ListProjectsRequest(
38
- parent_id=nebius.get_tenant_id())).wait()
41
+ projects = nebius.sync_call(
42
+ service.list(
43
+ nebius.iam().ListProjectsRequest(parent_id=nebius.get_tenant_id()),
44
+ timeout=nebius.READ_TIMEOUT))
39
45
 
40
46
  # Check is there project if in config
41
- project_id = skypilot_config.get_nested(('nebius', region, 'project_id'),
42
- None)
47
+ project_id = skypilot_config.get_effective_region_config(
48
+ cloud='nebius', region=region, keys=('project_id',), default_value=None)
43
49
  if project_id is not None:
44
50
  return project_id
45
51
  for project in projects.items:
@@ -54,19 +60,21 @@ def get_or_create_gpu_cluster(name: str, project_id: str, fabric: str) -> str:
54
60
  """
55
61
  service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
56
62
  try:
57
- cluster = service.get_by_name(nebius.nebius_common().GetByNameRequest(
58
- parent_id=project_id,
59
- name=name,
60
- )).wait()
61
- cluster_id = cluster.metadata.id
62
- except nebius.request_error():
63
- cluster = service.create(nebius.compute().CreateGpuClusterRequest(
64
- metadata=nebius.nebius_common().ResourceMetadata(
63
+ cluster = nebius.sync_call(
64
+ service.get_by_name(nebius.nebius_common().GetByNameRequest(
65
65
  parent_id=project_id,
66
66
  name=name,
67
- ),
68
- spec=nebius.compute().GpuClusterSpec(
69
- infiniband_fabric=fabric))).wait()
67
+ )))
68
+ cluster_id = cluster.metadata.id
69
+ except nebius.request_error():
70
+ cluster = nebius.sync_call(
71
+ service.create(nebius.compute().CreateGpuClusterRequest(
72
+ metadata=nebius.nebius_common().ResourceMetadata(
73
+ parent_id=project_id,
74
+ name=name,
75
+ ),
76
+ spec=nebius.compute().GpuClusterSpec(
77
+ infiniband_fabric=fabric))))
70
78
  cluster_id = cluster.resource_id
71
79
  return cluster_id
72
80
 
@@ -76,14 +84,16 @@ def delete_cluster(name: str, region: str) -> None:
76
84
  project_id = get_project_by_region(region)
77
85
  service = nebius.compute().GpuClusterServiceClient(nebius.sdk())
78
86
  try:
79
- cluster = service.get_by_name(nebius.nebius_common().GetByNameRequest(
80
- parent_id=project_id,
81
- name=name,
82
- )).wait()
87
+ cluster = nebius.sync_call(
88
+ service.get_by_name(nebius.nebius_common().GetByNameRequest(
89
+ parent_id=project_id,
90
+ name=name,
91
+ )))
83
92
  cluster_id = cluster.metadata.id
84
93
  logger.debug(f'Found GPU Cluster : {cluster_id}.')
85
- service.delete(
86
- nebius.compute().DeleteGpuClusterRequest(id=cluster_id)).wait()
94
+ nebius.sync_call(
95
+ service.delete(
96
+ nebius.compute().DeleteGpuClusterRequest(id=cluster_id)))
87
97
  logger.debug(f'Deleted GPU Cluster : {cluster_id}.')
88
98
  except nebius.request_error():
89
99
  logger.debug('GPU Cluster does not exist.')
@@ -92,13 +102,23 @@ def delete_cluster(name: str, region: str) -> None:
92
102
  def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
93
103
  """Lists instances associated with API key."""
94
104
  service = nebius.compute().InstanceServiceClient(nebius.sdk())
95
- result = service.list(
96
- nebius.compute().ListInstancesRequest(parent_id=project_id)).wait()
97
-
98
- instances = result
105
+ page_token = ''
106
+ instances = []
107
+ while True:
108
+ result = nebius.sync_call(
109
+ service.list(nebius.compute().ListInstancesRequest(
110
+ parent_id=project_id,
111
+ page_size=100,
112
+ page_token=page_token,
113
+ ),
114
+ timeout=nebius.READ_TIMEOUT))
115
+ instances.extend(result.items)
116
+ if not result.next_page_token: # "" means no more pages
117
+ break
118
+ page_token = result.next_page_token
99
119
 
100
120
  instance_dict: Dict[str, Dict[str, Any]] = {}
101
- for instance in instances.items:
121
+ for instance in instances:
102
122
  info = {}
103
123
  info['status'] = instance.status.state.name
104
124
  info['name'] = instance.metadata.name
@@ -114,12 +134,13 @@ def list_instances(project_id: str) -> Dict[str, Dict[str, Any]]:
114
134
 
115
135
  def stop(instance_id: str) -> None:
116
136
  service = nebius.compute().InstanceServiceClient(nebius.sdk())
117
- service.stop(nebius.compute().StopInstanceRequest(id=instance_id)).wait()
137
+ nebius.sync_call(
138
+ service.stop(nebius.compute().StopInstanceRequest(id=instance_id)))
118
139
  retry_count = 0
119
140
  while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_STOP:
120
141
  service = nebius.compute().InstanceServiceClient(nebius.sdk())
121
- instance = service.get(nebius.compute().GetInstanceRequest(
122
- id=instance_id,)).wait()
142
+ instance = nebius.sync_call(
143
+ service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
123
144
  if instance.status.state.name == 'STOPPED':
124
145
  break
125
146
  time.sleep(POLL_INTERVAL)
@@ -136,12 +157,13 @@ def stop(instance_id: str) -> None:
136
157
 
137
158
  def start(instance_id: str) -> None:
138
159
  service = nebius.compute().InstanceServiceClient(nebius.sdk())
139
- service.start(nebius.compute().StartInstanceRequest(id=instance_id)).wait()
160
+ nebius.sync_call(
161
+ service.start(nebius.compute().StartInstanceRequest(id=instance_id)))
140
162
  retry_count = 0
141
163
  while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_START:
142
164
  service = nebius.compute().InstanceServiceClient(nebius.sdk())
143
- instance = service.get(nebius.compute().GetInstanceRequest(
144
- id=instance_id,)).wait()
165
+ instance = nebius.sync_call(
166
+ service.get(nebius.compute().GetInstanceRequest(id=instance_id,)))
145
167
  if instance.status.state.name == 'RUNNING':
146
168
  break
147
169
  time.sleep(POLL_INTERVAL)
@@ -156,9 +178,19 @@ def start(instance_id: str) -> None:
156
178
  f' to be ready.')
157
179
 
158
180
 
159
- def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
160
- preset: str, region: str, image_family: str, disk_size: int,
161
- user_data: str) -> str:
181
+ def launch(cluster_name_on_cloud: str,
182
+ node_type: str,
183
+ platform: str,
184
+ preset: str,
185
+ region: str,
186
+ image_family: str,
187
+ disk_size: int,
188
+ user_data: str,
189
+ associate_public_ip_address: bool,
190
+ filesystems: List[Dict[str, Any]],
191
+ use_static_ip_address: bool = False,
192
+ use_spot: bool = False,
193
+ network_tier: Optional[resources_utils.NetworkTier] = None) -> str:
162
194
  # Each node must have a unique name to avoid conflicts between
163
195
  # multiple worker VMs. To ensure uniqueness,a UUID is appended
164
196
  # to the node name.
@@ -172,11 +204,26 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
172
204
  # 8 GPU virtual machines can be grouped into a GPU cluster.
173
205
  # The GPU clusters are built with InfiniBand secure high-speed networking.
174
206
  # https://docs.nebius.com/compute/clusters/gpu
175
- if platform in ('gpu-h100-sxm', 'gpu-h200-sxm'):
207
+ if platform in nebius_constants.INFINIBAND_INSTANCE_PLATFORMS:
176
208
  if preset == '8gpu-128vcpu-1600gb':
177
- # Check is there fabric in config
178
- fabric = skypilot_config.get_nested(('nebius', region, 'fabric'),
179
- None)
209
+ fabric = skypilot_config.get_effective_region_config(
210
+ cloud='nebius',
211
+ region=region,
212
+ keys=('fabric',),
213
+ default_value=None)
214
+
215
+ # Auto-select fabric if network_tier=best and no fabric configured
216
+ if (fabric is None and
217
+ str(network_tier) == str(resources_utils.NetworkTier.BEST)):
218
+ try:
219
+ fabric = nebius_constants.get_default_fabric(
220
+ platform, region)
221
+ logger.info(f'Auto-selected InfiniBand fabric {fabric} '
222
+ f'for {platform} in {region}')
223
+ except ValueError as e:
224
+ logger.warning(
225
+ f'InfiniBand fabric auto-selection failed: {e}')
226
+
180
227
  if fabric is None:
181
228
  logger.warning(
182
229
  f'Set up fabric for region {region} in ~/.sky/config.yaml '
@@ -186,24 +233,26 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
186
233
  project_id, fabric)
187
234
 
188
235
  service = nebius.compute().DiskServiceClient(nebius.sdk())
189
- disk = service.create(nebius.compute().CreateDiskRequest(
190
- metadata=nebius.nebius_common().ResourceMetadata(
191
- parent_id=project_id,
192
- name=disk_name,
193
- ),
194
- spec=nebius.compute().DiskSpec(
195
- source_image_family=nebius.compute().SourceImageFamily(
196
- image_family=image_family),
197
- size_gibibytes=disk_size,
198
- type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
199
- ))).wait()
236
+ disk = nebius.sync_call(
237
+ service.create(nebius.compute().CreateDiskRequest(
238
+ metadata=nebius.nebius_common().ResourceMetadata(
239
+ parent_id=project_id,
240
+ name=disk_name,
241
+ ),
242
+ spec=nebius.compute().DiskSpec(
243
+ source_image_family=nebius.compute().SourceImageFamily(
244
+ image_family=image_family),
245
+ size_gibibytes=disk_size,
246
+ type=nebius.compute().DiskSpec.DiskType.NETWORK_SSD,
247
+ ))))
200
248
  disk_id = disk.resource_id
201
249
  retry_count = 0
202
250
  while retry_count < nebius.MAX_RETRIES_TO_DISK_CREATE:
203
- disk = service.get_by_name(nebius.nebius_common().GetByNameRequest(
204
- parent_id=project_id,
205
- name=disk_name,
206
- )).wait()
251
+ disk = nebius.sync_call(
252
+ service.get_by_name(nebius.nebius_common().GetByNameRequest(
253
+ parent_id=project_id,
254
+ name=disk_name,
255
+ )))
207
256
  if disk.status.state.name == 'READY':
208
257
  break
209
258
  logger.debug(f'Waiting for disk {disk_name} to be ready.')
@@ -217,46 +266,102 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
217
266
  f' seconds) while waiting for disk {disk_name}'
218
267
  f' to be ready.')
219
268
 
269
+ filesystems_spec = []
270
+ if filesystems:
271
+ for fs in filesystems:
272
+ filesystems_spec.append(nebius.compute().AttachedFilesystemSpec(
273
+ mount_tag=fs['filesystem_mount_tag'],
274
+ attach_mode=nebius.compute().AttachedFilesystemSpec.AttachMode[
275
+ fs['filesystem_attach_mode']],
276
+ existing_filesystem=nebius.compute().ExistingFilesystem(
277
+ id=fs['filesystem_id'])))
278
+
220
279
  service = nebius.vpc().SubnetServiceClient(nebius.sdk())
221
- sub_net = service.list(nebius.vpc().ListSubnetsRequest(
222
- parent_id=project_id,)).wait()
280
+ sub_net = nebius.sync_call(
281
+ service.list(nebius.vpc().ListSubnetsRequest(parent_id=project_id,)))
223
282
 
224
283
  service = nebius.compute().InstanceServiceClient(nebius.sdk())
225
- service.create(nebius.compute().CreateInstanceRequest(
226
- metadata=nebius.nebius_common().ResourceMetadata(
227
- parent_id=project_id,
228
- name=instance_name,
229
- ),
230
- spec=nebius.compute().InstanceSpec(
231
- gpu_cluster=nebius.compute().InstanceGpuClusterSpec(id=cluster_id,)
232
- if cluster_id is not None else None,
233
- boot_disk=nebius.compute().AttachedDiskSpec(
234
- attach_mode=nebius.compute(
235
- ).AttachedDiskSpec.AttachMode.READ_WRITE,
236
- existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
237
- cloud_init_user_data=user_data,
238
- resources=nebius.compute().ResourcesSpec(platform=platform,
239
- preset=preset),
240
- network_interfaces=[
241
- nebius.compute().NetworkInterfaceSpec(
242
- subnet_id=sub_net.items[0].metadata.id,
243
- ip_address=nebius.compute().IPAddress(),
244
- name='network-interface-0',
245
- public_ip_address=nebius.compute().PublicIPAddress())
246
- ]))).wait()
284
+ logger.debug(f'Creating instance {instance_name} in project {project_id}.')
285
+ nebius.sync_call(
286
+ service.create(nebius.compute().CreateInstanceRequest(
287
+ metadata=nebius.nebius_common().ResourceMetadata(
288
+ parent_id=project_id,
289
+ name=instance_name,
290
+ ),
291
+ spec=nebius.compute().InstanceSpec(
292
+ gpu_cluster=nebius.compute().InstanceGpuClusterSpec(
293
+ id=cluster_id,) if cluster_id is not None else None,
294
+ boot_disk=nebius.compute().AttachedDiskSpec(
295
+ attach_mode=nebius.compute(
296
+ ).AttachedDiskSpec.AttachMode.READ_WRITE,
297
+ existing_disk=nebius.compute().ExistingDisk(id=disk_id)),
298
+ cloud_init_user_data=user_data,
299
+ resources=nebius.compute().ResourcesSpec(platform=platform,
300
+ preset=preset),
301
+ filesystems=filesystems_spec if filesystems_spec else None,
302
+ network_interfaces=[
303
+ nebius.compute().NetworkInterfaceSpec(
304
+ subnet_id=sub_net.items[0].metadata.id,
305
+ ip_address=nebius.compute().IPAddress(),
306
+ name='network-interface-0',
307
+ public_ip_address=nebius.compute().PublicIPAddress(
308
+ static=use_static_ip_address)
309
+ if associate_public_ip_address else None,
310
+ )
311
+ ],
312
+ recovery_policy=nebius.compute().InstanceRecoveryPolicy.FAIL
313
+ if use_spot else None,
314
+ preemptible=nebius.compute().PreemptibleSpec(
315
+ priority=1,
316
+ on_preemption=nebius.compute().PreemptibleSpec.
317
+ PreemptionPolicy.STOP) if use_spot else None,
318
+ ))))
247
319
  instance_id = ''
248
320
  retry_count = 0
249
321
  while retry_count < nebius.MAX_RETRIES_TO_INSTANCE_READY:
250
322
  service = nebius.compute().InstanceServiceClient(nebius.sdk())
251
- instance = service.get_by_name(nebius.nebius_common().GetByNameRequest(
252
- parent_id=project_id,
253
- name=instance_name,
254
- )).wait()
323
+ instance = nebius.sync_call(
324
+ service.get_by_name(nebius.nebius_common().GetByNameRequest(
325
+ parent_id=project_id,
326
+ name=instance_name,
327
+ )))
328
+ instance_id = instance.metadata.id
255
329
  if instance.status.state.name == 'STARTING':
256
- instance_id = instance.metadata.id
257
330
  break
331
+
332
+ # All Instances initially have state=STOPPED and reconciling=True,
333
+ # so we need to wait until reconciling is False.
334
+ if instance.status.state.name == 'STOPPED' and \
335
+ not instance.status.reconciling:
336
+ next_token = ''
337
+ total_operations = 0
338
+ while True:
339
+ operations_response = nebius.sync_call(
340
+ service.list_operations_by_parent(
341
+ nebius.compute().ListOperationsByParentRequest(
342
+ parent_id=project_id,
343
+ page_size=100,
344
+ page_token=next_token,
345
+ )))
346
+ total_operations += len(operations_response.operations)
347
+ for operation in operations_response.operations:
348
+ # Find the most recent operation for the instance.
349
+ if operation.resource_id == instance_id:
350
+ error_msg = operation.description
351
+ if operation.status:
352
+ error_msg += f' {operation.status.message}'
353
+ raise RuntimeError(error_msg)
354
+ # If we've fetched too many operations, or there are no more
355
+ # operations to fetch, just raise a generic error.
356
+ if total_operations > _MAX_OPERATIONS_TO_FETCH or \
357
+ not operations_response.next_page_token:
358
+ raise RuntimeError(
359
+ f'Instance {instance_name} failed to start.')
360
+ next_token = operations_response.next_page_token
258
361
  time.sleep(POLL_INTERVAL)
259
- logger.debug(f'Waiting for instance {instance_name} start running.')
362
+ logger.debug(f'Waiting for instance {instance_name} to start running. '
363
+ f'State: {instance.status.state.name}, '
364
+ f'Reconciling: {instance.status.reconciling}')
260
365
  retry_count += 1
261
366
 
262
367
  if retry_count == nebius.MAX_RETRIES_TO_INSTANCE_READY:
@@ -271,19 +376,19 @@ def launch(cluster_name_on_cloud: str, node_type: str, platform: str,
271
376
  def remove(instance_id: str) -> None:
272
377
  """Terminates the given instance."""
273
378
  service = nebius.compute().InstanceServiceClient(nebius.sdk())
274
- result = service.get(
275
- nebius.compute().GetInstanceRequest(id=instance_id)).wait()
379
+ result = nebius.sync_call(
380
+ service.get(nebius.compute().GetInstanceRequest(id=instance_id)))
276
381
  disk_id = result.spec.boot_disk.existing_disk.id
277
- service.delete(
278
- nebius.compute().DeleteInstanceRequest(id=instance_id)).wait()
382
+ nebius.sync_call(
383
+ service.delete(nebius.compute().DeleteInstanceRequest(id=instance_id)))
279
384
  retry_count = 0
280
385
  # The instance begins deleting and attempts to delete the disk.
281
386
  # Must wait until the disk is unlocked and becomes deletable.
282
387
  while retry_count < nebius.MAX_RETRIES_TO_DISK_DELETE:
283
388
  try:
284
389
  service = nebius.compute().DiskServiceClient(nebius.sdk())
285
- service.delete(
286
- nebius.compute().DeleteDiskRequest(id=disk_id)).wait()
390
+ nebius.sync_call(
391
+ service.delete(nebius.compute().DeleteDiskRequest(id=disk_id)))
287
392
  break
288
393
  except nebius.request_error():
289
394
  logger.debug('Waiting for disk deletion.')
@@ -10,7 +10,7 @@ import copy
10
10
  from datetime import datetime
11
11
  import time
12
12
  import typing
13
- from typing import Any, Dict, List, Optional
13
+ from typing import Any, Dict, List, Optional, Tuple
14
14
 
15
15
  from sky import exceptions
16
16
  from sky import sky_logging
@@ -32,10 +32,12 @@ logger = sky_logging.init_logger(__name__)
32
32
  @query_utils.debug_enabled(logger)
33
33
  @common_utils.retry
34
34
  def query_instances(
35
+ cluster_name: str,
35
36
  cluster_name_on_cloud: str,
36
37
  provider_config: Optional[Dict[str, Any]] = None,
37
38
  non_terminated_only: bool = True,
38
- ) -> Dict[str, Optional['status_lib.ClusterStatus']]:
39
+ retry_if_missing: bool = False,
40
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
39
41
  """Query instances.
40
42
 
41
43
  Returns a dictionary of instance IDs and status.
@@ -43,11 +45,13 @@ def query_instances(
43
45
  A None status means the instance is marked as "terminated"
44
46
  or "terminating".
45
47
  """
48
+ del cluster_name, retry_if_missing # unused
46
49
  assert provider_config is not None, cluster_name_on_cloud
47
50
  region = provider_config['region']
48
51
 
49
52
  status_map = oci_utils.oci_config.STATE_MAPPING_OCI_TO_SKY
50
- statuses: Dict[str, Optional['status_lib.ClusterStatus']] = {}
53
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
54
+ Optional[str]]] = {}
51
55
  filters = {constants.TAG_RAY_CLUSTER_NAME: cluster_name_on_cloud}
52
56
 
53
57
  instances = _get_filtered_nodes(region, filters)
@@ -56,15 +60,16 @@ def query_instances(
56
60
  sky_status = status_map[vm_status]
57
61
  if non_terminated_only and sky_status is None:
58
62
  continue
59
- statuses[node['inst_id']] = sky_status
63
+ statuses[node['inst_id']] = (sky_status, None)
60
64
 
61
65
  return statuses
62
66
 
63
67
 
64
68
  @query_utils.debug_enabled(logger)
65
- def run_instances(region: str, cluster_name_on_cloud: str,
69
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
66
70
  config: common.ProvisionConfig) -> common.ProvisionRecord:
67
71
  """Start instances with bootstrapped configuration."""
72
+ del cluster_name # unused
68
73
  tags = dict(sorted(copy.deepcopy(config.tags).items()))
69
74
 
70
75
  start_time = round(time.time() * 1000)
@@ -1,7 +1,7 @@
1
1
  """Paperspace instance provisioning."""
2
2
 
3
3
  import time
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
6
  from sky import sky_logging
7
7
  from sky.provision import common
@@ -48,10 +48,10 @@ def _get_head_instance_id(instances: Dict[str, Any]) -> Optional[str]:
48
48
  return head_instance_id
49
49
 
50
50
 
51
- def run_instances(region: str, cluster_name_on_cloud: str,
51
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
52
52
  config: common.ProvisionConfig) -> common.ProvisionRecord:
53
53
  """Runs instances for the given cluster."""
54
-
54
+ del cluster_name # unused
55
55
  pending_status = [
56
56
  'starting', 'restarting', 'upgrading', 'provisioning', 'stopping'
57
57
  ]
@@ -277,12 +277,14 @@ def get_cluster_info(
277
277
 
278
278
 
279
279
  def query_instances(
280
+ cluster_name: str,
280
281
  cluster_name_on_cloud: str,
281
282
  provider_config: Optional[Dict[str, Any]] = None,
282
283
  non_terminated_only: bool = True,
283
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
284
+ retry_if_missing: bool = False,
285
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
284
286
  """See sky/provision/__init__.py"""
285
- del non_terminated_only
287
+ del cluster_name, non_terminated_only, retry_if_missing #unused
286
288
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
287
289
  instances = _filter_instances(cluster_name_on_cloud, None)
288
290
 
@@ -297,10 +299,11 @@ def query_instances(
297
299
  'ready': status_lib.ClusterStatus.UP,
298
300
  'off': status_lib.ClusterStatus.STOPPED,
299
301
  }
300
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
302
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
303
+ Optional[str]]] = {}
301
304
  for inst_id, inst in instances.items():
302
305
  status = status_map[inst['state']]
303
- statuses[inst_id] = status
306
+ statuses[inst_id] = (status, None)
304
307
  return statuses
305
308
 
306
309
 
@@ -8,7 +8,7 @@ from typing import Any, Dict, List, Optional, Union
8
8
 
9
9
  from sky import sky_logging
10
10
  from sky.adaptors import common as adaptors_common
11
- import sky.provision.paperspace.constants as constants
11
+ from sky.provision.paperspace import constants
12
12
  from sky.utils import common_utils
13
13
 
14
14
  if typing.TYPE_CHECKING:
@@ -0,0 +1,10 @@
1
+ """Prime Intellect provisioner for SkyPilot."""
2
+
3
+ from sky.provision.primeintellect.config import bootstrap_instances
4
+ from sky.provision.primeintellect.instance import cleanup_ports
5
+ from sky.provision.primeintellect.instance import get_cluster_info
6
+ from sky.provision.primeintellect.instance import query_instances
7
+ from sky.provision.primeintellect.instance import run_instances
8
+ from sky.provision.primeintellect.instance import stop_instances
9
+ from sky.provision.primeintellect.instance import terminate_instances
10
+ from sky.provision.primeintellect.instance import wait_instances
@@ -0,0 +1,11 @@
1
+ """Prime Intellect configuration bootstrapping."""
2
+
3
+ from sky.provision import common
4
+
5
+
6
+ def bootstrap_instances(
7
+ region: str, cluster_name: str,
8
+ config: common.ProvisionConfig) -> common.ProvisionConfig:
9
+ """Bootstraps instances for the given cluster."""
10
+ del region, cluster_name # unused
11
+ return config