skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512) hide show
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,282 @@
1
+ """Kubernetes pvc provisioning."""
2
+ from typing import Any, Dict, List, Optional, Set, Tuple
3
+
4
+ from sky import global_user_state
5
+ from sky import models
6
+ from sky import sky_logging
7
+ from sky.adaptors import kubernetes
8
+ from sky.provision import constants
9
+ from sky.provision.kubernetes import config as config_lib
10
+ from sky.provision.kubernetes import constants as k8s_constants
11
+ from sky.provision.kubernetes import utils as kubernetes_utils
12
+ from sky.utils import volume as volume_lib
13
+
14
+ logger = sky_logging.init_logger(__name__)
15
+
16
+
17
def _get_context_namespace(config: models.VolumeConfig) -> Tuple[str, str]:
    """Resolves the Kubernetes context and namespace for a volume.

    ``config.region`` is used as the context and ``config.config['namespace']``
    as the namespace. Whenever either one is unset it is looked up through
    ``kubernetes_utils`` and written back onto ``config``, so the config is
    fully populated after this call.

    Args:
        config: Volume configuration; may be mutated in place.

    Returns:
        A ``(context, namespace)`` tuple.
    """
    context = config.region
    if context is None:
        # No explicit region: fall back to the current kubeconfig context
        # and remember it on the config.
        context = kubernetes_utils.get_current_kube_config_context_name()
        config.region = context

    namespace = config.config.get('namespace')
    if namespace is None:
        namespace = kubernetes_utils.get_kube_config_context_namespace(context)
        config.config['namespace'] = namespace

    return context, namespace
29
+
30
+
31
def check_pvc_usage_for_pod(context: Optional[str], namespace: str,
                            pod_spec: Dict[str, Any]) -> None:
    """Verifies that single-writer PVCs referenced by a pod spec are free.

    Every volume in ``pod_spec['spec']['volumes']`` that references a
    ``persistentVolumeClaim`` is looked up in the cluster. If the first
    access mode of that PVC is READ_WRITE_ONCE or READ_WRITE_ONCE_POD and
    some pod is already using the claim, the check fails.

    Args:
        context: Kubernetes context to query.
        namespace: Namespace that holds the PVCs.
        pod_spec: Pod manifest as a dict.

    Raises:
        config_lib.KubernetesError: If such a PVC is already in use.
    """
    pod_volumes = pod_spec.get('spec', {}).get('volumes', [])
    if not pod_volumes:
        return

    exclusive_modes = (
        volume_lib.VolumeAccessMode.READ_WRITE_ONCE.value,
        volume_lib.VolumeAccessMode.READ_WRITE_ONCE_POD.value,
    )
    for pod_volume in pod_volumes:
        claim_name = pod_volume.get('persistentVolumeClaim',
                                    {}).get('claimName')
        if not claim_name:
            # Not a PVC-backed volume.
            continue
        claim = kubernetes.core_api(
            context).read_namespaced_persistent_volume_claim(
                name=claim_name, namespace=namespace)
        # Only the first declared access mode is considered.
        access_mode = claim.spec.access_modes[0]
        if access_mode in exclusive_modes:
            holders, _ = _get_volume_usedby(context, namespace, claim_name)
            if holders:
                raise config_lib.KubernetesError(
                    f'Volume {claim_name} with access mode {access_mode} '
                    f'is already in use by Pods {holders}.')
56
+
57
+
58
def apply_volume(config: models.VolumeConfig) -> models.VolumeConfig:
    """Creates or registers a volume.

    Builds the PVC spec for ``config``, verifies that the requested storage
    class (if any) can be read from the cluster, and then hands the spec to
    ``create_persistent_volume_claim``.

    Args:
        config: Volume configuration. Its region/namespace may be filled in
            as a side effect of resolving the context and namespace.

    Returns:
        The same ``config`` object.

    Raises:
        config_lib.KubernetesError: If reading the storage class fails. The
            underlying API exception is attached as ``__cause__``.
    """
    context, namespace = _get_context_namespace(config)
    pvc_spec = _get_pvc_spec(namespace, config)
    # Check if the storage class exists
    storage_class_name = pvc_spec['spec'].get('storageClassName')
    if storage_class_name is not None:
        try:
            kubernetes.storage_api(context).read_storage_class(
                name=storage_class_name)
        except kubernetes.api_exception() as e:
            # Chain explicitly so the original API error stays the direct
            # cause in tracebacks.
            raise config_lib.KubernetesError(
                f'Check storage class {storage_class_name} error: {e}'
            ) from e
    create_persistent_volume_claim(namespace, context, pvc_spec)
    return config
73
+
74
+
75
def delete_volume(config: models.VolumeConfig) -> models.VolumeConfig:
    """Deletes the PVC that backs a volume.

    The deletion is delegated to
    ``kubernetes_utils.delete_k8s_resource_with_retry``.

    Args:
        config: Volume configuration; ``config.name_on_cloud`` is the PVC
            name.

    Returns:
        The same ``config`` object.
    """
    context, namespace = _get_context_namespace(config)
    claim_name = config.name_on_cloud

    def _delete_claim(pvc_name=claim_name):
        # The claim name is bound as a default argument, mirroring the
        # call signature the retry helper is given.
        return kubernetes.core_api(
            context).delete_namespaced_persistent_volume_claim(
                name=pvc_name,
                namespace=namespace,
                _request_timeout=config_lib.DELETION_TIMEOUT)

    kubernetes_utils.delete_k8s_resource_with_retry(
        delete_func=_delete_claim,
        resource_type='pvc',
        resource_name=claim_name)
    logger.info(f'Deleted PVC {claim_name} in namespace {namespace}')
    return config
89
+
90
+
91
def _get_volume_usedby(
    context: Optional[str],
    namespace: str,
    pvc_name: str,
) -> Tuple[List[str], List[str]]:
    """Gets the usedby resources of a volume.

    This function returns the pods and clusters that are using the volume.
    The usedby_pods is accurate, which also includes the Pods that are not
    managed by SkyPilot.

    Pods whose phase is listed in ``k8s_constants.PVC_NOT_HOLD_POD_PHASES``
    are excluded through the field selector.

    Args:
        context: Kubernetes context
        namespace: Kubernetes namespace
        pvc_name: PVC name

    Returns:
        usedby_pods: List of pods using the volume. These may include pods
          not created by SkyPilot.
        usedby_clusters: List of clusters using the volume.
    """
    usedby_pods: List[str] = []
    usedby_clusters: List[str] = []
    field_selector = ','.join([
        f'status.phase!={phase}'
        for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES
    ])
    cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
    # Get all pods in the namespace
    pods = kubernetes.core_api(context).list_namespaced_pod(
        namespace=namespace, field_selector=field_selector)
    for pod in pods.items:
        if pod.spec.volumes is None:
            continue
        for volume in pod.spec.volumes:
            if volume.persistent_volume_claim is None:
                continue
            if volume.persistent_volume_claim.claim_name != pvc_name:
                continue
            usedby_pods.append(pod.metadata.name)
            # Pods not created by SkyPilot may carry no labels at all, in
            # which case the client reports ``labels`` as None.
            labels = pod.metadata.labels or {}
            # Get the real cluster name
            cluster_name_on_cloud = labels.get(
                constants.TAG_SKYPILOT_CLUSTER_NAME)
            if cluster_name_on_cloud is None:
                continue
            cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
            if cluster_name is not None:
                usedby_clusters.append(cluster_name)
    if usedby_pods:
        logger.debug(f'Volume {pvc_name} is used by Pods {usedby_pods}'
                     f' and clusters {usedby_clusters}')
    return usedby_pods, usedby_clusters
142
+
143
+
144
def _get_cluster_name_on_cloud_to_cluster_name_map() -> Dict[str, str]:
    """Builds a lookup from a cluster's on-cloud name to its cluster name.

    Cluster records whose handle is None are left out of the result.
    """
    return {
        record['handle'].cluster_name_on_cloud: record['name']
        for record in global_user_state.get_clusters()
        if record['handle'] is not None
    }
154
+
155
+
156
def get_volume_usedby(
    config: models.VolumeConfig,) -> Tuple[List[str], List[str]]:
    """Returns the pods and clusters currently using the given volume.

    Resolves the context and namespace from the config and delegates to
    _get_volume_usedby with the volume's name_on_cloud as the PVC name.

    Returns:
        A (usedby_pods, usedby_clusters) tuple of name lists.
    """
    context, namespace = _get_context_namespace(config)
    return _get_volume_usedby(context, namespace, config.name_on_cloud)
162
+
163
+
164
def get_all_volumes_usedby(
    configs: List[models.VolumeConfig],
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Gets the usedby resources of all volumes.

    Pods are listed once per (context, namespace) pair referenced by the
    given configs, restricted to pods labeled `parent=skypilot` and to pod
    phases that still hold a PVC.

    Args:
        configs: Volume configs to collect usage for.

    Returns:
        used_by_pods: Nested mapping
          context -> namespace -> PVC name -> list of pod names.
        used_by_clusters: Nested mapping
          context -> namespace -> PVC name -> list of cluster names.
        Both mappings share the same key layout so that
        map_all_volumes_usedby can look either one up by PVC name.
    """
    field_selector = ','.join([
        f'status.phase!={phase}'
        for phase in k8s_constants.PVC_NOT_HOLD_POD_PHASES
    ])
    label_selector = 'parent=skypilot'
    context_to_namespaces: Dict[str, Set[str]] = {}
    pvc_names = set()
    for config in configs:
        context, namespace = _get_context_namespace(config)
        context_to_namespaces.setdefault(context, set()).add(namespace)
        pvc_names.add(config.name_on_cloud)
    cloud_to_name_map = _get_cluster_name_on_cloud_to_cluster_name_map()
    # Get all pods in the namespace
    used_by_pods: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
    used_by_clusters: Dict[str, Dict[str, Dict[str, List[str]]]] = {}
    for context, namespaces in context_to_namespaces.items():
        used_by_pods[context] = {}
        used_by_clusters[context] = {}
        for namespace in namespaces:
            pods_by_volume: Dict[str, List[str]] = {}
            clusters_by_volume: Dict[str, List[str]] = {}
            used_by_pods[context][namespace] = pods_by_volume
            used_by_clusters[context][namespace] = clusters_by_volume
            pods = kubernetes.core_api(context).list_namespaced_pod(
                namespace=namespace,
                field_selector=field_selector,
                label_selector=label_selector)
            for pod in pods.items:
                if pod.spec.volumes is None:
                    continue
                for volume in pod.spec.volumes:
                    if volume.persistent_volume_claim is None:
                        continue
                    volume_name = volume.persistent_volume_claim.claim_name
                    if volume_name not in pvc_names:
                        continue
                    pods_by_volume.setdefault(volume_name,
                                              []).append(pod.metadata.name)
                    # The label selector above guarantees the pod has
                    # labels, so `labels` is not None here.
                    cluster_name_on_cloud = pod.metadata.labels.get(
                        constants.TAG_SKYPILOT_CLUSTER_NAME)
                    if cluster_name_on_cloud is None:
                        continue
                    cluster_name = cloud_to_name_map.get(cluster_name_on_cloud)
                    if cluster_name is None:
                        continue
                    # Key by the volume (PVC) name, not the cluster name:
                    # map_all_volumes_usedby looks clusters up per PVC, so
                    # keying by cluster name made that lookup always miss.
                    clusters_by_volume.setdefault(volume_name,
                                                  []).append(cluster_name)
    return used_by_pods, used_by_clusters
220
+
221
+
222
def map_all_volumes_usedby(
        used_by_pods: Dict[str, Any], used_by_clusters: Dict[str, Any],
        config: models.VolumeConfig) -> Tuple[List[str], List[str]]:
    """Extracts one volume's usage from the pre-collected usage mappings.

    Both mappings are read as context -> namespace -> PVC name -> list.

    Args:
        used_by_pods: Nested mapping of pod names per volume.
        used_by_clusters: Nested mapping of cluster names per volume.
        config: The volume whose entries should be looked up.

    Returns:
        A (pods, clusters) tuple; each element is an empty list when the
        corresponding mapping has no entry for this volume.
    """
    context, namespace = _get_context_namespace(config)
    pvc_name = config.name_on_cloud

    def _lookup(index: Dict[str, Any]) -> List[str]:
        # Missing levels fall back to empty containers at each step.
        return index.get(context, {}).get(namespace, {}).get(pvc_name, [])

    return _lookup(used_by_pods), _lookup(used_by_clusters)
232
+
233
+
234
def create_persistent_volume_claim(namespace: str, context: Optional[str],
                                   pvc_spec: Dict[str, Any]) -> None:
    """Creates the PVC described by pvc_spec unless it already exists.

    Args:
        namespace: Kubernetes namespace to create the PVC in.
        context: Kubernetes context to use.
        pvc_spec: PVC manifest; its metadata.name is used as the PVC name.

    Raises:
        The Kubernetes API exception from the existence check when its
        status is anything other than 404.
    """
    claim_name = pvc_spec['metadata']['name']
    try:
        kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
            name=claim_name, namespace=namespace)
    except kubernetes.api_exception() as err:
        # Only "not found" means we should go on and create the claim.
        if err.status != 404:
            raise
    else:
        logger.debug(f'PVC {claim_name} already exists')
        return
    kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
        namespace=namespace, body=pvc_spec)
    logger.info(f'Created PVC {claim_name} in namespace {namespace}')
249
+
250
+
251
def _get_pvc_spec(namespace: str,
                  config: models.VolumeConfig) -> Dict[str, Any]:
    """Builds the PVC manifest dict for a volume config.

    Args:
        namespace: Namespace recorded in the manifest metadata.
        config: Volume config supplying the name, size, labels and the
            'access_mode' / 'storage_class_name' entries of config.config.

    Returns:
        A dict with 'metadata' and 'spec' sections. 'storageClassName' is
        present in the spec only when a storage class name is configured.
    """
    access_mode = config.config.get('access_mode')
    size = config.size
    # Callers are expected to have filled in both values already.
    assert access_mode is not None
    assert size is not None

    # Built-in labels come first; user-provided labels are merged on top.
    labels: Dict[str, Any] = {
        'parent': 'skypilot',
        'skypilot-name': config.name,
    }
    if config.labels:
        labels.update(config.labels)

    claim_spec: Dict[str, Any] = {
        'accessModes': [access_mode],
        'resources': {
            'requests': {
                'storage': f'{size}Gi'
            }
        },
    }
    storage_class = config.config.get('storage_class_name')
    if storage_class is not None:
        claim_spec['storageClassName'] = storage_class

    return {
        'metadata': {
            'name': config.name_on_cloud,
            'namespace': namespace,
            'labels': labels,
        },
        'spec': claim_spec,
    }
@@ -1,7 +1,7 @@
1
1
  """Lambda Cloud instance provisioning."""
2
2
 
3
3
  import time
4
- from typing import Any, Dict, List, Optional
4
+ from typing import Any, Dict, List, Optional, Tuple
5
5
 
6
6
  from sky import sky_logging
7
7
  from sky.provision import common
@@ -68,9 +68,10 @@ def _get_private_ip(instance_info: Dict[str, Any], single_node: bool) -> str:
68
68
  return private_ip
69
69
 
70
70
 
71
- def run_instances(region: str, cluster_name_on_cloud: str,
71
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
72
72
  config: common.ProvisionConfig) -> common.ProvisionRecord:
73
73
  """Runs instances for the given cluster"""
74
+ del cluster_name # unused
74
75
  lambda_client = _get_lambda_client()
75
76
  pending_status = ['booting']
76
77
  while True:
@@ -106,34 +107,35 @@ def run_instances(region: str, cluster_name_on_cloud: str,
106
107
  created_instance_ids = []
107
108
  remote_ssh_key_name = config.authentication_config['remote_key_name']
108
109
 
109
- def launch_nodes(node_type: str, quantity: int) -> List[str]:
110
+ def launch_node(node_type: str) -> str:
110
111
  try:
111
112
  instance_ids = lambda_client.create_instances(
112
113
  instance_type=config.node_config['InstanceType'],
113
114
  region=region,
114
115
  name=f'{cluster_name_on_cloud}-{node_type}',
115
- quantity=quantity,
116
+ # Quantity cannot actually be greater than 1; see:
117
+ # https://github.com/skypilot-org/skypilot/issues/7084
118
+ quantity=1,
116
119
  ssh_key_name=remote_ssh_key_name,
117
120
  )
118
- logger.info(f'Launched {len(instance_ids)} {node_type} node(s), '
119
- f'instance_ids: {instance_ids}')
120
- return instance_ids
121
+ logger.info(f'Launched {node_type} node, '
122
+ f'instance_id: {instance_ids[0]}')
123
+ return instance_ids[0]
121
124
  except Exception as e:
122
125
  logger.warning(f'run_instances error: {e}')
123
126
  raise
124
127
 
125
128
  if head_instance_id is None:
126
- instance_ids = launch_nodes('head', 1)
127
- assert len(instance_ids) == 1
128
- created_instance_ids.append(instance_ids[0])
129
- head_instance_id = instance_ids[0]
129
+ head_instance_id = launch_node('head')
130
+ created_instance_ids.append(head_instance_id)
130
131
 
131
132
  assert head_instance_id is not None, 'head_instance_id should not be None'
132
133
 
133
134
  worker_node_count = to_start_count - 1
134
135
  if worker_node_count > 0:
135
- instance_ids = launch_nodes('worker', worker_node_count)
136
- created_instance_ids.extend(instance_ids)
136
+ for _ in range(worker_node_count):
137
+ worker_instance_id = launch_node('worker')
138
+ created_instance_ids.append(worker_instance_id)
137
139
 
138
140
  while True:
139
141
  instances = _filter_instances(cluster_name_on_cloud, ['active'])
@@ -226,11 +228,14 @@ def get_cluster_info(
226
228
 
227
229
 
228
230
  def query_instances(
231
+ cluster_name: str,
229
232
  cluster_name_on_cloud: str,
230
233
  provider_config: Optional[Dict[str, Any]] = None,
231
234
  non_terminated_only: bool = True,
232
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
235
+ retry_if_missing: bool = False,
236
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
233
237
  """See sky/provision/__init__.py"""
238
+ del cluster_name, retry_if_missing # unused
234
239
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
235
240
  instances = _filter_instances(cluster_name_on_cloud, None)
236
241
 
@@ -240,12 +245,13 @@ def query_instances(
240
245
  'unhealthy': status_lib.ClusterStatus.INIT,
241
246
  'terminating': None,
242
247
  }
243
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
248
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
249
+ Optional[str]]] = {}
244
250
  for instance_id, instance in instances.items():
245
251
  status = status_map.get(instance['status'])
246
252
  if non_terminated_only and status is None:
247
253
  continue
248
- statuses[instance_id] = status
254
+ statuses[instance_id] = (status, None)
249
255
  return statuses
250
256
 
251
257
 
@@ -0,0 +1,50 @@
1
+ """Constants used by the Nebius provisioner."""
2
+
3
# NOTE(review): presumably a version tag for the Nebius provisioner/API;
# its consumers are not visible from this module - confirm.
VERSION = 'v1'

# InfiniBand-capable instance platforms
INFINIBAND_INSTANCE_PLATFORMS = [
    'gpu-h100-sxm',
    'gpu-h200-sxm',
]

# InfiniBand environment variables for NCCL and UCX
# UCX_NET_DEVICES enumerates port 1 of the devices mlx5_0 through mlx5_7.
INFINIBAND_ENV_VARS = {
    'NCCL_IB_HCA': 'mlx5',
    'UCX_NET_DEVICES': ('mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,'
                        'mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1')
}

# Docker image reference (with the `docker:` prefix) for InfiniBand use.
# NOTE(review): presumably the default image for InfiniBand-enabled
# clusters - confirm against the code that reads this constant.
# pylint: disable=line-too-long
INFINIBAND_IMAGE_ID = 'docker:cr.eu-north1.nebius.cloud/nebius-benchmarks/nccl-tests:2.23.4-ubu22.04-cu12.4'

# Docker run options for InfiniBand support
INFINIBAND_DOCKER_OPTIONS = ['--device=/dev/infiniband', '--cap-add=IPC_LOCK']

# InfiniBand fabric mapping by platform and region
# Based on Nebius documentation
# Keys are (platform, region) tuples; values are lists of fabric names.
# get_default_fabric() returns the first entry of the matching list.
INFINIBAND_FABRIC_MAPPING = {
    # H100 platforms
    ('gpu-h100-sxm', 'eu-north1'): [
        'fabric-2', 'fabric-3', 'fabric-4', 'fabric-6'
    ],

    # H200 platforms
    ('gpu-h200-sxm', 'eu-north1'): ['fabric-7'],
    ('gpu-h200-sxm', 'eu-west1'): ['fabric-5'],
    ('gpu-h200-sxm', 'us-central1'): ['us-central1-a'],
}
37
+
38
+
39
def get_default_fabric(platform: str, region: str) -> str:
    """Returns the first fabric listed for a platform/region pair.

    When the pair has no entry (or an empty one) in
    INFINIBAND_FABRIC_MAPPING, the first fabric of the
    ('gpu-h100-sxm', 'eu-north1') entry is returned instead.

    Args:
        platform: Instance platform name, e.g. 'gpu-h200-sxm'.
        region: Region name, e.g. 'eu-west1'.

    Raises:
        ValueError: If neither the requested pair nor the fallback entry
            provides any fabric.
    """
    candidates = INFINIBAND_FABRIC_MAPPING.get((platform, region), [])
    if candidates:
        return candidates[0]

    # Select north europe region as default
    fallback = INFINIBAND_FABRIC_MAPPING.get(('gpu-h100-sxm', 'eu-north1'),
                                             [])
    if fallback:
        return fallback[0]

    raise ValueError(
        f'No InfiniBand fabric available for platform {platform} '
        f'in region {region}')
@@ -1,6 +1,6 @@
1
1
  """Nebius instance provisioning."""
2
2
  import time
3
- from typing import Any, Dict, List, Optional
3
+ from typing import Any, Dict, List, Optional, Tuple
4
4
 
5
5
  from sky import sky_logging
6
6
  from sky.provision import common
@@ -65,9 +65,10 @@ def _wait_until_no_pending(region: str, cluster_name_on_cloud: str) -> None:
65
65
  f' to be ready.')
66
66
 
67
67
 
68
- def run_instances(region: str, cluster_name_on_cloud: str,
68
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
69
69
  config: common.ProvisionConfig) -> common.ProvisionRecord:
70
70
  """Runs instances for the given cluster."""
71
+ del cluster_name # unused
71
72
  _wait_until_no_pending(region, cluster_name_on_cloud)
72
73
  running_instances = _filter_instances(region, cluster_name_on_cloud,
73
74
  ['RUNNING'])
@@ -124,6 +125,7 @@ def run_instances(region: str, cluster_name_on_cloud: str,
124
125
  node_type = 'head' if head_instance_id is None else 'worker'
125
126
  try:
126
127
  platform, preset = config.node_config['InstanceType'].split('_')
128
+
127
129
  instance_id = utils.launch(
128
130
  cluster_name_on_cloud=cluster_name_on_cloud,
129
131
  node_type=node_type,
@@ -132,7 +134,14 @@ def run_instances(region: str, cluster_name_on_cloud: str,
132
134
  region=region,
133
135
  image_family=config.node_config['ImageId'],
134
136
  disk_size=config.node_config['DiskSize'],
135
- user_data=config.node_config['UserData'])
137
+ user_data=config.node_config['UserData'],
138
+ use_spot=config.node_config['use_spot'],
139
+ associate_public_ip_address=(
140
+ not config.provider_config['use_internal_ips']),
141
+ use_static_ip_address=config.provider_config.get(
142
+ 'use_static_ip_address', False),
143
+ filesystems=config.node_config.get('filesystems', []),
144
+ network_tier=config.node_config.get('network_tier'))
136
145
  except Exception as e: # pylint: disable=broad-except
137
146
  logger.warning(f'run_instances error: {e}')
138
147
  raise
@@ -241,11 +250,14 @@ def get_cluster_info(
241
250
 
242
251
 
243
252
  def query_instances(
253
+ cluster_name: str,
244
254
  cluster_name_on_cloud: str,
245
255
  provider_config: Optional[Dict[str, Any]] = None,
246
256
  non_terminated_only: bool = True,
247
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
257
+ retry_if_missing: bool = False,
258
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
248
259
  """See sky/provision/__init__.py"""
260
+ del cluster_name, retry_if_missing # unused
249
261
  assert provider_config is not None, (cluster_name_on_cloud, provider_config)
250
262
  instances = _filter_instances(provider_config['region'],
251
263
  cluster_name_on_cloud, None)
@@ -257,12 +269,13 @@ def query_instances(
257
269
  'STOPPING': status_lib.ClusterStatus.STOPPED,
258
270
  'DELETING': status_lib.ClusterStatus.STOPPED,
259
271
  }
260
- statuses: Dict[str, Optional[status_lib.ClusterStatus]] = {}
272
+ statuses: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
273
+ Optional[str]]] = {}
261
274
  for inst_id, inst in instances.items():
262
275
  status = status_map[inst['status']]
263
276
  if non_terminated_only and status is None:
264
277
  continue
265
- statuses[inst_id] = status
278
+ statuses[inst_id] = (status, None)
266
279
  return statuses
267
280
 
268
281