skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic; see the release details for more information.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
@@ -33,14 +33,11 @@ provider:
33
33
  networking_mode: {{k8s_networking_mode}}
34
34
 
35
35
  # We use internal IPs since we set up a port-forward between the kubernetes
36
- # cluster and the local machine, or directly use NodePort to reach the
37
- # head node.
36
+ # cluster and the local machine.
38
37
  use_internal_ips: true
39
38
 
40
39
  timeout: {{timeout}}
41
40
 
42
- ssh_jump_image: {{k8s_ssh_jump_image}}
43
-
44
41
  # Namespace used to host SkyPilot system components, such as fuse device
45
42
  # manager.
46
43
  skypilot_system_namespace: {{k8s_skypilot_system_namespace}}
@@ -212,7 +209,9 @@ provider:
212
209
  metadata:
213
210
  labels:
214
211
  parent: skypilot
212
+ # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
215
213
  skypilot-cluster: {{cluster_name_on_cloud}}
214
+ skypilot-cluster-name: {{cluster_name_on_cloud}}
216
215
  skypilot-user: {{ user }}
217
216
  name: {{cluster_name_on_cloud}}-head-ssh
218
217
  spec:
@@ -230,7 +229,9 @@ provider:
230
229
  metadata:
231
230
  labels:
232
231
  parent: skypilot
232
+ # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
233
233
  skypilot-cluster: {{cluster_name_on_cloud}}
234
+ skypilot-cluster-name: {{cluster_name_on_cloud}}
234
235
  skypilot-user: {{ user }}
235
236
  # NOTE: If you're running multiple Ray clusters with services
236
237
  # on one Kubernetes cluster, they must have unique service
@@ -243,6 +244,24 @@ provider:
243
244
  # This selector must match the head node pod's selector below.
244
245
  selector:
245
246
  component: {{cluster_name_on_cloud}}-head
247
+ # Headless service mapping hostnames to rest of the worker nodes
248
+ {% for worker_id in range(1, num_nodes) %}
249
+ - apiVersion: v1
250
+ kind: Service
251
+ metadata:
252
+ labels:
253
+ parent: skypilot
254
+ # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
255
+ skypilot-cluster: {{cluster_name_on_cloud}}
256
+ skypilot-cluster-name: {{cluster_name_on_cloud}}
257
+ skypilot-user: {{ user }}
258
+ name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
259
+ spec:
260
+ selector:
261
+ component: {{cluster_name_on_cloud}}-worker{{ worker_id }}
262
+ clusterIP: None
263
+ {% endfor %}
264
+
246
265
 
247
266
  # Specify the pod type for the ray head node (as configured below).
248
267
  head_node_type: ray_head_default
@@ -255,13 +274,12 @@ available_node_types:
255
274
  metadata:
256
275
  # name will be filled in the provisioner
257
276
  # head node name will be {{cluster_name_on_cloud}}-head, which will match the head node service selector above if a head node
258
- # service is required.
277
+ # service is required. Worker nodes are named {{cluster_name_on_cloud}}-worker{{ node_id }}
259
278
  labels:
260
279
  parent: skypilot
261
280
  # component will be set for the head node pod to be the same as the head node service selector above if a
281
+ # TODO (kyuds): remove this label for v0.11.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
262
282
  skypilot-cluster: {{cluster_name_on_cloud}}
263
- # Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
264
- skypilot-ssh-jump: {{k8s_ssh_jump_name}}
265
283
  skypilot-user: {{ user }}
266
284
  # Custom tags for the pods
267
285
  {%- for label_key, label_value in labels.items() %}
@@ -273,14 +291,100 @@ available_node_types:
273
291
  {% if (k8s_acc_label_key is not none and k8s_acc_label_values is not none) %}
274
292
  skypilot-binpack: "gpu"
275
293
  {% endif %}
294
+ {% if k8s_kueue_local_queue_name %}
295
+ kueue.x-k8s.io/queue-name: {{k8s_kueue_local_queue_name}}
296
+ kueue.x-k8s.io/pod-group-name: {{cluster_name_on_cloud}}
297
+ {% endif %}
298
+ {% if k8s_kueue_local_queue_name or k8s_enable_gpudirect_tcpx or k8s_enable_gpudirect_tcpxo or k8s_enable_gpudirect_rdma %}
299
+ annotations:
300
+ {% if k8s_kueue_local_queue_name %}
301
+ kueue.x-k8s.io/retriable-in-group: "false"
302
+ kueue.x-k8s.io/pod-group-total-count: "{{ num_nodes|string }}"
303
+ {% if k8s_max_run_duration_seconds %}
304
+ provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{k8s_max_run_duration_seconds|string}}"
305
+ {% endif %}
306
+ {% endif %}
307
+ # https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx
308
+ # Values from google cloud guide
309
+ {% if k8s_enable_gpudirect_tcpx %}
310
+ devices.gke.io/container.tcpx-daemon: |+
311
+ - path: /dev/nvidia0
312
+ - path: /dev/nvidia1
313
+ - path: /dev/nvidia2
314
+ - path: /dev/nvidia3
315
+ - path: /dev/nvidia4
316
+ - path: /dev/nvidia5
317
+ - path: /dev/nvidia6
318
+ - path: /dev/nvidia7
319
+ - path: /dev/nvidiactl
320
+ - path: /dev/nvidia-uvm
321
+ networking.gke.io/default-interface: 'eth0'
322
+ networking.gke.io/interfaces: |
323
+ [
324
+ {"interfaceName":"eth0","network":"default"},
325
+ {"interfaceName":"eth1","network":"vpc1"},
326
+ {"interfaceName":"eth2","network":"vpc2"},
327
+ {"interfaceName":"eth3","network":"vpc3"},
328
+ {"interfaceName":"eth4","network":"vpc4"}
329
+ ]
330
+ {% endif %}
331
+ {% if k8s_enable_gpudirect_tcpxo %}
332
+ devices.gke.io/container.tcpxo-daemon: |+
333
+ - path: /dev/nvidia0
334
+ - path: /dev/nvidia1
335
+ - path: /dev/nvidia2
336
+ - path: /dev/nvidia3
337
+ - path: /dev/nvidia4
338
+ - path: /dev/nvidia5
339
+ - path: /dev/nvidia6
340
+ - path: /dev/nvidia7
341
+ - path: /dev/nvidiactl
342
+ - path: /dev/nvidia-uvm
343
+ - path: /dev/dmabuf_import_helper
344
+ networking.gke.io/default-interface: 'eth0'
345
+ networking.gke.io/interfaces: |
346
+ [
347
+ {"interfaceName":"eth0","network":"default"},
348
+ {"interfaceName":"eth1","network":"vpc1"},
349
+ {"interfaceName":"eth2","network":"vpc2"},
350
+ {"interfaceName":"eth3","network":"vpc3"},
351
+ {"interfaceName":"eth4","network":"vpc4"},
352
+ {"interfaceName":"eth5","network":"vpc5"},
353
+ {"interfaceName":"eth6","network":"vpc6"},
354
+ {"interfaceName":"eth7","network":"vpc7"},
355
+ {"interfaceName":"eth8","network":"vpc8"}
356
+ ]
357
+ {% endif %}
358
+ {% if k8s_enable_gpudirect_rdma %}
359
+ networking.gke.io/default-interface: 'eth0'
360
+ networking.gke.io/interfaces: |
361
+ [
362
+ {"interfaceName":"eth0","network":"default"},
363
+ {"interfaceName":"eth1","network":"gvnic-1"},
364
+ {"interfaceName":"eth2","network":"rdma-0"},
365
+ {"interfaceName":"eth3","network":"rdma-1"},
366
+ {"interfaceName":"eth4","network":"rdma-2"},
367
+ {"interfaceName":"eth5","network":"rdma-3"},
368
+ {"interfaceName":"eth6","network":"rdma-4"},
369
+ {"interfaceName":"eth7","network":"rdma-5"},
370
+ {"interfaceName":"eth8","network":"rdma-6"},
371
+ {"interfaceName":"eth9","network":"rdma-7"}
372
+ ]
373
+ {% endif %}
374
+ {% endif %}
276
375
  spec:
277
376
  # serviceAccountName: skypilot-service-account
278
377
  serviceAccountName: {{k8s_service_account_name}}
279
378
  automountServiceAccountToken: {{k8s_automount_sa_token}}
280
379
  restartPolicy: {{ "Always" if high_availability else "Never" }}
380
+ {% if volume_mounts %}
381
+ securityContext:
382
+ fsGroup: 1000
383
+ fsGroupChangePolicy: OnRootMismatch
384
+ {% endif %}
281
385
 
282
386
  # Add node selector if GPU/TPUs are requested:
283
- {% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) %}
387
+ {% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) or (k8s_enable_flex_start) %}
284
388
  nodeSelector:
285
389
  {% if k8s_topology_label_key is not none and k8s_topology_label_value is not none %}
286
390
  {{k8s_topology_label_key}}: {{k8s_topology_label_value}}
@@ -288,6 +392,9 @@ available_node_types:
288
392
  {% if k8s_spot_label_key is not none %}
289
393
  {{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}}
290
394
  {% endif %}
395
+ {% if k8s_enable_flex_start %}
396
+ cloud.google.com/gke-flex-start: "true"
397
+ {% endif %}
291
398
  {% endif %}
292
399
  {% if (k8s_acc_label_key is not none and k8s_acc_label_values is not none) or (avoid_label_keys is not none) %}
293
400
  affinity:
@@ -339,9 +446,6 @@ available_node_types:
339
446
  # object store. If you do not provide this, Ray will fall back to
340
447
  # /tmp which cause slowdowns if is not a shared memory volume.
341
448
  volumes:
342
- - name: secret-volume
343
- secret:
344
- secretName: {{k8s_ssh_key_secret_name}}
345
449
  - name: dshm
346
450
  emptyDir:
347
451
  medium: Memory
@@ -356,19 +460,176 @@ available_node_types:
356
460
  persistentVolumeClaim:
357
461
  claimName: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
358
462
  {% endif %}
463
+ {% for volume_mount in volume_mounts %}
464
+ - name: {{volume_mount.name}}
465
+ persistentVolumeClaim:
466
+ claimName: {{volume_mount.volume_name_on_cloud}}
467
+ {% endfor %}
468
+ {% if k8s_enable_gpudirect_tcpx %}
469
+ - name: libraries
470
+ hostPath:
471
+ path: /home/kubernetes/bin/nvidia/lib64
472
+ - name: tcpx-socket
473
+ emptyDir: {}
474
+ - name: sys
475
+ hostPath:
476
+ path: /sys
477
+ - name: proc-sys
478
+ hostPath:
479
+ path: /proc/sys
480
+ {% endif %}
481
+ {% if k8s_enable_gpudirect_tcpxo %}
482
+ - name: libraries
483
+ hostPath:
484
+ path: /home/kubernetes/bin/nvidia
485
+ - name: sys
486
+ hostPath:
487
+ path: /sys
488
+ - name: proc-sys
489
+ hostPath:
490
+ path: /proc/sys
491
+ - name: aperture-devices
492
+ hostPath:
493
+ path: /dev/aperture_devices
494
+ {% endif %}
495
+ {% if k8s_enable_gpudirect_rdma %}
496
+ - name: library-dir-host
497
+ hostPath:
498
+ path: /home/kubernetes/bin/nvidia
499
+ - name: gib
500
+ hostPath:
501
+ path: /home/kubernetes/bin/gib
502
+ {% endif %}
359
503
  containers:
360
504
  - name: ray-node
361
- imagePullPolicy: IfNotPresent
505
+ imagePullPolicy: Always
362
506
  image: {{image_id}}
363
507
  env:
364
508
  - name: SKYPILOT_POD_NODE_TYPE
365
509
  valueFrom:
366
510
  fieldRef:
367
511
  fieldPath: metadata.labels['ray-node-type']
512
+ - name: SKYPILOT_POD_CPU_CORE_LIMIT
513
+ valueFrom:
514
+ resourceFieldRef:
515
+ containerName: ray-node
516
+ resource: requests.cpu
517
+ - name: SKYPILOT_POD_MEMORY_BYTES_LIMIT
518
+ valueFrom:
519
+ resourceFieldRef:
520
+ containerName: ray-node
521
+ resource: requests.memory
368
522
  {% for key, value in k8s_env_vars.items() if k8s_env_vars is not none %}
369
523
  - name: {{ key }}
370
524
  value: {{ value }}
371
525
  {% endfor %}
526
+ # https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl
527
+ # Page recommends setting NCCL values for GPUDirect TCPX for best performance.
528
+ {% if k8s_enable_gpudirect_tcpx %}
529
+ - name: LD_LIBRARY_PATH
530
+ value: /usr/local/nvidia/lib64:/usr/local/tcpx/lib64
531
+ - name: NCCL_GPUDIRECTTCPX_SOCKET_IFNAME
532
+ value: eth1,eth2,eth3,eth4
533
+ - name: NCCL_GPUDIRECTTCPX_CTRL_DEV
534
+ value: eth0
535
+ - name: NCCL_GPUDIRECTTCPX_TX_BINDINGS
536
+ value: "eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
537
+ - name: NCCL_GPUDIRECTTCPX_RX_BINDINGS
538
+ value: "eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"
539
+ - name: NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS
540
+ value: "500000"
541
+ - name: NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX
542
+ value: "/tmp"
543
+ - name: NCCL_GPUDIRECTTCPX_FORCE_ACK
544
+ value: "0"
545
+ - name: NCCL_SOCKET_IFNAME
546
+ value: eth0
547
+ - name: NCCL_CROSS_NIC
548
+ value: "0"
549
+ - name: NCCL_ALGO
550
+ value: Ring
551
+ - name: NCCL_PROTO
552
+ value: Simple
553
+ - name: NCCL_NSOCKS_PERTHREAD
554
+ value: "4"
555
+ - name: NCCL_SOCKET_NTHREADS
556
+ value: "1"
557
+ - name: NCCL_NET_GDR_LEVEL
558
+ value: PIX
559
+ - name: NCCL_DYNAMIC_CHUNK_SIZE
560
+ value: "524288"
561
+ - name: NCCL_P2P_PXN_LEVEL
562
+ value: "0"
563
+ - name: NCCL_P2P_NET_CHUNKSIZE
564
+ value: "524288"
565
+ - name: NCCL_P2P_PCI_CHUNKSIZE
566
+ value: "524288"
567
+ - name: NCCL_P2P_NVL_CHUNKSIZE
568
+ value: "1048576"
569
+ - name: NCCL_BUFFSIZE
570
+ value: "4194304"
571
+ - name: NCCL_MAX_NCHANNELS
572
+ value: "8"
573
+ - name: NCCL_MIN_NCHANNELS
574
+ value: "8"
575
+ - name: CUDA_VISIBLE_DEVICES
576
+ value: "0,1,2,3,4,5,6,7"
577
+ {% endif %}
578
+ {% if k8s_enable_gpudirect_tcpxo %}
579
+ - name: LD_LIBRARY_PATH
580
+ value: /usr/local/nvidia/lib64
581
+ - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY
582
+ value: /dev/aperture_devices
583
+ - name: NCCL_FASTRAK_CTRL_DEV
584
+ value: eth0
585
+ - name: NCCL_FASTRAK_IFNAME
586
+ value: eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8
587
+ - name: NCCL_SOCKET_IFNAME
588
+ value: eth0
589
+ - name: NCCL_CROSS_NIC
590
+ value: "0"
591
+ - name: NCCL_ALGO
592
+ value: Ring,Tree
593
+ - name: NCCL_PROTO
594
+ value: Simple,LL128
595
+ - name: NCCL_MIN_NCHANNELS
596
+ value: "4"
597
+ - name: NCCL_TUNER_PLUGIN
598
+ value: libnccl-tuner.so
599
+ - name: NCCL_TUNER_CONFIG_PATH
600
+ value: /usr/local/nvidia/lib64/a3plus_tuner_config.textproto
601
+ - name: CUDA_VISIBLE_DEVICES
602
+ value: "0,1,2,3,4,5,6,7"
603
+ {% endif %}
604
+ {% if k8s_enable_gpudirect_rdma %}
605
+ - name: LD_LIBRARY_PATH
606
+ value: /usr/local/nvidia/lib64
607
+ - name: NCCL_NET
608
+ value: gIB
609
+ - name: NCCL_CROSS_NIC
610
+ value: "0"
611
+ - name: NCCL_NET_GDR_LEVEL
612
+ value: PIX
613
+ - name: NCCL_P2P_NET_CHUNKSIZE
614
+ value: "131072"
615
+ - name: NCCL_NVLS_CHUNKSIZE
616
+ value: "524288"
617
+ - name: NCCL_IB_ADAPTIVE_ROUTING
618
+ value: "1"
619
+ - name: NCCL_IB_QPS_PER_CONNECTION
620
+ value: "4"
621
+ - name: NCCL_IB_TC
622
+ value: "52"
623
+ - name: NCCL_IB_FIFO_TC
624
+ value: "84"
625
+ {% if k8s_enable_gpudirect_rdma_a4 %}
626
+ - name: NCCL_TUNER_CONFIG_PATH
627
+ value: /usr/local/gib/configs/tuner_config_a4.txtpb
628
+ {% else %}
629
+ - name: NCCL_TUNER_CONFIG_PATH
630
+ value: /usr/local/gib/configs/tuner_config_a3u.txtpb
631
+ {% endif %}
632
+ {% endif %}
372
633
  {% if k8s_fuse_device_required %}
373
634
  - name: FUSERMOUNT_SHARED_DIR
374
635
  value: {{k8s_fusermount_shared_dir}}
@@ -378,13 +639,9 @@ available_node_types:
378
639
  command: ["/bin/bash", "-c", "--"]
379
640
  args:
380
641
  - |
381
- # For backwards compatibility, we put a marker file in the pod
382
- # to indicate that the pod is running with the changes introduced
383
- # in project nimbus: https://github.com/skypilot-org/skypilot/pull/4393
384
- # TODO: Remove this marker file and it's usage in setup_commands
385
- # after v0.10.0 release.
386
- touch /tmp/skypilot_is_nimbus
387
-
642
+ # Set -x to print the commands and their arguments as they are executed.
643
+ # Useful for debugging.
644
+ set -x
388
645
  # Helper function to conditionally use sudo
389
646
  # TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
390
647
  prefix_cmd() { if [ $(id -u) -ne 0 ]; then echo "sudo"; else echo ""; fi; }
@@ -395,14 +652,138 @@ available_node_types:
395
652
  # STEP 1: Run apt update, install missing packages, and set up ssh.
396
653
  (
397
654
  (
398
- DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > /tmp/apt-update.log 2>&1 || \
399
- echo "Warning: apt-get update failed. Continuing anyway..." >> /tmp/apt-update.log
655
+ # For backwards compatibility, we put a marker file in the pod
656
+ # to indicate that the apt ssh setup step will write a completion
657
+ # marker file (/tmp/apt_ssh_setup_complete) to the pod.
658
+ # TODO: Remove this marker file and its usage in setup_commands
659
+ # after v0.11.0 release.
660
+ touch /tmp/apt_ssh_setup_started
661
+
662
+ # Helper: run apt-get update with retries
663
+ apt_update_with_retries() {
664
+ # do not fail the whole shell; we handle return codes
665
+ set +e
666
+ local log=/tmp/apt-update.log
667
+ local tries=3
668
+ local delay=1
669
+ local i
670
+ for i in $(seq 1 $tries); do
671
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update >> "$log" 2>&1 && { set -e; return 0; }
672
+ echo "apt-get update attempt $i/$tries failed; retrying in ${delay}s" >> "$log"
673
+ sleep $delay
674
+ delay=$((delay * 2))
675
+ done
676
+ set -e
677
+ return 1
678
+ }
679
+ apt_install_with_retries() {
680
+ local packages="$@"
681
+ [ -z "$packages" ] && return 0
682
+ set +e
683
+ local log=/tmp/apt-update.log
684
+ local tries=3
685
+ local delay=1
686
+ local i
687
+ for i in $(seq 1 $tries); do
688
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $packages && { set -e; return 0; }
689
+ echo "apt-get install failed for: $packages (attempt $i/$tries). Running -f install and retrying..." >> "$log"
690
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get -f install -y >> "$log" 2>&1 || true
691
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get clean >> "$log" 2>&1 || true
692
+ sleep $delay
693
+ delay=$((delay * 2))
694
+ done
695
+ set -e
696
+ return 1
697
+ }
698
+ apt_update_install_with_retries() {
699
+ apt_update_with_retries
700
+ apt_install_with_retries "$@"
701
+ }
702
+ backup_dir=/etc/apt/sources.list.backup_skypilot
703
+ backup_source() {
704
+ $(prefix_cmd) mkdir -p "$backup_dir"
705
+ if [ -f /etc/apt/sources.list ] && [ ! -f "$backup_dir/sources.list" ]; then
706
+ $(prefix_cmd) cp -a /etc/apt/sources.list "$backup_dir/sources.list" || true
707
+ fi
708
+ }
709
+ restore_source() {
710
+ if [ -f "$backup_dir/sources.list" ]; then
711
+ $(prefix_cmd) cp -a "$backup_dir/sources.list" /etc/apt/sources.list || true
712
+ fi
713
+ }
714
+ update_apt_sources() {
715
+ local host=$1
716
+ local apt_file=$2
717
+ $(prefix_cmd) sed -i -E "s|https?://[a-zA-Z0-9.-]+\.ubuntu\.com/ubuntu|http://$host/ubuntu|g" $apt_file
718
+ }
719
+ # Helper: install packages across mirrors with retries
720
+ apt_install_with_mirrors() {
721
+ local required=$1; shift
722
+ local packages="$@"
723
+ [ -z "$packages" ] && return 0
724
+ set +e
725
+ # Install packages with default sources first
726
+ local log=/tmp/apt-update.log
727
+ echo "$(date +%Y-%m-%d\ %H:%M:%S) Installing packages: $packages" >> "$log"
728
+ restore_source
729
+ apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
730
+ echo "Install failed with default sources: $packages" >> "$log"
731
+ # Detect distro (ubuntu/debian)
732
+ local APT_OS="unknown"
733
+ if [ -f /etc/os-release ]; then
734
+ . /etc/os-release
735
+ case "$ID" in
736
+ debian) APT_OS="debian" ;;
737
+ ubuntu) APT_OS="ubuntu" ;;
738
+ *)
739
+ if [ -n "$ID_LIKE" ]; then
740
+ case " $ID $ID_LIKE " in
741
+ *ubuntu*) APT_OS="ubuntu" ;;
742
+ *debian*) APT_OS="debian" ;;
743
+ esac
744
+ fi
745
+ ;;
746
+ esac
747
+ fi
748
+ # Build mirror candidates
749
+ # deb.debian.org is a CDN endpoint, if one backend goes down,
750
+ # the CDN automatically fails over to another mirror,
751
+ # so we only retry for ubuntu here.
752
+ if [ "$APT_OS" = "ubuntu" ]; then
753
+ # Backup current sources once
754
+ backup_source
755
+ # Selected from https://launchpad.net/ubuntu/+archivemirrors
756
+ # and results from apt-select
757
+ local MIRROR_CANDIDATES="mirrors.wikimedia.org mirror.umd.edu"
758
+ for host in $MIRROR_CANDIDATES; do
759
+ echo "Trying APT mirror ($APT_OS): $host" >> "$log"
760
+ if [ -f /etc/apt/sources.list ]; then
761
+ update_apt_sources $host /etc/apt/sources.list
762
+ else
763
+ echo "Error: /etc/apt/sources.list not found" >> "$log"
764
+ break
765
+ fi
766
+ apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
767
+ echo "Install failed with mirror ($APT_OS): $host" >> "$log"
768
+ # Restore to default sources
769
+ restore_source
770
+ done
771
+ fi
772
+ set -e
773
+ if [ "$required" = "1" ]; then
774
+ echo "Error: required package install failed across all mirrors: $packages" >> "$log"
775
+ return 1
776
+ else
777
+ echo "Optional package install failed across all mirrors: $packages; skipping." >> "$log"
778
+ return 0
779
+ fi
780
+ }
400
781
  # Install both fuse2 and fuse3 for compatibility for all possible fuse adapters in advance,
401
782
  # so that both fusemount and fusermount3 can be masked before enabling SSH access.
402
783
  PACKAGES="rsync curl wget netcat gcc patch pciutils fuse fuse3 openssh-server";
403
784
 
404
785
  # Separate packages into two groups: packages that are installed first
405
- # so that curl, rsync and wget are available sooner to unblock the following
786
+ # so that curl, rsync, ssh and wget are available sooner to unblock the following
406
787
  # conda installation and rsync.
407
788
  # Also, we install fuse first to avoid confliction with fuse3.
408
789
  set -e
@@ -423,7 +804,7 @@ available_node_types:
423
804
  done;
424
805
  if [ ! -z "$INSTALL_FIRST" ]; then
425
806
  echo "Installing core packages: $INSTALL_FIRST";
426
- DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $INSTALL_FIRST;
807
+ apt_install_with_mirrors 1 $INSTALL_FIRST || { echo "Error: core package installation failed." >> /tmp/apt-update.log; exit 1; }
427
808
  fi;
428
809
  # SSH and other packages are not necessary, so we disable set -e
429
810
  set +e
@@ -447,7 +828,8 @@ available_node_types:
447
828
  fi
448
829
  $(prefix_cmd) cp -p "$FUSERMOUNT_PATH" "${FUSERMOUNT_PATH}-original"
449
830
  $(prefix_cmd) ln -sf {{k8s_fusermount_shared_dir}}/fusermount-shim "$FUSERMOUNT_PATH"
450
- FUSERMOUNT3_PATH=$(which fusermount3)
831
+ # "|| true" because fusermount3 is not always available
832
+ FUSERMOUNT3_PATH=$(which fusermount3) || true
451
833
  if [ -z "$FUSERMOUNT3_PATH" ]; then
452
834
  FUSERMOUNT3_PATH="${FUSERMOUNT_PATH}3"
453
835
  fi
@@ -489,16 +871,23 @@ available_node_types:
489
871
  $(prefix_cmd) mkdir -p ~/.ssh;
490
872
  $(prefix_cmd) chown -R $(whoami) ~/.ssh;
491
873
  $(prefix_cmd) chmod 700 ~/.ssh;
492
- $(prefix_cmd) cat /etc/secret-volume/ssh-publickey* > ~/.ssh/authorized_keys;
874
+ $(prefix_cmd) cat > ~/.ssh/authorized_keys <<'SKYPILOT_SSH_KEY_EOF'
875
+ skypilot:ssh_public_key_content
876
+ SKYPILOT_SSH_KEY_EOF
493
877
  $(prefix_cmd) chmod 644 ~/.ssh/authorized_keys;
494
878
  $(prefix_cmd) service ssh restart;
495
879
  $(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
496
880
 
497
- ) > /tmp/${STEPS[0]}.log 2>&1 || {
498
- echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed
881
+ touch /tmp/apt_ssh_setup_complete
882
+ echo "=== SSH setup completed ==="
883
+ ) > /tmp/${STEPS[0]}.log 2>&1
884
+ if [ "$?" -ne "0" ]; then
885
+ {
886
+ echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed 2>&1
499
887
  cat /tmp/${STEPS[0]}.log
500
888
  exit 1
501
- }
889
+ }
890
+ fi
502
891
  ) &
503
892
 
504
893
  # STEP 2: Install conda, ray and skypilot (for dependencies); start
@@ -516,7 +905,21 @@ available_node_types:
516
905
  {{ conda_installation_commands }}
517
906
  {{ ray_installation_commands }}
518
907
 
519
- VIRTUAL_ENV=~/skypilot-runtime ~/.local/bin/uv pip install skypilot[kubernetes,remote]
908
+ # set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
909
+ # unset PYTHONPATH in case the user provided docker image set it.
910
+ VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip install skypilot[kubernetes,remote]
911
+ # Wait for `patch` package to be installed before applying ray patches
912
+ until dpkg -l | grep -q "^ii patch "; do
913
+ sleep 0.1
914
+ echo "Waiting for patch package to be installed..."
915
+ done
916
+ # Apply Ray patches for progress bar fix
917
+ # set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
918
+ # unset PYTHONPATH in case the user provided docker image set it.
919
+ # ~/.sky/python_path is seeded by conda_installation_commands
920
+ VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
921
+ env -u PYTHONPATH $(cat ~/.sky/python_path) -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
922
+ }
520
923
  touch /tmp/ray_skypilot_installation_complete
521
924
  echo "=== Ray and skypilot installation completed ==="
522
925
 
@@ -544,11 +947,14 @@ available_node_types:
544
947
  set +e
545
948
  {{ ray_worker_start_command }}
546
949
  fi
547
- ) > /tmp/${STEPS[1]}.log 2>&1 || {
548
- echo "Error: ${STEPS[1]} failed. Continuing anyway..." > /tmp/${STEPS[1]}.failed
950
+ ) > /tmp/${STEPS[1]}.log 2>&1
951
+ if [ "$?" -ne "0" ]; then
952
+ {
953
+ echo "Error: ${STEPS[1]} failed. Continuing anyway..." > /tmp/${STEPS[1]}.failed 2>&1
549
954
  cat /tmp/${STEPS[1]}.log
550
955
  exit 1
551
- }
956
+ }
957
+ fi
552
958
  ) &
553
959
 
554
960
 
@@ -566,11 +972,14 @@ available_node_types:
566
972
  fi;
567
973
  fi;
568
974
  export -p > ~/container_env_var.sh && $(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh
569
- ) > /tmp/${STEPS[2]}.log 2>&1 || {
570
- echo "Error: ${STEPS[2]} failed. Continuing anyway..." > /tmp/${STEPS[2]}.failed
975
+ ) > /tmp/${STEPS[2]}.log 2>&1
976
+ if [ "$?" -ne "0" ]; then
977
+ {
978
+ echo "Error: ${STEPS[2]} failed. Continuing anyway..." > /tmp/${STEPS[2]}.failed 2>&1
571
979
  cat /tmp/${STEPS[2]}.log
572
980
  exit 1
573
- }
981
+ }
982
+ fi
574
983
  ) &
575
984
 
576
985
  function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
@@ -623,23 +1032,72 @@ available_node_types:
623
1032
  {% if high_availability %}
624
1033
  mkdir -p {{k8s_high_availability_deployment_run_script_dir}}
625
1034
  if [ -f {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready ]; then
1035
+ SKYPILOT_HA_RECOVERY_LOG="{{ha_recovery_log_path}}"
1036
+ echo "Starting HA recovery at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
1037
+ start_time=$SECONDS
1038
+ retry_count=0
1039
+
1040
+ # Wait for Ray to be ready, as the following commands is depending on Ray.
1041
+ GET_RAY_STATUS_CMD=$({{sky_python_cmd}} -c 'from sky.provision import instance_setup; print(instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND)')
1042
+ while true; do
1043
+ retry_count=$((retry_count + 1))
1044
+ current_duration=$(( SECONDS - start_time ))
1045
+ echo "Attempt $retry_count to get Ray status after $current_duration seconds..." >> $SKYPILOT_HA_RECOVERY_LOG
1046
+
1047
+ bash --login -c "$GET_RAY_STATUS_CMD"
1048
+ if [ $? -eq 0 ]; then
1049
+ wait_duration=$(( SECONDS - start_time ))
1050
+ echo "Ray ready after waiting $wait_duration seconds (took $retry_count attempts)" >> $SKYPILOT_HA_RECOVERY_LOG
1051
+ break
1052
+ fi
1053
+ echo "Waiting for Ray to be ready..." >> $SKYPILOT_HA_RECOVERY_LOG
1054
+ sleep 2
1055
+ done
1056
+
626
1057
  # ! Keep this aligned with `CloudVmRayBackend._setup()`
627
- # Suppose all `task.setup` are the same for skyserve controller task.
1058
+ # Suppose all `task.setup` are the same for sky serve / managed jobs controller task.
628
1059
  # So be careful for compatibility issue once you change it.
629
1060
  chmod +x {{k8s_high_availability_deployment_setup_script_path}}
630
1061
  /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && {{k8s_high_availability_deployment_setup_script_path}} > /tmp/controller_recovery_setup_commands.log 2>&1"
631
- echo "=== Controller setup commands completed for recovery ==="
1062
+ echo "=== Controller setup commands completed for recovery at $(date) ===" >> $SKYPILOT_HA_RECOVERY_LOG
632
1063
 
1064
+ touch {{k8s_high_availability_restarting_signal_file}}
1065
+ # Get all in-progress jobs from managed jobs controller. We skip any jobs that are already done.
1066
+ # Also, skip the jobs that are waiting to be scheduled as those does not have a controller process running.
1067
+ # For SkyServe, this will be None and every service will be recovered. This is because SkyServe
1068
+ # will delete the service from the database after it is terminated so everything in the database is running.
1069
+ ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs, _ = state.get_managed_jobs_with_filters(fields=['job_id', 'schedule_state']); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
1070
+ if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
1071
+ read -ra ALL_IN_PROGRESS_JOBS_SEQ <<< "$ALL_IN_PROGRESS_JOBS"
1072
+ fi
633
1073
  for file in {{k8s_high_availability_deployment_run_script_dir}}/*; do
1074
+ # This is the cluster job id on managed jobs controller, but it is guaranteed to be the same as the managed job id,
1075
+ # so we directly use it here. See `CloudVmRayBackend._exec_code_on_head::_dump_code_to_file` for more details.
1076
+ JOB_ID=$(basename $file | sed 's/sky_job_//')
1077
+ # If the list of in-progress jobs is not None (meaning this is a managed job HA controller) and job is not in-progress, skip.
1078
+ if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
1079
+ if [[ ! " ${ALL_IN_PROGRESS_JOBS_SEQ[@]} " =~ " ${JOB_ID} " ]]; then
1080
+ continue
1081
+ fi
1082
+ fi
634
1083
  # ! Keep this aligned with `CloudVmRayBackend._execute()`
635
1084
  chmod +x $file
1085
+ # TODO(tian): This logic may run a lot of things if the jobs controller previously had many jobs.
1086
+ # We should do more tests and make sure it will scale well.
636
1087
  /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && $file > /tmp/task_run_$(basename $file).log 2>&1"
637
- echo "=== Controller task run for service (file: $file) completed for recovery ==="
1088
+ echo "=== Controller task run for service / job (file: $file) completed for recovery at $(date) ===" >> $SKYPILOT_HA_RECOVERY_LOG
638
1089
  done
1090
+ rm {{k8s_high_availability_restarting_signal_file}}
1091
+
1092
+ duration=$(( SECONDS - start_time ))
1093
+ echo "HA recovery completed at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
1094
+ echo "Total recovery time: $duration seconds" >> $SKYPILOT_HA_RECOVERY_LOG
639
1095
  fi
640
1096
 
641
1097
  touch {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready
642
1098
  {% endif %}
1099
+ # Set +x to stop printing the commands and their arguments as they are executed.
1100
+ set +x
643
1101
 
644
1102
  trap : TERM INT; log_tail || sleep infinity & wait
645
1103
 
@@ -653,14 +1111,27 @@ available_node_types:
653
1111
  # object store. If you do not provide this, Ray will fall back to
654
1112
  # /tmp which cause slowdowns if is not a shared memory volume.
655
1113
  volumeMounts:
656
- - name: secret-volume
657
- readOnly: true
658
- mountPath: "/etc/secret-volume"
659
- # This volume allocates shared memory for Ray to use for its plasma
660
- # object store. If you do not provide this, Ray will fall back to
661
- # /tmp which cause slowdowns if is not a shared memory volume.
662
1114
  - mountPath: /dev/shm
663
1115
  name: dshm
1116
+ {% if k8s_enable_gpudirect_tcpx %}
1117
+ - name: tcpx-socket
1118
+ mountPath: /tmp
1119
+ - name: libraries
1120
+ mountPath: /usr/local/nvidia/lib64
1121
+ readOnly: true
1122
+ {% endif %}
1123
+ {% if k8s_enable_gpudirect_tcpxo %}
1124
+ - name: libraries
1125
+ mountPath: /usr/local/nvidia
1126
+ - name: aperture-devices
1127
+ mountPath: /dev/aperture_devices
1128
+ {% endif %}
1129
+ {% if k8s_enable_gpudirect_rdma %}
1130
+ - name: library-dir-host
1131
+ mountPath: /usr/local/nvidia
1132
+ - name: gib
1133
+ mountPath: /usr/local/gib
1134
+ {% endif %}
664
1135
  {% if high_availability %}
665
1136
  - name: {{k8s_high_availability_deployment_volume_mount_name}}
666
1137
  mountPath: {{k8s_high_availability_deployment_volume_mount_path}}
@@ -669,6 +1140,10 @@ available_node_types:
669
1140
  - name: fusermount-shared-dir
670
1141
  mountPath: {{k8s_fusermount_shared_dir}}
671
1142
  {% endif %}
1143
+ {% for volume_mount in volume_mounts %}
1144
+ - name: {{volume_mount.name}}
1145
+ mountPath: {{volume_mount.path}}
1146
+ {% endfor %}
672
1147
  resources:
673
1148
  requests:
674
1149
  cpu: {{cpus}}
@@ -681,13 +1156,87 @@ available_node_types:
681
1156
  # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work
682
1157
  {{k8s_resource_key}}: {{accelerator_count}}
683
1158
  {% endif %}
1159
+ {% if k8s_network_type == 'coreweave' %}
1160
+ rdma/ib: 1
1161
+ {% endif %}
684
1162
  {% if k8s_resource_key is not none %}
685
1163
  limits:
686
1164
  # Limits need to be defined for GPU/TPU requests
687
1165
  {% if k8s_resource_key is not none %}
688
1166
  {{k8s_resource_key}}: {{accelerator_count}}
689
1167
  {% endif %}
1168
+ {% if k8s_network_type == 'coreweave' %}
1169
+ rdma/ib: 1
1170
+ {% endif %}
690
1171
  {% endif %}
1172
+ {% if k8s_ipc_lock_capability %}
1173
+ securityContext:
1174
+ capabilities:
1175
+ add:
1176
+ - IPC_LOCK
1177
+ {% endif %}
1178
+ {% if k8s_enable_gpudirect_tcpx %}
1179
+ # GPUDirect TCPX daemon sidecar container
1180
+ - name: tcpx-daemon
1181
+ image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.11
1182
+ imagePullPolicy: Always
1183
+ command:
1184
+ - /tcpgpudmarxd/build/app/tcpgpudmarxd
1185
+ - --gpu_nic_preset
1186
+ - a3vm
1187
+ - --gpu_shmem_type
1188
+ - fd
1189
+ - --uds_path
1190
+ - /run/tcpx
1191
+ - --setup_param
1192
+ - --verbose
1193
+ - "128"
1194
+ - "2"
1195
+ - "0"
1196
+ securityContext:
1197
+ capabilities:
1198
+ add:
1199
+ - NET_ADMIN
1200
+ volumeMounts:
1201
+ - name: libraries
1202
+ mountPath: /usr/local/nvidia/lib64
1203
+ readOnly: true
1204
+ - name: tcpx-socket
1205
+ mountPath: /run/tcpx
1206
+ - name: sys
1207
+ mountPath: /hostsysfs
1208
+ - name: proc-sys
1209
+ mountPath: /hostprocsysfs
1210
+ env:
1211
+ - name: LD_LIBRARY_PATH
1212
+ value: /usr/local/nvidia/lib64
1213
+ {% endif %}
1214
+ {% if k8s_enable_gpudirect_tcpxo %}
1215
+ - name: tcpxo-daemon
1216
+ image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.17
1217
+ imagePullPolicy: Always
1218
+ command: ["/bin/sh", "-c"]
1219
+ args:
1220
+ - |
1221
+ set -ex
1222
+ chmod 755 /fts/entrypoint_rxdm_container.sh
1223
+ /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr
1224
+ securityContext:
1225
+ capabilities:
1226
+ add:
1227
+ - NET_ADMIN
1228
+ - NET_BIND_SERVICE
1229
+ volumeMounts:
1230
+ - name: libraries
1231
+ mountPath: /usr/local/nvidia
1232
+ - name: sys
1233
+ mountPath: /hostsysfs
1234
+ - name: proc-sys
1235
+ mountPath: /hostprocsysfs
1236
+ env:
1237
+ - name: LD_LIBRARY_PATH
1238
+ value: /usr/local/nvidia/lib64
1239
+ {% endif %}
691
1240
 
692
1241
  {% if high_availability %}
693
1242
  pvc_spec:
@@ -724,7 +1273,7 @@ available_node_types:
724
1273
  spec:
725
1274
  securityContext:
726
1275
  fsGroup: 1000
727
- # To prevent the home dir provided by the docker image from being overriden by pvc mounting,
1276
+ # To prevent the home dir provided by the docker image from being overridden by pvc mounting,
728
1277
  # we use initContainers to copy it first to /mnt/home, which will later be mounted to home dir.
729
1278
  initContainers:
730
1279
  - name: init-copy-home
@@ -791,13 +1340,20 @@ setup_commands:
791
1340
  {%- endfor %}
792
1341
  STEPS=("apt-ssh-setup" "runtime-setup" "env-setup")
793
1342
  start_epoch=$(date +%s);
794
- echo "=== Logs for asynchronous ray and skypilot installation ===";
795
- if [ -f /tmp/skypilot_is_nimbus ]; then
796
- echo "=== Logs for asynchronous ray and skypilot installation ===";
797
- [ -f /tmp/ray_skypilot_installation_complete ] && cat /tmp/${STEPS[1]}.log ||
798
- { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/ray_skypilot_installation_complete ]; do sleep 0.5; done; kill $TAIL_PID || true; };
799
- [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
1343
+
1344
+ # Wait for SSH setup to complete before proceeding
1345
+ if [ -f /tmp/apt_ssh_setup_started ]; then
1346
+ echo "=== Logs for asynchronous SSH setup ===";
1347
+ ([ -f /tmp/apt_ssh_setup_complete ]|| [ -f /tmp/${STEPS[0]}.failed ]) && cat /tmp/${STEPS[0]}.log ||
1348
+ { tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/apt_ssh_setup_complete ] || [ -f /tmp/${STEPS[0]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1349
+ [ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
800
1350
  fi
1351
+
1352
+ echo "=== Logs for asynchronous ray and skypilot installation ===";
1353
+ ([ -f /tmp/ray_skypilot_installation_complete ]|| [ -f /tmp/${STEPS[1]}.failed ]) && cat /tmp/${STEPS[1]}.log ||
1354
+ { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/ray_skypilot_installation_complete ] || [ -f /tmp/${STEPS[1]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1355
+ [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
1356
+
801
1357
  end_epoch=$(date +%s);
802
1358
  echo "=== Ray and skypilot dependencies installation completed in $(($end_epoch - $start_epoch)) secs ===";
803
1359
  start_epoch=$(date +%s);