skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -33,14 +33,11 @@ provider:
33
33
  networking_mode: {{k8s_networking_mode}}
34
34
 
35
35
  # We use internal IPs since we set up a port-forward between the kubernetes
36
- # cluster and the local machine, or directly use NodePort to reach the
37
- # head node.
36
+ # cluster and the local machine.
38
37
  use_internal_ips: true
39
38
 
40
39
  timeout: {{timeout}}
41
40
 
42
- ssh_jump_image: {{k8s_ssh_jump_image}}
43
-
44
41
  # Namespace used to host SkyPilot system components, such as fuse device
45
42
  # manager.
46
43
  skypilot_system_namespace: {{k8s_skypilot_system_namespace}}
@@ -49,6 +46,10 @@ provider:
49
46
  # Used to set up the necessary permissions and sidecars.
50
47
  fuse_device_required: {{k8s_fuse_device_required}}
51
48
 
49
+ {% if ephemeral_volume_mounts %}
50
+ ephemeral_volume_specs: {{ephemeral_volume_mounts | tojson}}
51
+ {% endif %}
52
+
52
53
  # ServiceAccount created by the autoscaler for the head node pod that it
53
54
  # runs in. If this field isn't provided, the head pod config below must
54
55
  # contain a user-created service account with the proper permissions.
@@ -212,7 +213,9 @@ provider:
212
213
  metadata:
213
214
  labels:
214
215
  parent: skypilot
216
+ # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
215
217
  skypilot-cluster: {{cluster_name_on_cloud}}
218
+ skypilot-cluster-name: {{cluster_name_on_cloud}}
216
219
  skypilot-user: {{ user }}
217
220
  name: {{cluster_name_on_cloud}}-head-ssh
218
221
  spec:
@@ -230,7 +233,9 @@ provider:
230
233
  metadata:
231
234
  labels:
232
235
  parent: skypilot
236
+ # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
233
237
  skypilot-cluster: {{cluster_name_on_cloud}}
238
+ skypilot-cluster-name: {{cluster_name_on_cloud}}
234
239
  skypilot-user: {{ user }}
235
240
  # NOTE: If you're running multiple Ray clusters with services
236
241
  # on one Kubernetes cluster, they must have unique service
@@ -243,6 +248,24 @@ provider:
243
248
  # This selector must match the head node pod's selector below.
244
249
  selector:
245
250
  component: {{cluster_name_on_cloud}}-head
251
+ # Headless service mapping hostnames to rest of the worker nodes
252
+ {% for worker_id in range(1, num_nodes) %}
253
+ - apiVersion: v1
254
+ kind: Service
255
+ metadata:
256
+ labels:
257
+ parent: skypilot
258
+ # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
259
+ skypilot-cluster: {{cluster_name_on_cloud}}
260
+ skypilot-cluster-name: {{cluster_name_on_cloud}}
261
+ skypilot-user: {{ user }}
262
+ name: {{cluster_name_on_cloud}}-worker{{ worker_id }}
263
+ spec:
264
+ selector:
265
+ component: {{cluster_name_on_cloud}}-worker{{ worker_id }}
266
+ clusterIP: None
267
+ {% endfor %}
268
+
246
269
 
247
270
  # Specify the pod type for the ray head node (as configured below).
248
271
  head_node_type: ray_head_default
@@ -255,13 +278,12 @@ available_node_types:
255
278
  metadata:
256
279
  # name will be filled in the provisioner
257
280
  # head node name will be {{cluster_name_on_cloud}}-head, which will match the head node service selector above if a head node
258
- # service is required.
281
+ # service is required. Worker nodes are named {{cluster_name_on_cloud}}-worker{{ node_id }}
259
282
  labels:
260
283
  parent: skypilot
261
284
  # component will be set for the head node pod to be the same as the head node service selector above if a
285
+ # TODO (kyuds): remove this label for v0.12.0, as skypilot-cluster label is deprecated in favor of skypilot-cluster-name.
262
286
  skypilot-cluster: {{cluster_name_on_cloud}}
263
- # Identifies the SSH jump pod used by this pod. Used in life cycle management of the ssh jump pod.
264
- skypilot-ssh-jump: {{k8s_ssh_jump_name}}
265
287
  skypilot-user: {{ user }}
266
288
  # Custom tags for the pods
267
289
  {%- for label_key, label_value in labels.items() %}
@@ -273,14 +295,100 @@ available_node_types:
273
295
  {% if (k8s_acc_label_key is not none and k8s_acc_label_values is not none) %}
274
296
  skypilot-binpack: "gpu"
275
297
  {% endif %}
298
+ {% if k8s_kueue_local_queue_name %}
299
+ kueue.x-k8s.io/queue-name: {{k8s_kueue_local_queue_name}}
300
+ kueue.x-k8s.io/pod-group-name: {{cluster_name_on_cloud}}
301
+ {% endif %}
302
+ {% if k8s_kueue_local_queue_name or k8s_enable_gpudirect_tcpx or k8s_enable_gpudirect_tcpxo or k8s_enable_gpudirect_rdma %}
303
+ annotations:
304
+ {% if k8s_kueue_local_queue_name %}
305
+ kueue.x-k8s.io/retriable-in-group: "false"
306
+ kueue.x-k8s.io/pod-group-total-count: "{{ num_nodes|string }}"
307
+ {% if k8s_max_run_duration_seconds %}
308
+ provreq.kueue.x-k8s.io/maxRunDurationSeconds: "{{k8s_max_run_duration_seconds|string}}"
309
+ {% endif %}
310
+ {% endif %}
311
+ # https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx
312
+ # Values from google cloud guide
313
+ {% if k8s_enable_gpudirect_tcpx %}
314
+ devices.gke.io/container.tcpx-daemon: |+
315
+ - path: /dev/nvidia0
316
+ - path: /dev/nvidia1
317
+ - path: /dev/nvidia2
318
+ - path: /dev/nvidia3
319
+ - path: /dev/nvidia4
320
+ - path: /dev/nvidia5
321
+ - path: /dev/nvidia6
322
+ - path: /dev/nvidia7
323
+ - path: /dev/nvidiactl
324
+ - path: /dev/nvidia-uvm
325
+ networking.gke.io/default-interface: 'eth0'
326
+ networking.gke.io/interfaces: |
327
+ [
328
+ {"interfaceName":"eth0","network":"default"},
329
+ {"interfaceName":"eth1","network":"vpc1"},
330
+ {"interfaceName":"eth2","network":"vpc2"},
331
+ {"interfaceName":"eth3","network":"vpc3"},
332
+ {"interfaceName":"eth4","network":"vpc4"}
333
+ ]
334
+ {% endif %}
335
+ {% if k8s_enable_gpudirect_tcpxo %}
336
+ devices.gke.io/container.tcpxo-daemon: |+
337
+ - path: /dev/nvidia0
338
+ - path: /dev/nvidia1
339
+ - path: /dev/nvidia2
340
+ - path: /dev/nvidia3
341
+ - path: /dev/nvidia4
342
+ - path: /dev/nvidia5
343
+ - path: /dev/nvidia6
344
+ - path: /dev/nvidia7
345
+ - path: /dev/nvidiactl
346
+ - path: /dev/nvidia-uvm
347
+ - path: /dev/dmabuf_import_helper
348
+ networking.gke.io/default-interface: 'eth0'
349
+ networking.gke.io/interfaces: |
350
+ [
351
+ {"interfaceName":"eth0","network":"default"},
352
+ {"interfaceName":"eth1","network":"vpc1"},
353
+ {"interfaceName":"eth2","network":"vpc2"},
354
+ {"interfaceName":"eth3","network":"vpc3"},
355
+ {"interfaceName":"eth4","network":"vpc4"},
356
+ {"interfaceName":"eth5","network":"vpc5"},
357
+ {"interfaceName":"eth6","network":"vpc6"},
358
+ {"interfaceName":"eth7","network":"vpc7"},
359
+ {"interfaceName":"eth8","network":"vpc8"}
360
+ ]
361
+ {% endif %}
362
+ {% if k8s_enable_gpudirect_rdma %}
363
+ networking.gke.io/default-interface: 'eth0'
364
+ networking.gke.io/interfaces: |
365
+ [
366
+ {"interfaceName":"eth0","network":"default"},
367
+ {"interfaceName":"eth1","network":"gvnic-1"},
368
+ {"interfaceName":"eth2","network":"rdma-0"},
369
+ {"interfaceName":"eth3","network":"rdma-1"},
370
+ {"interfaceName":"eth4","network":"rdma-2"},
371
+ {"interfaceName":"eth5","network":"rdma-3"},
372
+ {"interfaceName":"eth6","network":"rdma-4"},
373
+ {"interfaceName":"eth7","network":"rdma-5"},
374
+ {"interfaceName":"eth8","network":"rdma-6"},
375
+ {"interfaceName":"eth9","network":"rdma-7"}
376
+ ]
377
+ {% endif %}
378
+ {% endif %}
276
379
  spec:
277
380
  # serviceAccountName: skypilot-service-account
278
381
  serviceAccountName: {{k8s_service_account_name}}
279
382
  automountServiceAccountToken: {{k8s_automount_sa_token}}
280
383
  restartPolicy: {{ "Always" if high_availability else "Never" }}
384
+ {% if volume_mounts %}
385
+ securityContext:
386
+ fsGroup: 1000
387
+ fsGroupChangePolicy: OnRootMismatch
388
+ {% endif %}
281
389
 
282
390
  # Add node selector if GPU/TPUs are requested:
283
- {% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) %}
391
+ {% if (k8s_topology_label_key is not none and k8s_topology_label_value is not none) or (k8s_spot_label_key is not none) or (k8s_enable_flex_start) %}
284
392
  nodeSelector:
285
393
  {% if k8s_topology_label_key is not none and k8s_topology_label_value is not none %}
286
394
  {{k8s_topology_label_key}}: {{k8s_topology_label_value}}
@@ -288,6 +396,9 @@ available_node_types:
288
396
  {% if k8s_spot_label_key is not none %}
289
397
  {{k8s_spot_label_key}}: {{k8s_spot_label_value|tojson}}
290
398
  {% endif %}
399
+ {% if k8s_enable_flex_start %}
400
+ cloud.google.com/gke-flex-start: "true"
401
+ {% endif %}
291
402
  {% endif %}
292
403
  {% if (k8s_acc_label_key is not none and k8s_acc_label_values is not none) or (avoid_label_keys is not none) %}
293
404
  affinity:
@@ -339,9 +450,6 @@ available_node_types:
339
450
  # object store. If you do not provide this, Ray will fall back to
340
451
  # /tmp which cause slowdowns if is not a shared memory volume.
341
452
  volumes:
342
- - name: secret-volume
343
- secret:
344
- secretName: {{k8s_ssh_key_secret_name}}
345
453
  - name: dshm
346
454
  emptyDir:
347
455
  medium: Memory
@@ -356,19 +464,176 @@ available_node_types:
356
464
  persistentVolumeClaim:
357
465
  claimName: {{cluster_name_on_cloud}}-{{k8s_high_availability_deployment_volume_mount_name}}
358
466
  {% endif %}
467
+ {% for volume_mount in volume_mounts %}
468
+ - name: {{volume_mount.name}}
469
+ persistentVolumeClaim:
470
+ claimName: {{volume_mount.volume_name_on_cloud}}
471
+ {% endfor %}
472
+ {% if k8s_enable_gpudirect_tcpx %}
473
+ - name: libraries
474
+ hostPath:
475
+ path: /home/kubernetes/bin/nvidia/lib64
476
+ - name: tcpx-socket
477
+ emptyDir: {}
478
+ - name: sys
479
+ hostPath:
480
+ path: /sys
481
+ - name: proc-sys
482
+ hostPath:
483
+ path: /proc/sys
484
+ {% endif %}
485
+ {% if k8s_enable_gpudirect_tcpxo %}
486
+ - name: libraries
487
+ hostPath:
488
+ path: /home/kubernetes/bin/nvidia
489
+ - name: sys
490
+ hostPath:
491
+ path: /sys
492
+ - name: proc-sys
493
+ hostPath:
494
+ path: /proc/sys
495
+ - name: aperture-devices
496
+ hostPath:
497
+ path: /dev/aperture_devices
498
+ {% endif %}
499
+ {% if k8s_enable_gpudirect_rdma %}
500
+ - name: library-dir-host
501
+ hostPath:
502
+ path: /home/kubernetes/bin/nvidia
503
+ - name: gib
504
+ hostPath:
505
+ path: /home/kubernetes/bin/gib
506
+ {% endif %}
359
507
  containers:
360
508
  - name: ray-node
361
- imagePullPolicy: IfNotPresent
509
+ imagePullPolicy: Always
362
510
  image: {{image_id}}
363
511
  env:
364
512
  - name: SKYPILOT_POD_NODE_TYPE
365
513
  valueFrom:
366
514
  fieldRef:
367
515
  fieldPath: metadata.labels['ray-node-type']
516
+ - name: SKYPILOT_POD_CPU_CORE_LIMIT
517
+ valueFrom:
518
+ resourceFieldRef:
519
+ containerName: ray-node
520
+ resource: requests.cpu
521
+ - name: SKYPILOT_POD_MEMORY_BYTES_LIMIT
522
+ valueFrom:
523
+ resourceFieldRef:
524
+ containerName: ray-node
525
+ resource: requests.memory
368
526
  {% for key, value in k8s_env_vars.items() if k8s_env_vars is not none %}
369
527
  - name: {{ key }}
370
528
  value: {{ value }}
371
529
  {% endfor %}
530
+ # https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#environment-variables-nccl
531
+ # Page recommends setting NCCL values for GPUDirect TCPX for best performance.
532
+ {% if k8s_enable_gpudirect_tcpx %}
533
+ - name: LD_LIBRARY_PATH
534
+ value: /usr/local/nvidia/lib64:/usr/local/tcpx/lib64
535
+ - name: NCCL_GPUDIRECTTCPX_SOCKET_IFNAME
536
+ value: eth1,eth2,eth3,eth4
537
+ - name: NCCL_GPUDIRECTTCPX_CTRL_DEV
538
+ value: eth0
539
+ - name: NCCL_GPUDIRECTTCPX_TX_BINDINGS
540
+ value: "eth1:8-21,112-125;eth2:8-21,112-125;eth3:60-73,164-177;eth4:60-73,164-177"
541
+ - name: NCCL_GPUDIRECTTCPX_RX_BINDINGS
542
+ value: "eth1:22-35,126-139;eth2:22-35,126-139;eth3:74-87,178-191;eth4:74-87,178-191"
543
+ - name: NCCL_GPUDIRECTTCPX_PROGRAM_FLOW_STEERING_WAIT_MICROS
544
+ value: "500000"
545
+ - name: NCCL_GPUDIRECTTCPX_UNIX_CLIENT_PREFIX
546
+ value: "/tmp"
547
+ - name: NCCL_GPUDIRECTTCPX_FORCE_ACK
548
+ value: "0"
549
+ - name: NCCL_SOCKET_IFNAME
550
+ value: eth0
551
+ - name: NCCL_CROSS_NIC
552
+ value: "0"
553
+ - name: NCCL_ALGO
554
+ value: Ring
555
+ - name: NCCL_PROTO
556
+ value: Simple
557
+ - name: NCCL_NSOCKS_PERTHREAD
558
+ value: "4"
559
+ - name: NCCL_SOCKET_NTHREADS
560
+ value: "1"
561
+ - name: NCCL_NET_GDR_LEVEL
562
+ value: PIX
563
+ - name: NCCL_DYNAMIC_CHUNK_SIZE
564
+ value: "524288"
565
+ - name: NCCL_P2P_PXN_LEVEL
566
+ value: "0"
567
+ - name: NCCL_P2P_NET_CHUNKSIZE
568
+ value: "524288"
569
+ - name: NCCL_P2P_PCI_CHUNKSIZE
570
+ value: "524288"
571
+ - name: NCCL_P2P_NVL_CHUNKSIZE
572
+ value: "1048576"
573
+ - name: NCCL_BUFFSIZE
574
+ value: "4194304"
575
+ - name: NCCL_MAX_NCHANNELS
576
+ value: "8"
577
+ - name: NCCL_MIN_NCHANNELS
578
+ value: "8"
579
+ - name: CUDA_VISIBLE_DEVICES
580
+ value: "0,1,2,3,4,5,6,7"
581
+ {% endif %}
582
+ {% if k8s_enable_gpudirect_tcpxo %}
583
+ - name: LD_LIBRARY_PATH
584
+ value: /usr/local/nvidia/lib64
585
+ - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY
586
+ value: /dev/aperture_devices
587
+ - name: NCCL_FASTRAK_CTRL_DEV
588
+ value: eth0
589
+ - name: NCCL_FASTRAK_IFNAME
590
+ value: eth1,eth2,eth3,eth4,eth5,eth6,eth7,eth8
591
+ - name: NCCL_SOCKET_IFNAME
592
+ value: eth0
593
+ - name: NCCL_CROSS_NIC
594
+ value: "0"
595
+ - name: NCCL_ALGO
596
+ value: Ring,Tree
597
+ - name: NCCL_PROTO
598
+ value: Simple,LL128
599
+ - name: NCCL_MIN_NCHANNELS
600
+ value: "4"
601
+ - name: NCCL_TUNER_PLUGIN
602
+ value: libnccl-tuner.so
603
+ - name: NCCL_TUNER_CONFIG_PATH
604
+ value: /usr/local/nvidia/lib64/a3plus_tuner_config.textproto
605
+ - name: CUDA_VISIBLE_DEVICES
606
+ value: "0,1,2,3,4,5,6,7"
607
+ {% endif %}
608
+ {% if k8s_enable_gpudirect_rdma %}
609
+ - name: LD_LIBRARY_PATH
610
+ value: /usr/local/nvidia/lib64
611
+ - name: NCCL_NET
612
+ value: gIB
613
+ - name: NCCL_CROSS_NIC
614
+ value: "0"
615
+ - name: NCCL_NET_GDR_LEVEL
616
+ value: PIX
617
+ - name: NCCL_P2P_NET_CHUNKSIZE
618
+ value: "131072"
619
+ - name: NCCL_NVLS_CHUNKSIZE
620
+ value: "524288"
621
+ - name: NCCL_IB_ADAPTIVE_ROUTING
622
+ value: "1"
623
+ - name: NCCL_IB_QPS_PER_CONNECTION
624
+ value: "4"
625
+ - name: NCCL_IB_TC
626
+ value: "52"
627
+ - name: NCCL_IB_FIFO_TC
628
+ value: "84"
629
+ {% if k8s_enable_gpudirect_rdma_a4 %}
630
+ - name: NCCL_TUNER_CONFIG_PATH
631
+ value: /usr/local/gib/configs/tuner_config_a4.txtpb
632
+ {% else %}
633
+ - name: NCCL_TUNER_CONFIG_PATH
634
+ value: /usr/local/gib/configs/tuner_config_a3u.txtpb
635
+ {% endif %}
636
+ {% endif %}
372
637
  {% if k8s_fuse_device_required %}
373
638
  - name: FUSERMOUNT_SHARED_DIR
374
639
  value: {{k8s_fusermount_shared_dir}}
@@ -378,12 +643,17 @@ available_node_types:
378
643
  command: ["/bin/bash", "-c", "--"]
379
644
  args:
380
645
  - |
381
- # For backwards compatibility, we put a marker file in the pod
382
- # to indicate that the pod is running with the changes introduced
383
- # in project nimbus: https://github.com/skypilot-org/skypilot/pull/4393
384
- # TODO: Remove this marker file and it's usage in setup_commands
385
- # after v0.10.0 release.
386
- touch /tmp/skypilot_is_nimbus
646
+ # Set -x to print the commands and their arguments as they are executed.
647
+ # Useful for debugging.
648
+ set -x
649
+
650
+ # Execute user-provided post-provision runcmd
651
+ # before any of the SkyPilot setup commands.
652
+ {%- if runcmd %}
653
+ {%- for cmd in runcmd %}
654
+ {{cmd}}
655
+ {%- endfor %}
656
+ {%- endif %}
387
657
 
388
658
  # Helper function to conditionally use sudo
389
659
  # TODO(zhwu): consolidate the two prefix_cmd and sudo replacements
@@ -395,14 +665,131 @@ available_node_types:
395
665
  # STEP 1: Run apt update, install missing packages, and set up ssh.
396
666
  (
397
667
  (
398
- DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update > /tmp/apt-update.log 2>&1 || \
399
- echo "Warning: apt-get update failed. Continuing anyway..." >> /tmp/apt-update.log
668
+ # Helper: run apt-get update with retries
669
+ apt_update_with_retries() {
670
+ # do not fail the whole shell; we handle return codes
671
+ set +e
672
+ local log=/tmp/apt-update.log
673
+ local tries=3
674
+ local delay=1
675
+ local i
676
+ for i in $(seq 1 $tries); do
677
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get update >> "$log" 2>&1 && { set -e; return 0; }
678
+ echo "apt-get update attempt $i/$tries failed; retrying in ${delay}s" >> "$log"
679
+ sleep $delay
680
+ delay=$((delay * 2))
681
+ done
682
+ set -e
683
+ return 1
684
+ }
685
+ apt_install_with_retries() {
686
+ local packages="$@"
687
+ [ -z "$packages" ] && return 0
688
+ set +e
689
+ local log=/tmp/apt-update.log
690
+ local tries=3
691
+ local delay=1
692
+ local i
693
+ for i in $(seq 1 $tries); do
694
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $packages && { set -e; return 0; }
695
+ echo "apt-get install failed for: $packages (attempt $i/$tries). Running -f install and retrying..." >> "$log"
696
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get -f install -y >> "$log" 2>&1 || true
697
+ DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get clean >> "$log" 2>&1 || true
698
+ sleep $delay
699
+ delay=$((delay * 2))
700
+ done
701
+ set -e
702
+ return 1
703
+ }
704
+ apt_update_install_with_retries() {
705
+ apt_update_with_retries
706
+ apt_install_with_retries "$@"
707
+ }
708
+ backup_dir=/etc/apt/sources.list.backup_skypilot
709
+ backup_source() {
710
+ $(prefix_cmd) mkdir -p "$backup_dir"
711
+ if [ -f /etc/apt/sources.list ] && [ ! -f "$backup_dir/sources.list" ]; then
712
+ $(prefix_cmd) cp -a /etc/apt/sources.list "$backup_dir/sources.list" || true
713
+ fi
714
+ }
715
+ restore_source() {
716
+ if [ -f "$backup_dir/sources.list" ]; then
717
+ $(prefix_cmd) cp -a "$backup_dir/sources.list" /etc/apt/sources.list || true
718
+ fi
719
+ }
720
+ update_apt_sources() {
721
+ local host=$1
722
+ local apt_file=$2
723
+ $(prefix_cmd) sed -i -E "s|https?://[a-zA-Z0-9.-]+\.ubuntu\.com/ubuntu|http://$host/ubuntu|g" $apt_file
724
+ }
725
+ # Helper: install packages across mirrors with retries
726
+ apt_install_with_mirrors() {
727
+ local required=$1; shift
728
+ local packages="$@"
729
+ [ -z "$packages" ] && return 0
730
+ set +e
731
+ # Install packages with default sources first
732
+ local log=/tmp/apt-update.log
733
+ echo "$(date +%Y-%m-%d\ %H:%M:%S) Installing packages: $packages" >> "$log"
734
+ restore_source
735
+ apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
736
+ echo "Install failed with default sources: $packages" >> "$log"
737
+ # Detect distro (ubuntu/debian)
738
+ local APT_OS="unknown"
739
+ if [ -f /etc/os-release ]; then
740
+ . /etc/os-release
741
+ case "$ID" in
742
+ debian) APT_OS="debian" ;;
743
+ ubuntu) APT_OS="ubuntu" ;;
744
+ *)
745
+ if [ -n "$ID_LIKE" ]; then
746
+ case " $ID $ID_LIKE " in
747
+ *ubuntu*) APT_OS="ubuntu" ;;
748
+ *debian*) APT_OS="debian" ;;
749
+ esac
750
+ fi
751
+ ;;
752
+ esac
753
+ fi
754
+ # Build mirror candidates
755
+ # deb.debian.org is a CDN endpoint, if one backend goes down,
756
+ # the CDN automatically fails over to another mirror,
757
+ # so we only retry for ubuntu here.
758
+ if [ "$APT_OS" = "ubuntu" ]; then
759
+ # Backup current sources once
760
+ backup_source
761
+ # Selected from https://launchpad.net/ubuntu/+archivemirrors
762
+ # and results from apt-select
763
+ local MIRROR_CANDIDATES="mirrors.wikimedia.org mirror.umd.edu"
764
+ for host in $MIRROR_CANDIDATES; do
765
+ echo "Trying APT mirror ($APT_OS): $host" >> "$log"
766
+ if [ -f /etc/apt/sources.list ]; then
767
+ update_apt_sources $host /etc/apt/sources.list
768
+ else
769
+ echo "Error: /etc/apt/sources.list not found" >> "$log"
770
+ break
771
+ fi
772
+ apt_update_install_with_retries $packages >> "$log" 2>&1 && { set -e; return 0; }
773
+ echo "Install failed with mirror ($APT_OS): $host" >> "$log"
774
+ # Restore to default sources
775
+ restore_source
776
+ done
777
+ fi
778
+ set -e
779
+ if [ "$required" = "1" ]; then
780
+ echo "Error: required package install failed across all mirrors: $packages" >> "$log"
781
+ return 1
782
+ else
783
+ echo "Optional package install failed across all mirrors: $packages; skipping." >> "$log"
784
+ return 0
785
+ fi
786
+ }
400
787
  # Install both fuse2 and fuse3 for compatibility for all possible fuse adapters in advance,
401
788
  # so that both fusemount and fusermount3 can be masked before enabling SSH access.
402
789
  PACKAGES="rsync curl wget netcat gcc patch pciutils fuse fuse3 openssh-server";
403
790
 
404
791
  # Separate packages into two groups: packages that are installed first
405
- # so that curl, rsync and wget are available sooner to unblock the following
792
+ # so that curl, rsync, ssh and wget are available sooner to unblock the following
406
793
  # conda installation and rsync.
407
794
  # Also, we install fuse first to avoid confliction with fuse3.
408
795
  set -e
@@ -423,7 +810,7 @@ available_node_types:
423
810
  done;
424
811
  if [ ! -z "$INSTALL_FIRST" ]; then
425
812
  echo "Installing core packages: $INSTALL_FIRST";
426
- DEBIAN_FRONTEND=noninteractive $(prefix_cmd) apt-get install -y -o Dpkg::Options::="--force-confdef" -o Dpkg::Options::="--force-confold" $INSTALL_FIRST;
813
+ apt_install_with_mirrors 1 $INSTALL_FIRST || { echo "Error: core package installation failed." >> /tmp/apt-update.log; exit 1; }
427
814
  fi;
428
815
  # SSH and other packages are not necessary, so we disable set -e
429
816
  set +e
@@ -447,7 +834,8 @@ available_node_types:
447
834
  fi
448
835
  $(prefix_cmd) cp -p "$FUSERMOUNT_PATH" "${FUSERMOUNT_PATH}-original"
449
836
  $(prefix_cmd) ln -sf {{k8s_fusermount_shared_dir}}/fusermount-shim "$FUSERMOUNT_PATH"
450
- FUSERMOUNT3_PATH=$(which fusermount3)
837
+ # "|| true" because fusermount3 is not always available
838
+ FUSERMOUNT3_PATH=$(which fusermount3) || true
451
839
  if [ -z "$FUSERMOUNT3_PATH" ]; then
452
840
  FUSERMOUNT3_PATH="${FUSERMOUNT_PATH}3"
453
841
  fi
@@ -489,16 +877,23 @@ available_node_types:
489
877
  $(prefix_cmd) mkdir -p ~/.ssh;
490
878
  $(prefix_cmd) chown -R $(whoami) ~/.ssh;
491
879
  $(prefix_cmd) chmod 700 ~/.ssh;
492
- $(prefix_cmd) cat /etc/secret-volume/ssh-publickey* > ~/.ssh/authorized_keys;
880
+ $(prefix_cmd) cat > ~/.ssh/authorized_keys <<'SKYPILOT_SSH_KEY_EOF'
881
+ skypilot:ssh_public_key_content
882
+ SKYPILOT_SSH_KEY_EOF
493
883
  $(prefix_cmd) chmod 644 ~/.ssh/authorized_keys;
494
884
  $(prefix_cmd) service ssh restart;
495
885
  $(prefix_cmd) sed -i "s/mesg n/tty -s \&\& mesg n/" ~/.profile;
496
886
 
497
- ) > /tmp/${STEPS[0]}.log 2>&1 || {
498
- echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed
887
+ touch /tmp/apt_ssh_setup_complete
888
+ echo "=== SSH setup completed ==="
889
+ ) > /tmp/${STEPS[0]}.log 2>&1
890
+ if [ "$?" -ne "0" ]; then
891
+ {
892
+ echo "Error: ${STEPS[0]} failed. Continuing anyway..." > /tmp/${STEPS[0]}.failed 2>&1
499
893
  cat /tmp/${STEPS[0]}.log
500
894
  exit 1
501
- }
895
+ }
896
+ fi
502
897
  ) &
503
898
 
504
899
  # STEP 2: Install conda, ray and skypilot (for dependencies); start
@@ -516,7 +911,21 @@ available_node_types:
516
911
  {{ conda_installation_commands }}
517
912
  {{ ray_installation_commands }}
518
913
 
519
- VIRTUAL_ENV=~/skypilot-runtime ~/.local/bin/uv pip install skypilot[kubernetes,remote]
914
+ # set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
915
+ # unset PYTHONPATH in case the user provided docker image set it.
916
+ VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip install skypilot[kubernetes,remote]
917
+ # Wait for `patch` package to be installed before applying ray patches
918
+ until dpkg -l | grep -q "^ii patch "; do
919
+ sleep 0.1
920
+ echo "Waiting for patch package to be installed..."
921
+ done
922
+ # Apply Ray patches for progress bar fix
923
+ # set UV_SYSTEM_PYTHON to false in case the user provided docker image set it to true.
924
+ # unset PYTHONPATH in case the user provided docker image set it.
925
+ # ~/.sky/python_path is seeded by conda_installation_commands
926
+ VIRTUAL_ENV=~/skypilot-runtime UV_SYSTEM_PYTHON=false env -u PYTHONPATH ~/.local/bin/uv pip list | grep "ray " | grep 2.9.3 2>&1 > /dev/null && {
927
+ env -u PYTHONPATH $(cat ~/.sky/python_path) -c "from sky.skylet.ray_patches import patch; patch()" || exit 1;
928
+ }
520
929
  touch /tmp/ray_skypilot_installation_complete
521
930
  echo "=== Ray and skypilot installation completed ==="
522
931
 
@@ -544,11 +953,14 @@ available_node_types:
544
953
  set +e
545
954
  {{ ray_worker_start_command }}
546
955
  fi
547
- ) > /tmp/${STEPS[1]}.log 2>&1 || {
548
- echo "Error: ${STEPS[1]} failed. Continuing anyway..." > /tmp/${STEPS[1]}.failed
956
+ ) > /tmp/${STEPS[1]}.log 2>&1
957
+ if [ "$?" -ne "0" ]; then
958
+ {
959
+ echo "Error: ${STEPS[1]} failed. Continuing anyway..." > /tmp/${STEPS[1]}.failed 2>&1
549
960
  cat /tmp/${STEPS[1]}.log
550
961
  exit 1
551
- }
962
+ }
963
+ fi
552
964
  ) &
553
965
 
554
966
 
@@ -566,11 +978,14 @@ available_node_types:
566
978
  fi;
567
979
  fi;
568
980
  export -p > ~/container_env_var.sh && $(prefix_cmd) mv ~/container_env_var.sh /etc/profile.d/container_env_var.sh
569
- ) > /tmp/${STEPS[2]}.log 2>&1 || {
570
- echo "Error: ${STEPS[2]} failed. Continuing anyway..." > /tmp/${STEPS[2]}.failed
981
+ ) > /tmp/${STEPS[2]}.log 2>&1
982
+ if [ "$?" -ne "0" ]; then
983
+ {
984
+ echo "Error: ${STEPS[2]} failed. Continuing anyway..." > /tmp/${STEPS[2]}.failed 2>&1
571
985
  cat /tmp/${STEPS[2]}.log
572
986
  exit 1
573
- }
987
+ }
988
+ fi
574
989
  ) &
575
990
 
576
991
  function mylsof { p=$(for pid in /proc/{0..9}*; do i=$(basename "$pid"); for file in "$pid"/fd/*; do link=$(readlink -e "$file"); if [ "$link" = "$1" ]; then echo "$i"; fi; done; done); echo "$p"; };
@@ -623,23 +1038,72 @@ available_node_types:
623
1038
  {% if high_availability %}
624
1039
  mkdir -p {{k8s_high_availability_deployment_run_script_dir}}
625
1040
  if [ -f {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready ]; then
1041
+ SKYPILOT_HA_RECOVERY_LOG="{{ha_recovery_log_path}}"
1042
+ echo "Starting HA recovery at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
1043
+ start_time=$SECONDS
1044
+ retry_count=0
1045
+
1046
+ # Wait for Ray to be ready, as the following commands is depending on Ray.
1047
+ GET_RAY_STATUS_CMD=$({{sky_python_cmd}} -c 'from sky.provision import instance_setup; print(instance_setup.RAY_STATUS_WITH_SKY_RAY_PORT_COMMAND)')
1048
+ while true; do
1049
+ retry_count=$((retry_count + 1))
1050
+ current_duration=$(( SECONDS - start_time ))
1051
+ echo "Attempt $retry_count to get Ray status after $current_duration seconds..." >> $SKYPILOT_HA_RECOVERY_LOG
1052
+
1053
+ bash --login -c "$GET_RAY_STATUS_CMD"
1054
+ if [ $? -eq 0 ]; then
1055
+ wait_duration=$(( SECONDS - start_time ))
1056
+ echo "Ray ready after waiting $wait_duration seconds (took $retry_count attempts)" >> $SKYPILOT_HA_RECOVERY_LOG
1057
+ break
1058
+ fi
1059
+ echo "Waiting for Ray to be ready..." >> $SKYPILOT_HA_RECOVERY_LOG
1060
+ sleep 2
1061
+ done
1062
+
626
1063
  # ! Keep this aligned with `CloudVmRayBackend._setup()`
627
- # Suppose all `task.setup` are the same for skyserve controller task.
1064
+ # Suppose all `task.setup` are the same for sky serve / managed jobs controller task.
628
1065
  # So be careful for compatibility issue once you change it.
629
1066
  chmod +x {{k8s_high_availability_deployment_setup_script_path}}
630
1067
  /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && {{k8s_high_availability_deployment_setup_script_path}} > /tmp/controller_recovery_setup_commands.log 2>&1"
631
- echo "=== Controller setup commands completed for recovery ==="
1068
+ echo "=== Controller setup commands completed for recovery at $(date) ===" >> $SKYPILOT_HA_RECOVERY_LOG
632
1069
 
1070
+ touch {{k8s_high_availability_restarting_signal_file}}
1071
+ # Get all in-progress jobs from managed jobs controller. We skip any jobs that are already done.
1072
+ # Also, skip the jobs that are waiting to be scheduled as those does not have a controller process running.
1073
+ # For SkyServe, this will be None and every service will be recovered. This is because SkyServe
1074
+ # will delete the service from the database after it is terminated so everything in the database is running.
1075
+ ALL_IN_PROGRESS_JOBS=$({{sky_python_cmd}} -c "from sky.jobs import state; jobs, _ = state.get_managed_jobs_with_filters(fields=['job_id', 'schedule_state']); print(' '.join({str(job['job_id']) for job in jobs if job['schedule_state'] not in [state.ManagedJobScheduleState.DONE, state.ManagedJobScheduleState.WAITING]}) if jobs else None)")
1076
+ if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
1077
+ read -ra ALL_IN_PROGRESS_JOBS_SEQ <<< "$ALL_IN_PROGRESS_JOBS"
1078
+ fi
633
1079
  for file in {{k8s_high_availability_deployment_run_script_dir}}/*; do
1080
+ # This is the cluster job id on managed jobs controller, but it is guaranteed to be the same as the managed job id,
1081
+ # so we directly use it here. See `CloudVmRayBackend._exec_code_on_head::_dump_code_to_file` for more details.
1082
+ JOB_ID=$(basename $file | sed 's/sky_job_//')
1083
+ # If the list of in-progress jobs is not None (meaning this is a managed job HA controller) and job is not in-progress, skip.
1084
+ if [ "$ALL_IN_PROGRESS_JOBS" != "None" ]; then
1085
+ if [[ ! " ${ALL_IN_PROGRESS_JOBS_SEQ[@]} " =~ " ${JOB_ID} " ]]; then
1086
+ continue
1087
+ fi
1088
+ fi
634
1089
  # ! Keep this aligned with `CloudVmRayBackend._execute()`
635
1090
  chmod +x $file
1091
+ # TODO(tian): This logic may run a lot of things if the jobs controller previously had many jobs.
1092
+ # We should do more tests and make sure it will scale well.
636
1093
  /bin/bash --login -c "true && export OMP_NUM_THREADS=1 PYTHONWARNINGS='ignore' && $file > /tmp/task_run_$(basename $file).log 2>&1"
637
- echo "=== Controller task run for service (file: $file) completed for recovery ==="
1094
+ echo "=== Controller task run for service / job (file: $file) completed for recovery at $(date) ===" >> $SKYPILOT_HA_RECOVERY_LOG
638
1095
  done
1096
+ rm {{k8s_high_availability_restarting_signal_file}}
1097
+
1098
+ duration=$(( SECONDS - start_time ))
1099
+ echo "HA recovery completed at $(date)" >> $SKYPILOT_HA_RECOVERY_LOG
1100
+ echo "Total recovery time: $duration seconds" >> $SKYPILOT_HA_RECOVERY_LOG
639
1101
  fi
640
1102
 
641
1103
  touch {{k8s_high_availability_deployment_volume_mount_path}}/k8s_container_ready
642
1104
  {% endif %}
1105
+ # Set +x to stop printing the commands and their arguments as they are executed.
1106
+ set +x
643
1107
 
644
1108
  trap : TERM INT; log_tail || sleep infinity & wait
645
1109
 
@@ -653,14 +1117,27 @@ available_node_types:
653
1117
  # object store. If you do not provide this, Ray will fall back to
654
1118
  # /tmp which cause slowdowns if is not a shared memory volume.
655
1119
  volumeMounts:
656
- - name: secret-volume
657
- readOnly: true
658
- mountPath: "/etc/secret-volume"
659
- # This volume allocates shared memory for Ray to use for its plasma
660
- # object store. If you do not provide this, Ray will fall back to
661
- # /tmp which cause slowdowns if is not a shared memory volume.
662
1120
  - mountPath: /dev/shm
663
1121
  name: dshm
1122
+ {% if k8s_enable_gpudirect_tcpx %}
1123
+ - name: tcpx-socket
1124
+ mountPath: /tmp
1125
+ - name: libraries
1126
+ mountPath: /usr/local/nvidia/lib64
1127
+ readOnly: true
1128
+ {% endif %}
1129
+ {% if k8s_enable_gpudirect_tcpxo %}
1130
+ - name: libraries
1131
+ mountPath: /usr/local/nvidia
1132
+ - name: aperture-devices
1133
+ mountPath: /dev/aperture_devices
1134
+ {% endif %}
1135
+ {% if k8s_enable_gpudirect_rdma %}
1136
+ - name: library-dir-host
1137
+ mountPath: /usr/local/nvidia
1138
+ - name: gib
1139
+ mountPath: /usr/local/gib
1140
+ {% endif %}
664
1141
  {% if high_availability %}
665
1142
  - name: {{k8s_high_availability_deployment_volume_mount_name}}
666
1143
  mountPath: {{k8s_high_availability_deployment_volume_mount_path}}
@@ -669,6 +1146,10 @@ available_node_types:
669
1146
  - name: fusermount-shared-dir
670
1147
  mountPath: {{k8s_fusermount_shared_dir}}
671
1148
  {% endif %}
1149
+ {% for volume_mount in volume_mounts %}
1150
+ - name: {{volume_mount.name}}
1151
+ mountPath: {{volume_mount.path}}
1152
+ {% endfor %}
672
1153
  resources:
673
1154
  requests:
674
1155
  cpu: {{cpus}}
@@ -681,13 +1162,87 @@ available_node_types:
681
1162
  # https://cloud.google.com/kubernetes-engine/docs/concepts/tpus#how_tpus_work
682
1163
  {{k8s_resource_key}}: {{accelerator_count}}
683
1164
  {% endif %}
1165
+ {% if k8s_network_type == 'coreweave' %}
1166
+ rdma/ib: 1
1167
+ {% endif %}
684
1168
  {% if k8s_resource_key is not none %}
685
1169
  limits:
686
1170
  # Limits need to be defined for GPU/TPU requests
687
1171
  {% if k8s_resource_key is not none %}
688
1172
  {{k8s_resource_key}}: {{accelerator_count}}
689
1173
  {% endif %}
1174
+ {% if k8s_network_type == 'coreweave' %}
1175
+ rdma/ib: 1
1176
+ {% endif %}
690
1177
  {% endif %}
1178
+ {% if k8s_ipc_lock_capability %}
1179
+ securityContext:
1180
+ capabilities:
1181
+ add:
1182
+ - IPC_LOCK
1183
+ {% endif %}
1184
+ {% if k8s_enable_gpudirect_tcpx %}
1185
+ # GPUDirect TCPX daemon sidecar container
1186
+ - name: tcpx-daemon
1187
+ image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.11
1188
+ imagePullPolicy: Always
1189
+ command:
1190
+ - /tcpgpudmarxd/build/app/tcpgpudmarxd
1191
+ - --gpu_nic_preset
1192
+ - a3vm
1193
+ - --gpu_shmem_type
1194
+ - fd
1195
+ - --uds_path
1196
+ - /run/tcpx
1197
+ - --setup_param
1198
+ - --verbose
1199
+ - "128"
1200
+ - "2"
1201
+ - "0"
1202
+ securityContext:
1203
+ capabilities:
1204
+ add:
1205
+ - NET_ADMIN
1206
+ volumeMounts:
1207
+ - name: libraries
1208
+ mountPath: /usr/local/nvidia/lib64
1209
+ readOnly: true
1210
+ - name: tcpx-socket
1211
+ mountPath: /run/tcpx
1212
+ - name: sys
1213
+ mountPath: /hostsysfs
1214
+ - name: proc-sys
1215
+ mountPath: /hostprocsysfs
1216
+ env:
1217
+ - name: LD_LIBRARY_PATH
1218
+ value: /usr/local/nvidia/lib64
1219
+ {% endif %}
1220
+ {% if k8s_enable_gpudirect_tcpxo %}
1221
+ - name: tcpxo-daemon
1222
+ image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/tcpgpudmarxd-dev:v1.0.17
1223
+ imagePullPolicy: Always
1224
+ command: ["/bin/sh", "-c"]
1225
+ args:
1226
+ - |
1227
+ set -ex
1228
+ chmod 755 /fts/entrypoint_rxdm_container.sh
1229
+ /fts/entrypoint_rxdm_container.sh --num_hops=2 --num_nics=8 --uid= --alsologtostderr
1230
+ securityContext:
1231
+ capabilities:
1232
+ add:
1233
+ - NET_ADMIN
1234
+ - NET_BIND_SERVICE
1235
+ volumeMounts:
1236
+ - name: libraries
1237
+ mountPath: /usr/local/nvidia
1238
+ - name: sys
1239
+ mountPath: /hostsysfs
1240
+ - name: proc-sys
1241
+ mountPath: /hostprocsysfs
1242
+ env:
1243
+ - name: LD_LIBRARY_PATH
1244
+ value: /usr/local/nvidia/lib64
1245
+ {% endif %}
691
1246
 
692
1247
  {% if high_availability %}
693
1248
  pvc_spec:
@@ -724,7 +1279,7 @@ available_node_types:
724
1279
  spec:
725
1280
  securityContext:
726
1281
  fsGroup: 1000
727
- # To prevent the home dir provided by the docker image from being overriden by pvc mounting,
1282
+ # To prevent the home dir provided by the docker image from being overridden by pvc mounting,
728
1283
  # we use initContainers to copy it first to /mnt/home, which will later be mounted to home dir.
729
1284
  initContainers:
730
1285
  - name: init-copy-home
@@ -791,17 +1346,23 @@ setup_commands:
791
1346
  {%- endfor %}
792
1347
  STEPS=("apt-ssh-setup" "runtime-setup" "env-setup")
793
1348
  start_epoch=$(date +%s);
1349
+
1350
+ # Wait for SSH setup to complete before proceeding
1351
+ echo "=== Logs for asynchronous SSH setup ===";
1352
+ ([ -f /tmp/apt_ssh_setup_complete ]|| [ -f /tmp/${STEPS[0]}.failed ]) && cat /tmp/${STEPS[0]}.log ||
1353
+ { tail -f -n +1 /tmp/${STEPS[0]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/apt_ssh_setup_complete ] || [ -f /tmp/${STEPS[0]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1354
+ [ -f /tmp/${STEPS[0]}.failed ] && { echo "Error: ${STEPS[0]} failed. Exiting."; exit 1; } || true;
1355
+
794
1356
  echo "=== Logs for asynchronous ray and skypilot installation ===";
795
- if [ -f /tmp/skypilot_is_nimbus ]; then
796
- echo "=== Logs for asynchronous ray and skypilot installation ===";
797
- [ -f /tmp/ray_skypilot_installation_complete ] && cat /tmp/${STEPS[1]}.log ||
798
- { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; until [ -f /tmp/ray_skypilot_installation_complete ]; do sleep 0.5; done; kill $TAIL_PID || true; };
799
- [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
800
- fi
1357
+ ([ -f /tmp/ray_skypilot_installation_complete ]|| [ -f /tmp/${STEPS[1]}.failed ]) && cat /tmp/${STEPS[1]}.log ||
1358
+ { tail -f -n +1 /tmp/${STEPS[1]}.log & TAIL_PID=$!; echo "Tail PID: $TAIL_PID"; sleep 0.5; until [ -f /tmp/ray_skypilot_installation_complete ] || [ -f /tmp/${STEPS[1]}.failed ]; do sleep 0.5; done; kill $TAIL_PID || true; };
1359
+ [ -f /tmp/${STEPS[1]}.failed ] && { echo "Error: ${STEPS[1]} failed. Exiting."; exit 1; } || true;
1360
+
801
1361
  end_epoch=$(date +%s);
802
1362
  echo "=== Ray and skypilot dependencies installation completed in $(($end_epoch - $start_epoch)) secs ===";
803
1363
  start_epoch=$(date +%s);
804
1364
  {{ skypilot_wheel_installation_commands }}
1365
+ {{ copy_skypilot_templates_commands }}
805
1366
  end_epoch=$(date +%s);
806
1367
  echo "=== Skypilot wheel installation completed in $(($end_epoch - $start_epoch)) secs ===";
807
1368
  start_epoch=$(date +%s);