skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
@@ -1,11 +1,14 @@
  """Kubernetes instance provisioning."""
  import copy
+ import datetime
  import json
+ import re
+ import sys
  import time
- from typing import Any, Callable, Dict, List, Optional, Union
- import uuid
+ from typing import Any, Dict, List, Optional, Tuple, Union

  from sky import exceptions
+ from sky import global_user_state
  from sky import sky_logging
  from sky import skypilot_config
  from sky.adaptors import kubernetes
@@ -13,31 +16,40 @@ from sky.provision import common
  from sky.provision import constants
  from sky.provision import docker_utils
  from sky.provision.kubernetes import config as config_lib
- from sky.provision.kubernetes import network_utils
+ from sky.provision.kubernetes import constants as k8s_constants
  from sky.provision.kubernetes import utils as kubernetes_utils
+ from sky.provision.kubernetes import volume
  from sky.utils import command_runner
  from sky.utils import common_utils
  from sky.utils import config_utils
  from sky.utils import kubernetes_enums
+ from sky.utils import rich_utils
  from sky.utils import status_lib
  from sky.utils import subprocess_utils
  from sky.utils import timeline
  from sky.utils import ux_utils
+ from sky.utils.db import db_utils

  POLL_INTERVAL = 2
  _TIMEOUT_FOR_POD_TERMINATION = 60 # 1 minutes
  _MAX_RETRIES = 3
+ _MAX_MISSING_PODS_RETRIES = 5
+ _MAX_QUERY_INSTANCES_RETRIES = 5
+ _QUERY_INSTANCES_RETRY_INTERVAL = .5
  _NUM_THREADS = subprocess_utils.get_parallel_threads('kubernetes')

+ COMMON_NON_PENDING_EVENT_REASONS = {
+ 'Scheduled', 'Created', 'Started', 'Failed', 'Pulled'
+ }
+
+ # Pattern to extract SSH user from command output, handling MOTD contamination
+ _SSH_USER_PATTERN = re.compile(r'SKYPILOT_SSH_USER: ([^\s\n]+)')
+
  logger = sky_logging.init_logger(__name__)
- TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
- TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
- TAG_POD_INITIALIZED = 'skypilot-initialized'
- TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name'


  def ray_tag_filter(cluster_name: str) -> Dict[str, str]:
- return {TAG_RAY_CLUSTER_NAME: cluster_name}
+ return {k8s_constants.TAG_RAY_CLUSTER_NAME: cluster_name}


  def _is_head(pod) -> bool:
@@ -67,12 +79,16 @@ def is_high_availability_cluster_by_kubectl(
  namespace: Optional[str] = None) -> bool:
  """Check if a cluster is a high availability controller by calling
  `kubectl get deployment`.
+
+ The deployment must have the label `skypilot-cluster-name` set to
+ `cluster_name`.
  """
  try:
  deployment_list = kubernetes.apps_api(
  context).list_namespaced_deployment(
  namespace,
- label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
+ label_selector=
+ f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
  except kubernetes.api_exception():
  return False
  # It is a high availability cluster if there is at least one deployment
@@ -186,14 +202,20 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
  break
  if event_message is not None:
  if pod_status == 'Pending':
- logger.info(event_message)
+ out_of = {}
+ # key: resource name, value: (extra message, nice name)
  if 'Insufficient cpu' in event_message:
- raise config_lib.KubernetesError(
- _lack_resource_msg('CPU', pod, details=event_message))
+ out_of['CPU'] = (': Run \'kubectl get nodes -o '
+ 'custom-columns=NAME:.metadata.name,'
+ 'CPU:.status.allocatable.cpu\' to check '
+ 'the available CPUs on the node.', 'CPUs')
  if 'Insufficient memory' in event_message:
- raise config_lib.KubernetesError(
- _lack_resource_msg('memory', pod,
- details=event_message))
+ out_of['memory'] = (': Run \'kubectl get nodes -o '
+ 'custom-columns=NAME:.metadata.name,'
+ 'MEMORY:.status.allocatable.memory\' '
+ 'to check the available memory on the '
+ 'node.', 'Memory')
+
  # TODO(aylei): after switching from smarter-device-manager to
  # fusermount-server, we need a new way to check whether the
  # fusermount-server daemonset is ready.
@@ -201,43 +223,79 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
  key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
  for key in lf.get_label_keys()
  ]
- if pod.spec.node_selector:
- for label_key in pod.spec.node_selector.keys():
- if label_key in gpu_lf_keys:
- # TODO(romilb): We may have additional node
- # affinity selectors in the future - in that
- # case we will need to update this logic.
- # TODO(Doyoung): Update the error message raised
- # with the multi-host TPU support.
- gpu_resource_key = kubernetes_utils.get_gpu_resource_key() # pylint: disable=line-too-long
- if 'Insufficient google.com/tpu' in event_message:
- extra_msg = (
- f'Verify if '
- f'{pod.spec.node_selector[label_key]}'
- ' is available in the cluster. Note '
- 'that multi-host TPU podslices are '
- 'currently not unsupported.')
- raise config_lib.KubernetesError(
- _lack_resource_msg('TPU',
- pod,
- extra_msg,
- details=event_message))
- elif ((f'Insufficient {gpu_resource_key}'
- in event_message) or
- ('didn\'t match Pod\'s node affinity/selector'
- in event_message)):
- extra_msg = (
- f'Verify if any node matching label '
- f'{pod.spec.node_selector[label_key]} and '
- f'sufficient resource {gpu_resource_key} '
- f'is available in the cluster.')
- raise config_lib.KubernetesError(
- _lack_resource_msg('GPU',
- pod,
- extra_msg,
- details=event_message))
+ for label_key in gpu_lf_keys:
+ # TODO(romilb): We may have additional node
+ # affinity selectors in the future - in that
+ # case we will need to update this logic.
+ # TODO(Doyoung): Update the error message raised
+ # with the multi-host TPU support.
+ gpu_resource_key = kubernetes_utils.get_gpu_resource_key(
+ context) # pylint: disable=line-too-long
+ if ((f'Insufficient {gpu_resource_key}' in event_message) or
+ ('didn\'t match Pod\'s node affinity/selector'
+ in event_message) and pod.spec.node_selector):
+ if 'gpu' in gpu_resource_key.lower():
+ info_msg = (
+ ': Run \'sky show-gpus --infra kubernetes\' to '
+ 'see the available GPUs.')
+ else:
+ info_msg = ': '
+ if (pod.spec.node_selector and
+ label_key in pod.spec.node_selector):
+ extra_msg = (
+ f'Verify if any node matching label '
+ f'{pod.spec.node_selector[label_key]} and '
+ f'sufficient resource {gpu_resource_key} '
+ f'is available in the cluster.')
+ extra_msg = info_msg + ' ' + extra_msg
+ else:
+ extra_msg = info_msg
+ if gpu_resource_key not in out_of or len(
+ out_of[gpu_resource_key][0]) < len(extra_msg):
+ out_of[f'{gpu_resource_key}'] = (extra_msg, 'GPUs')
+
+ if len(out_of) > 0:
+ # We are out of some resources. We should raise an error.
+ rsrc_err_msg = 'Insufficient resource capacity on the '
+ rsrc_err_msg += 'cluster:\n'
+ out_of_keys = list(out_of.keys())
+ for i in range(len(out_of_keys)):
+ rsrc = out_of_keys[i]
+ (extra_msg, nice_name) = out_of[rsrc]
+ extra_msg = extra_msg if extra_msg else ''
+ if i == len(out_of_keys) - 1:
+ indent = '└──'
+ else:
+ indent = '├──'
+ rsrc_err_msg += (f'{indent} Cluster does not have '
+ f'sufficient {nice_name} for your request'
+ f'{extra_msg}')
+ if i != len(out_of_keys) - 1:
+ rsrc_err_msg += '\n'
+
+ # Emit the error message without logging prefixes for better UX.
+ tmp_handler = sky_logging.EnvAwareHandler(sys.stdout)
+ tmp_handler.flush = sys.stdout.flush # type: ignore
+ tmp_handler.setFormatter(sky_logging.NO_PREFIX_FORMATTER)
+ tmp_handler.setLevel(sky_logging.ERROR)
+ prev_propagate = logger.propagate
+ try:
+ logger.addHandler(tmp_handler)
+ logger.propagate = False
+ logger.error(ux_utils.error_message(f'{rsrc_err_msg}'))
+ finally:
+ logger.removeHandler(tmp_handler)
+ logger.propagate = prev_propagate
+ nice_names = [out_of[rsrc][1] for rsrc in out_of_keys]
+ raise config_lib.KubernetesError(
+ f'{timeout_err_msg} '
+ f'Pod status: {pod_status} '
+ f'Details: \'{event_message}\' ',
+ insufficent_resources=nice_names,
+ )
+
  raise config_lib.KubernetesError(f'{timeout_err_msg} '
- f'Pod status: {pod_status}'
+ f'Pod status: {pod_status} '
  f'Details: \'{event_message}\' ')
  raise config_lib.KubernetesError(f'{timeout_err_msg}')

@@ -251,8 +309,89 @@ def _raise_command_running_error(message: str, command: str, pod_name: str,
  f'code {rc}: {command!r}\nOutput: {stdout}.')


+ def _detect_cluster_event_reason_occurred(namespace, context, search_start,
+ reason) -> bool:
+
+ def _convert_to_utc(timestamp):
+ if timestamp.tzinfo is None:
+ return timestamp.replace(tzinfo=datetime.timezone.utc)
+ return timestamp.astimezone(datetime.timezone.utc)
+
+ def _get_event_timestamp(event):
+ if event.last_timestamp:
+ return event.last_timestamp
+ elif event.metadata.creation_timestamp:
+ return event.metadata.creation_timestamp
+ return None
+
+ events = kubernetes.core_api(context).list_namespaced_event(
+ namespace=namespace, field_selector=f'reason={reason}')
+ for event in events.items:
+ ts = _get_event_timestamp(event)
+ if ts and _convert_to_utc(ts) > search_start:
+ return True
+ return False
+
+
+ def _cluster_had_autoscale_event(namespace, context, search_start) -> bool:
+ """Detects whether the cluster had a autoscaling event after a
+ specified datetime. This only works when using cluster-autoscaler.
+
+ Args:
+ namespace: kubernetes namespace
+ context: kubernetes context
+ search_start (datetime.datetime): filter for events that occurred
+ after search_start
+
+ Returns:
+ A boolean whether the cluster has an autoscaling event or not.
+ """
+ assert namespace is not None
+
+ try:
+ return _detect_cluster_event_reason_occurred(namespace, context,
+ search_start,
+ 'TriggeredScaleUp')
+ except Exception as e: # pylint: disable=broad-except
+ logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
+ return False
+
+
+ def _cluster_maybe_autoscaling(namespace, context, search_start) -> bool:
+ """Detects whether a kubernetes cluster may have an autoscaling event.
+
+ This is not a definitive detection. FailedScheduling, which is an
+ event that can occur when not enough resources are present in the cluster,
+ which is a trigger for cluster autoscaling. However, FailedScheduling may
+ have occurred due to other reasons (cluster itself is abnormal).
+
+ Hence, this should only be used for autoscalers that don't emit the
+ TriggeredScaleUp event, e.g.: Karpenter.
+
+ Args:
+ namespace: kubernetes namespace
+ context: kubernetes context
+ search_start (datetime.datetime): filter for events that occurred
+ after search_start
+
+ Returns:
+ A boolean whether the cluster has an autoscaling event or not.
+ """
+ assert namespace is not None
+
+ try:
+ return _detect_cluster_event_reason_occurred(namespace, context,
+ search_start,
+ 'FailedScheduling')
+ except Exception as e: # pylint: disable=broad-except
+ logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
+ return False
+
+
  @timeline.event
- def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
+ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,
+ cluster_name: str,
+ create_pods_start: datetime.datetime):
  """Wait for all pods to be scheduled.

  Wait for all pods including jump pod to be scheduled, and if it
@@ -261,6 +400,9 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
  allocated and we can exit.

  If timeout is set to a negative value, this method will wait indefinitely.
+
+ Will update the spinner message to indicate autoscaling if autoscaling
+ is happening.
  """
  # Create a set of pod names we're waiting for
  if not new_nodes:
@@ -268,6 +410,18 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
  expected_pod_names = {node.metadata.name for node in new_nodes}
  start_time = time.time()

+ # Variables for autoscaler detection
+ autoscaler_type = skypilot_config.get_effective_region_config(
+ cloud='kubernetes',
+ region=context,
+ keys=('autoscaler',),
+ default_value=None)
+ autoscaler_is_set = autoscaler_type is not None
+ use_heuristic_detection = (autoscaler_is_set and
+ not kubernetes_enums.KubernetesAutoscalerType(
+ autoscaler_type).emits_autoscale_event())
+ is_autoscaling = False
+
  def _evaluate_timeout() -> bool:
  # If timeout is negative, retry indefinitely.
  if timeout < 0:
@@ -277,10 +431,13 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
  while _evaluate_timeout():
  # Get all pods in a single API call using the cluster name label
  # which all pods in new_nodes should share
- cluster_name = new_nodes[0].metadata.labels[TAG_SKYPILOT_CLUSTER_NAME]
+ cluster_name_on_cloud = new_nodes[0].metadata.labels[
+ constants.TAG_SKYPILOT_CLUSTER_NAME]
  pods = kubernetes.core_api(context).list_namespaced_pod(
  namespace,
- label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
+ label_selector=
+ f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
+ ).items

  # Get the set of found pod names and check if we have all expected pods
  found_pod_names = {pod.metadata.name for pod in pods}
@@ -304,6 +461,26 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):

  if all_scheduled:
  return
+
+ # Check if cluster is autoscaling and update spinner message.
+ # Minor optimization to not query k8s api after autoscaling
+ # event was detected. This is useful because there isn't any
+ # autoscaling complete event.
+ if autoscaler_is_set and not is_autoscaling:
+ if use_heuristic_detection:
+ is_autoscaling = _cluster_maybe_autoscaling(
+ namespace, context, create_pods_start)
+ msg = 'Kubernetes cluster may be scaling up'
+ else:
+ is_autoscaling = _cluster_had_autoscale_event(
+ namespace, context, create_pods_start)
+ msg = 'Kubernetes cluster is autoscaling'
+
+ if is_autoscaling:
+ rich_utils.force_update_status(
+ ux_utils.spinner_message(f'Launching ({msg})',
+ cluster_name=cluster_name))
+
  time.sleep(1)

  # Handle pod scheduling errors
@@ -319,17 +496,17 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):


  @timeline.event
- def _wait_for_pods_to_run(namespace, context, new_nodes):
+ def _wait_for_pods_to_run(namespace, context, cluster_name, new_pods):
  """Wait for pods and their containers to be ready.

  Pods may be pulling images or may be in the process of container
  creation.
  """
- if not new_nodes:
+ if not new_pods:
  return

  # Create a set of pod names we're waiting for
- expected_pod_names = {node.metadata.name for node in new_nodes}
+ expected_pod_names = {pod.metadata.name for pod in new_pods}

  def _check_init_containers(pod):
  # Check if any of the init containers failed
@@ -356,37 +533,40 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
  'Failed to create init container for pod '
  f'{pod.metadata.name}. Error details: {msg}.')

- while True:
- # Get all pods in a single API call
- cluster_name = new_nodes[0].metadata.labels[TAG_SKYPILOT_CLUSTER_NAME]
- all_pods = kubernetes.core_api(context).list_namespaced_pod(
- namespace,
- label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
-
- # Get the set of found pod names and check if we have all expected pods
- found_pod_names = {pod.metadata.name for pod in all_pods}
- missing_pods = expected_pod_names - found_pod_names
- if missing_pods:
- logger.info('Retrying running pods check: '
- f'Missing pods: {missing_pods}')
- time.sleep(0.5)
- continue
-
- all_pods_running = True
- for pod in all_pods:
- if pod.metadata.name not in expected_pod_names:
- continue
- # Continue if pod and all the containers within the
- # pod are successfully created and running.
- if pod.status.phase == 'Running' and all(
- container.state.running
- for container in pod.status.container_statuses):
- continue
-
- all_pods_running = False
- if pod.status.phase == 'Pending':
- # Iterate over each container in pod to check their status
- for container_status in pod.status.container_statuses:
+ def _inspect_pod_status(pod):
+ # Check if pod is terminated/preempted/failed.
+ if (pod.metadata.deletion_timestamp is not None or
+ pod.status.phase == 'Failed'):
+ # Get the reason and write to cluster events before
+ # the pod gets completely deleted from the API.
+ termination_reason = _get_pod_termination_reason(pod, cluster_name)
+ logger.warning(
+ f'Pod {pod.metadata.name} terminated: {termination_reason}')
+ raise config_lib.KubernetesError(
+ f'Pod {pod.metadata.name} has terminated or failed '
+ f'unexpectedly. Run `sky logs --provision {cluster_name}` '
+ 'for more details.')
+
+ container_statuses = pod.status.container_statuses
+ # Continue if pod and all the containers within the
+ # pod are successfully created and running.
+ if (pod.status.phase == 'Running' and container_statuses is not None and
+ all(container.state.running
+ for container in container_statuses)):
+ return True, None
+
+ reason: Optional[str] = None
+ if pod.status.phase == 'Pending':
+ pending_reason = _get_pod_pending_reason(context, namespace,
+ pod.metadata.name)
+ if pending_reason is not None:
+ reason, message = pending_reason
+ logger.debug(f'Pod {pod.metadata.name} is pending: '
+ f'{reason}: {message}')
+
+ # Iterate over each container in pod to check their status
+ if container_statuses is not None:
+ for container_status in container_statuses:
  # If the container wasn't in 'ContainerCreating'
  # state, then we know pod wasn't scheduled or
  # had some other error, such as image pull error.
@@ -397,43 +577,86 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
  if waiting.reason == 'PodInitializing':
  _check_init_containers(pod)
  elif waiting.reason != 'ContainerCreating':
- msg = waiting.message if waiting.message else str(
- waiting)
+ msg = waiting.message if (
+ waiting.message) else str(waiting)
  raise config_lib.KubernetesError(
  'Failed to create container while launching '
  f'the node. Error details: {msg}.')
- # Reaching this point means that one of the pods had an issue,
- # so break out of the loop, and wait until next second.
- break
+ return False, reason
+
+ missing_pods_retry = 0
+ last_status_msg: Optional[str] = None
+ while True:
+ # Get all pods in a single API call
+ cluster_name_on_cloud = new_pods[0].metadata.labels[
+ constants.TAG_SKYPILOT_CLUSTER_NAME]
+ all_pods = kubernetes.core_api(context).list_namespaced_pod(
+ namespace,
+ label_selector=
+ f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
+ ).items
+
+ # Get the set of found pod names and check if we have all expected pods
+ found_pod_names = {pod.metadata.name for pod in all_pods}
+ missing_pod_names = expected_pod_names - found_pod_names
+ if missing_pod_names:
+ # In _wait_for_pods_to_schedule, we already wait for all pods to go
+ # from pending to scheduled. So if a pod is missing here, it means
+ # something unusual must have happened, and so should be treated as
+ # an exception.
+ # It is also only in _wait_for_pods_to_schedule that
+ # provision_timeout is used.
+ # TODO(kevin): Should we take provision_timeout into account here,
+ # instead of hardcoding the number of retries?
+ if missing_pods_retry >= _MAX_MISSING_PODS_RETRIES:
+ for pod_name in missing_pod_names:
+ reason = _get_pod_missing_reason(context, namespace,
+ cluster_name, pod_name)
+ logger.warning(f'Pod {pod_name} missing: {reason}')
+ raise config_lib.KubernetesError(
+ f'Failed to get all pods after {missing_pods_retry} '
+ f'retries. Some pods may have been terminated or failed '
+ f'unexpectedly. Run `sky logs --provision {cluster_name}` '
+ 'for more details.')
+ logger.info('Retrying running pods check: '
+ f'Missing pods: {missing_pod_names}')
+ time.sleep(0.5)
+ missing_pods_retry += 1
+ continue
+
+ pods_to_check = [
+ pod for pod in all_pods if pod.metadata.name in expected_pod_names
+ ]
+ pod_statuses = subprocess_utils.run_in_parallel(_inspect_pod_status,
+ pods_to_check,
+ _NUM_THREADS)
+
+ all_pods_running = True
+ pending_reasons_count: Dict[str, int] = {}
+ for is_running, pending_reason in pod_statuses:
+ if not is_running:
+ all_pods_running = False
+ if pending_reason is not None:
+ pending_reasons_count[pending_reason] = (
+ pending_reasons_count.get(pending_reason, 0) + 1)

  if all_pods_running:
  break
- time.sleep(1)
-

- def _run_function_with_retries(func: Callable,
- operation_name: str,
- max_retries: int = _MAX_RETRIES,
- retry_delay: int = 5) -> Any:
- """Runs a function with retries on Kubernetes errors.
- Args:
- func: Function to retry
- operation_name: Name of the operation for logging
- max_retries: Maximum number of retry attempts
- retry_delay: Delay between retries in seconds
- Raises:
- The last exception encountered if all retries fail.
- """
- for attempt in range(max_retries + 1):
- try:
- return func()
- except config_lib.KubernetesError:
- if attempt < max_retries:
- logger.warning(f'Failed to {operation_name} - '
- f'retrying in {retry_delay} seconds.')
- time.sleep(retry_delay)
- else:
- raise
+ if pending_reasons_count:
+ msg = ', '.join([
+ f'{count} pod(s) pending due to {reason}'
+ for reason, count in sorted(pending_reasons_count.items())
+ ])
+ status_text = f'Launching ({msg})'
+ else:
+ status_text = 'Launching'
+ new_status_msg = ux_utils.spinner_message(status_text,
+ cluster_name=cluster_name)
+ if new_status_msg != last_status_msg:
+ rich_utils.force_update_status(new_status_msg)
+ last_status_msg = new_status_msg
+ time.sleep(1)


  @timeline.event
@@ -670,26 +893,11 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
  raise e


- def _create_persistent_volume_claim(namespace: str, context: Optional[str],
- pvc_spec: Dict[str, Any]) -> None:
- """Creates a persistent volume claim for SkyServe controller."""
- try:
- kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
- name=pvc_spec['metadata']['name'], namespace=namespace)
- return
- except kubernetes.api_exception() as e:
- if e.status != 404: # Not found
- raise
-
- kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
- namespace=namespace, body=pvc_spec)
-
-
  @timeline.event
  def _wait_for_deployment_pod(context,
  namespace,
  deployment,
- timeout=60) -> List:
+ timeout=300) -> List:
  label_selector = ','.join([
  f'{key}={value}'
  for key, value in deployment.spec.selector.match_labels.items()
@@ -721,13 +929,14 @@ def _wait_for_deployment_pod(context,


  @timeline.event
- def _create_pods(region: str, cluster_name_on_cloud: str,
+ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
  config: common.ProvisionConfig) -> common.ProvisionRecord:
  """Create pods based on the config."""
  provider_config = config.provider_config
  namespace = kubernetes_utils.get_namespace_from_config(provider_config)
  context = kubernetes_utils.get_context_from_config(provider_config)
  pod_spec = copy.deepcopy(config.node_config)
+ create_pods_start = datetime.datetime.now(datetime.timezone.utc)

  to_create_deployment = 'deployment_spec' in pod_spec
  if to_create_deployment:
@@ -744,7 +953,26 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  else:
  pod_spec['metadata']['labels'] = tags
  pod_spec['metadata']['labels'].update(
- {TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
+ {constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
+
+ ephemeral_volumes = provider_config.get('ephemeral_volume_infos')
+ if ephemeral_volumes:
+ for ephemeral_volume in ephemeral_volumes:
+ # Update the volumes and volume mounts in the pod spec
+ if 'volumes' not in pod_spec['spec']:
+ pod_spec['spec']['volumes'] = []
+ pod_spec['spec']['volumes'].append({
+ 'name': ephemeral_volume.name,
+ 'persistentVolumeClaim': {
+ 'claimName': ephemeral_volume.volume_name_on_cloud,
+ },
+ })
+ if 'volumeMounts' not in pod_spec['spec']['containers'][0]:
+ pod_spec['spec']['containers'][0]['volumeMounts'] = []
+ pod_spec['spec']['containers'][0]['volumeMounts'].append({
+ 'name': ephemeral_volume.name,
+ 'mountPath': ephemeral_volume.path,
+ })

  terminating_pods = kubernetes_utils.filter_pods(namespace, context, tags,
  ['Terminating'])
@@ -776,8 +1004,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  running_pods = kubernetes_utils.filter_pods(namespace, context, tags,
  ['Pending', 'Running'])
  head_pod_name = _get_head_pod_name(running_pods)
+ running_pod_statuses = [{
+ pod.metadata.name: pod.status.phase
+ } for pod in running_pods.values()]
  logger.debug(f'Found {len(running_pods)} existing pods: '
- f'{list(running_pods.keys())}')
+ f'{running_pod_statuses}')

  to_start_count = config.count - len(running_pods)
  if to_start_count < 0:
@@ -793,7 +1024,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  nvidia_runtime_exists = False
  try:
  nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class(
- context)
+ context=context)
  except kubernetes.kubernetes.client.ApiException as e:
  logger.warning('run_instances: Error occurred while checking for '
  f'nvidia RuntimeClass - '
@@ -804,14 +1035,18 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
804
1035
  'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html') # pylint: disable=line-too-long
805
1036
 
806
1037
  needs_gpus = False
1038
+ needs_gpus_nvidia = False
807
1039
  limits = pod_spec['spec']['containers'][0].get('resources',
808
1040
  {}).get('limits')
809
1041
  if limits is not None:
810
- needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(), 0) > 0
1042
+ needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(context),
1043
+ 0) > 0
1044
+ needs_gpus_nvidia = limits.get(
1045
+ kubernetes_utils.SUPPORTED_GPU_RESOURCE_KEYS['nvidia'], 0) > 0
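To make the two flags above concrete, here is a rough sketch of their intent, assuming 'nvidia.com/gpu' and 'amd.com/gpu' as example resource keys (the real keys come from kubernetes_utils.get_gpu_resource_key and SUPPORTED_GPU_RESOURCE_KEYS):

from typing import Dict, Tuple


def gpu_flags(limits: Dict[str, int],
              cluster_gpu_key: str) -> Tuple[bool, bool]:
    # needs_gpus: any GPU resource is requested under this cluster's key.
    # needs_gpus_nvidia: specifically an NVIDIA GPU, which is what gates
    # setting runtimeClassName to 'nvidia' below.
    needs_gpus = limits.get(cluster_gpu_key, 0) > 0
    needs_gpus_nvidia = limits.get('nvidia.com/gpu', 0) > 0
    return needs_gpus, needs_gpus_nvidia


assert gpu_flags({'nvidia.com/gpu': 1}, 'nvidia.com/gpu') == (True, True)
assert gpu_flags({'amd.com/gpu': 1}, 'amd.com/gpu') == (True, False)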
811
1046
 
812
1047
  # TPU pods provisioned on GKE use the default containerd runtime.
813
1048
  # Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview # pylint: disable=line-too-long
814
- if nvidia_runtime_exists and needs_gpus:
1049
+ if nvidia_runtime_exists and needs_gpus_nvidia:
815
1050
  pod_spec['spec']['runtimeClassName'] = 'nvidia'
816
1051
 
817
1052
  logger.debug(f'run_instances: calling create_namespaced_pod '
@@ -819,19 +1054,46 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
819
1054
 
820
1055
  def _create_resource_thread(i: int):
821
1056
  pod_spec_copy = copy.deepcopy(pod_spec)
822
- if head_pod_name is None and i == 0:
823
- # First pod should be head if no head exists
824
- pod_spec_copy['metadata']['labels'].update(constants.HEAD_NODE_TAGS)
825
- head_selector = _head_service_selector(cluster_name_on_cloud)
826
- pod_spec_copy['metadata']['labels'].update(head_selector)
827
- pod_spec_copy['metadata']['name'] = f'{cluster_name_on_cloud}-head'
1057
+ # Index 0 is the head pod, while 1+ are worker pods.
1058
+ if i == 0:
1059
+ if head_pod_name is None:
1060
+ # First pod should be head if no head exists
1061
+ pod_spec_copy['metadata']['labels'].update(
1062
+ constants.HEAD_NODE_TAGS)
1063
+ head_selector = _head_service_selector(cluster_name_on_cloud)
1064
+ pod_spec_copy['metadata']['labels'].update(head_selector)
1065
+ pod_spec_copy['metadata'][
1066
+ 'name'] = f'{cluster_name_on_cloud}-head'
1067
+ else:
1068
+ # If head pod already exists, we skip creating it.
1069
+ return
828
1070
  else:
829
1071
  # Worker pods
830
1072
  pod_spec_copy['metadata']['labels'].update(
831
1073
  constants.WORKER_NODE_TAGS)
832
- pod_uuid = str(uuid.uuid4())[:6]
833
- pod_name = f'{cluster_name_on_cloud}-{pod_uuid}'
834
- pod_spec_copy['metadata']['name'] = f'{pod_name}-worker'
1074
+ pod_name = f'{cluster_name_on_cloud}-worker{i}'
1075
+ if pod_name in running_pods:
1076
+ # If the pod is already running, we skip creating it.
1077
+ return
1078
+ pod_spec_copy['metadata']['name'] = pod_name
1079
+ pod_spec_copy['metadata']['labels']['component'] = pod_name
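The naming scheme above is deterministic, which is what allows re-provisioning to skip pods that already exist; a minimal sketch of the same rule:

def pod_name_for_index(cluster_name_on_cloud: str, i: int) -> str:
    # Index 0 is the head pod; workers get a stable, index-based suffix so
    # that a relaunch after node failure recreates only the missing pods.
    if i == 0:
        return f'{cluster_name_on_cloud}-head'
    return f'{cluster_name_on_cloud}-worker{i}'


assert pod_name_for_index('sky-abc123', 0) == 'sky-abc123-head'
assert pod_name_for_index('sky-abc123', 2) == 'sky-abc123-worker2'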
1080
+
1081
+ # We need to keep the following fields in the pod spec the same for
1082
+ # head and worker pods, so that Kueue can merge them into a single
1083
+ # PodSet when creating a ProvisioningRequest to trigger scale-up of
1084
+ # the cluster autoscaler. This is especially required for the DWS
1085
+ # queued provisioning mode in GKE.
1086
+ # spec.containers[*].resources.requests
1087
+ # spec.initContainers[*].resources.requests
1088
+ # spec.resources
1089
+ # spec.nodeSelector
1090
+ # spec.tolerations
1091
+ # spec.affinity
1092
+ # resourceClaims
1093
+ # Refer to the following links for more details:
1094
+ # https://cloud.google.com/kubernetes-engine/docs/how-to/provisioningrequest#define_a_provisioningrequest_object # pylint: disable=line-too-long
1095
+ # https://kueue.sigs.k8s.io/docs/admission-check-controllers/provisioning/#podset-merge-policy # pylint: disable=line-too-long
1096
+ if config.count > 1:
835
1097
  # For multi-node support, we put a soft-constraint to schedule
836
1098
  # worker pods on different nodes than the head pod.
837
1099
  # This is not set as a hard constraint because if different nodes
@@ -850,7 +1112,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
850
1112
  'podAffinityTerm': {
851
1113
  'labelSelector': {
852
1114
  'matchExpressions': [{
853
- 'key': TAG_SKYPILOT_CLUSTER_NAME,
1115
+ 'key': constants.TAG_SKYPILOT_CLUSTER_NAME,
854
1116
  'operator': 'In',
855
1117
  'values': [cluster_name_on_cloud]
856
1118
  }]
@@ -883,9 +1145,25 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
883
1145
  pod_spec_copy['spec']['tolerations'] = existing_tolerations + [
884
1146
  tpu_toleration
885
1147
  ]
1148
+ # Add GPU toleration if GPU is requested.
1149
+ # The nodes provisioned by DWS with flex start with queued provisioning
1150
+ # mode have the GPU taint, so we have to add the GPU toleration.
1151
+ # No need to check if DWS is enabled here since this has no side effect
1152
+ # to the non-DWS case.
1153
+ if needs_gpus:
1154
+ gpu_toleration = {
1155
+ 'key': kubernetes_utils.get_gpu_resource_key(context),
1156
+ 'operator': 'Exists',
1157
+ 'effect': 'NoSchedule'
1158
+ }
1159
+ # Preserve existing tolerations if any
1160
+ existing_tolerations = pod_spec_copy['spec'].get('tolerations', [])
1161
+ pod_spec_copy['spec']['tolerations'] = existing_tolerations + [
1162
+ gpu_toleration
1163
+ ]
886
1164
 
887
1165
  if to_create_deployment:
888
- _create_persistent_volume_claim(namespace, context, pvc_spec)
1166
+ volume.create_persistent_volume_claim(namespace, context, pvc_spec)
889
1167
 
890
1168
  # It's safe to directly modify the template spec in the deployment spec
891
1169
  # because controller pod is singleton, i in [0].
@@ -893,9 +1171,12 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
893
1171
  # Add the deployment name as a label to the pod spec
894
1172
  deployment_name = deployment_spec['metadata']['name']
895
1173
  pod_spec_copy['metadata']['labels'][
896
- TAG_SKYPILOT_DEPLOYMENT_NAME] = deployment_name
1174
+ k8s_constants.TAG_SKYPILOT_DEPLOYMENT_NAME] = deployment_name
897
1175
  template_pod_spec['metadata'] = pod_spec_copy['metadata']
898
1176
  template_pod_spec['spec'].update(pod_spec_copy['spec'])
1177
+ # Propagate the labels to the deployment for identification.
1178
+ deployment_spec['metadata']['labels'] = pod_spec_copy['metadata'][
1179
+ 'labels']
899
1180
  try:
900
1181
  return kubernetes.apps_api(
901
1182
  context).create_namespaced_deployment(
@@ -904,6 +1185,10 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
904
1185
  print('Deployment failed', e)
905
1186
  raise e
906
1187
 
1188
+ # Check if any PVCs with access mode ReadWriteOnce or ReadWriteOncePod
1189
+ # are used by any pod in the namespace.
1190
+ volume.check_pvc_usage_for_pod(context, namespace, pod_spec_copy)
1191
+
907
1192
  return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
908
1193
  context)
909
1194
 
@@ -922,9 +1207,16 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
922
1207
  'and then up the cluster again.')
923
1208
  raise exceptions.InconsistentHighAvailabilityError(message)
924
1209
 
925
- # Create pods in parallel
926
- created_resources = subprocess_utils.run_in_parallel(
927
- _create_resource_thread, list(range(to_start_count)), _NUM_THREADS)
1210
+ created_resources = []
1211
+ if to_start_count > 0:
1212
+ # Create pods in parallel.
1213
+ # Use `config.count` instead of `to_start_count` to keep the Pod
1214
+ # indices consistent, especially when some Pods have gone down due to
1215
+ # node failure, manual termination, etc. and the cluster is launched
1216
+ # again to recreate them.
1217
+ # The existing Pods will be skipped in _create_resource_thread.
1218
+ created_resources = subprocess_utils.run_in_parallel(
1219
+ _create_resource_thread, list(range(config.count)), _NUM_THREADS)
928
1220
 
929
1221
  if to_create_deployment:
930
1222
  deployments = copy.deepcopy(created_resources)
@@ -937,20 +1229,22 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
937
1229
  pods = created_resources
938
1230
 
939
1231
  created_pods = {}
1232
+ valid_pods = []
940
1233
  for pod in pods:
1234
+ # In case Pod is not created
1235
+ if pod is None:
1236
+ continue
1237
+ valid_pods.append(pod)
941
1238
  created_pods[pod.metadata.name] = pod
942
1239
  if head_pod_name is None and _is_head(pod):
943
1240
  head_pod_name = pod.metadata.name
1241
+ pods = valid_pods
1242
+
1243
+ # The running_pods may include Pending Pods, so we add them to the pods
1244
+ # list to wait for scheduling and running
1245
+ if running_pods:
1246
+ pods = pods + list(running_pods.values())
944
1247
 
945
- networking_mode = network_utils.get_networking_mode(
946
- config.provider_config.get('networking_mode'))
947
- if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
948
- # Adding the jump pod to the new_nodes list as well so it can be
949
- # checked if it's scheduled and running along with other pods.
950
- ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
951
- jump_pod = kubernetes.core_api(context).read_namespaced_pod(
952
- ssh_jump_pod_name, namespace)
953
- pods.append(jump_pod)
954
1248
  provision_timeout = provider_config['timeout']
955
1249
 
956
1250
  wait_str = ('indefinitely'
@@ -960,12 +1254,21 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
960
1254
 
961
1255
  # Wait until the pods are scheduled and surface cause for error
962
1256
  # if there is one
963
- _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout)
1257
+ _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout,
1258
+ cluster_name, create_pods_start)
1259
+ # Reset spinner message here because it might have hinted at autoscaling
1260
+ # while waiting for pods to schedule.
1261
+ rich_utils.force_update_status(
1262
+ ux_utils.spinner_message('Launching', cluster_name=cluster_name))
964
1263
  # Wait until the pods and their containers are up and running, and
965
1264
  # fail early if there is an error
966
- logger.debug(f'run_instances: waiting for pods to be running (pulling '
967
- f'images): {[pod.metadata.name for pod in pods]}')
968
- _wait_for_pods_to_run(namespace, context, pods)
1265
+ logger.debug(f'run_instances: waiting for pods to be running: '
1266
+ f'{[pod.metadata.name for pod in pods]}')
1267
+ _wait_for_pods_to_run(namespace, context, cluster_name, pods)
1268
+ # Reset spinner message here because it might have hinted at the reason
1269
+ # pods were pending.
1270
+ rich_utils.force_update_status(
1271
+ ux_utils.spinner_message('Launching', cluster_name=cluster_name))
969
1272
  logger.debug(f'run_instances: all pods are scheduled and running: '
970
1273
  f'{[pod.metadata.name for pod in pods]}')
971
1274
 
@@ -981,11 +1284,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
981
1284
  )
982
1285
 
983
1286
 
984
- def run_instances(region: str, cluster_name_on_cloud: str,
1287
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
985
1288
  config: common.ProvisionConfig) -> common.ProvisionRecord:
986
1289
  """Runs instances for the given cluster."""
987
1290
  try:
988
- return _create_pods(region, cluster_name_on_cloud, config)
1291
+ return _create_pods(region, cluster_name, cluster_name_on_cloud, config)
989
1292
  except (kubernetes.api_exception(), config_lib.KubernetesError) as e:
990
1293
  e_msg = common_utils.format_exception(e).replace('\n', ' ')
991
1294
  logger.warning('run_instances: Error occurred when creating pods: '
@@ -1006,42 +1309,10 @@ def stop_instances(
1006
1309
  raise NotImplementedError()
1007
1310
 
1008
1311
 
1009
- def _delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
1010
- resource_name: str) -> None:
1011
- """Helper to delete Kubernetes resources with 404 handling and retries.
1012
-
1013
- Args:
1014
- delete_func: Function to call to delete the resource
1015
- resource_type: Type of resource being deleted (e.g. 'service'),
1016
- used in logging
1017
- resource_name: Name of the resource being deleted, used in logging
1018
- """
1019
- max_retries = 3
1020
- retry_delay = 5 # seconds
1021
-
1022
- for attempt in range(max_retries):
1023
- try:
1024
- delete_func()
1025
- return
1026
- except kubernetes.api_exception() as e:
1027
- if e.status == 404:
1028
- logger.warning(
1029
- f'terminate_instances: Tried to delete {resource_type} '
1030
- f'{resource_name}, but the {resource_type} was not '
1031
- 'found (404).')
1032
- return
1033
- elif attempt < max_retries - 1:
1034
- logger.warning(f'terminate_instances: Failed to delete '
1035
- f'{resource_type} {resource_name} (attempt '
1036
- f'{attempt + 1}/{max_retries}). Error: {e}. '
1037
- f'Retrying in {retry_delay} seconds...')
1038
- time.sleep(retry_delay)
1039
- else:
1040
- raise
1041
-
1042
-
1043
- def _delete_services(name_prefix: str, namespace: str,
1044
- context: Optional[str]) -> None:
1312
+ def _delete_services(name_prefix: str,
1313
+ namespace: str,
1314
+ context: Optional[str],
1315
+ skip_ssh_service: bool = False) -> None:
1045
1316
  """Delete services with the given name prefix.
1046
1317
 
1047
1318
  Args:
@@ -1050,18 +1321,21 @@ def _delete_services(name_prefix: str, namespace: str,
1050
1321
  context: Kubernetes context
1051
1322
  """
1052
1323
  # TODO(andy): We should use tag for the service filter.
1053
- for service_name in [name_prefix, f'{name_prefix}-ssh']:
1324
+ services = ([name_prefix, f'{name_prefix}-ssh']
1325
+ if not skip_ssh_service else [name_prefix])
1326
+ for service_name in services:
1054
1327
  # Since we are not saving this lambda, it's a false positive.
1055
1328
  # TODO(andyl): Wait for
1056
1329
  # https://github.com/pylint-dev/pylint/issues/5263.
1057
1330
  # pylint: disable=cell-var-from-loop
1058
- _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.core_api(
1059
- context).delete_namespaced_service(name=service_name,
1060
- namespace=namespace,
1061
- _request_timeout=config_lib.
1062
- DELETION_TIMEOUT),
1063
- resource_type='service',
1064
- resource_name=service_name)
1331
+ kubernetes_utils.delete_k8s_resource_with_retry(
1332
+ delete_func=lambda: kubernetes.core_api(
1333
+ context).delete_namespaced_service(name=service_name,
1334
+ namespace=namespace,
1335
+ _request_timeout=config_lib.
1336
+ DELETION_TIMEOUT),
1337
+ resource_type='service',
1338
+ resource_name=service_name)
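The pylint cell-var-from-loop disable above relates to Python's late binding of closures over loop variables; a small self-contained illustration of why calling the lambda within the same iteration (as delete_k8s_resource_with_retry does) is safe:

# Late binding: lambdas stored for later all see the loop variable's
# final value.
stored = [lambda: i for i in range(3)]
print([f() for f in stored])  # [2, 2, 2]

# Calling the lambda immediately, inside the same iteration, evaluates the
# variable before it changes -- which is the situation in the loop above.
called = [(lambda: i)() for i in range(3)]
print(called)  # [0, 1, 2]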
1065
1339
 
1066
1340
 
1067
1341
  def _terminate_node(namespace: str,
@@ -1075,13 +1349,16 @@ def _terminate_node(namespace: str,
1075
1349
  # Delete services for the head pod
1076
1350
  # services are specified in sky/templates/kubernetes-ray.yml.j2
1077
1351
  _delete_services(pod_name, namespace, context)
1352
+ else:
1353
+ # No ssh service is created for worker pods
1354
+ _delete_services(pod_name, namespace, context, skip_ssh_service=True)
1078
1355
 
1079
1356
  # Note - delete pod after all other resources are deleted.
1080
1357
  # This is to ensure there are no leftover resources if this down is run
1081
1358
  # from within the pod, e.g., for autodown.
1082
1359
  # Note - some misbehaving pods may not terminate gracefully if they have
1083
1360
  # open file descriptors. We force delete pods to avoid this.
1084
- _delete_k8s_resource_with_retry(
1361
+ kubernetes_utils.delete_k8s_resource_with_retry(
1085
1362
  delete_func=lambda: kubernetes.core_api(context).delete_namespaced_pod(
1086
1363
  name=pod_name,
1087
1364
  namespace=namespace,
@@ -1099,26 +1376,28 @@ def _terminate_deployment(cluster_name: str, namespace: str,
1099
1376
 
1100
1377
  # Delete deployment
1101
1378
  deployment_name = _get_deployment_name(cluster_name)
1102
- _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.apps_api(
1103
- context).delete_namespaced_deployment(name=deployment_name,
1104
- namespace=namespace,
1105
- _request_timeout=config_lib.
1106
- DELETION_TIMEOUT),
1107
- resource_type='deployment',
1108
- resource_name=deployment_name)
1379
+ kubernetes_utils.delete_k8s_resource_with_retry(
1380
+ delete_func=lambda: kubernetes.apps_api(
1381
+ context).delete_namespaced_deployment(name=deployment_name,
1382
+ namespace=namespace,
1383
+ _request_timeout=config_lib.
1384
+ DELETION_TIMEOUT),
1385
+ resource_type='deployment',
1386
+ resource_name=deployment_name)
1109
1387
 
1110
1388
  # Delete PVCs
1111
1389
  pvc_name = _get_pvc_name(
1112
1390
  cluster_name,
1113
1391
  kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME)
1114
1392
  # pylint: disable=cell-var-from-loop
1115
- _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.core_api(
1116
- context).delete_namespaced_persistent_volume_claim(
1117
- name=pvc_name,
1118
- namespace=namespace,
1119
- _request_timeout=config_lib.DELETION_TIMEOUT),
1120
- resource_type='pvc',
1121
- resource_name=pvc_name)
1393
+ kubernetes_utils.delete_k8s_resource_with_retry(
1394
+ delete_func=lambda: kubernetes.core_api(
1395
+ context).delete_namespaced_persistent_volume_claim(
1396
+ name=pvc_name,
1397
+ namespace=namespace,
1398
+ _request_timeout=config_lib.DELETION_TIMEOUT),
1399
+ resource_type='pvc',
1400
+ resource_name=pvc_name)
1122
1401
 
1123
1402
 
1124
1403
  def terminate_instances(
@@ -1133,18 +1412,6 @@ def terminate_instances(
1133
1412
  ray_tag_filter(cluster_name_on_cloud),
1134
1413
  None)
1135
1414
 
1136
- # Clean up the SSH jump pod if in use
1137
- networking_mode = network_utils.get_networking_mode(
1138
- provider_config.get('networking_mode'))
1139
- if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
1140
- pod_name = list(pods.keys())[0]
1141
- try:
1142
- kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context,
1143
- pod_name)
1144
- except Exception as e: # pylint: disable=broad-except
1145
- logger.warning('terminate_instances: Error occurred when analyzing '
1146
- f'SSH Jump pod: {e}')
1147
-
1148
1415
  if is_high_availability_cluster_by_kubectl(cluster_name_on_cloud, context,
1149
1416
  namespace):
1150
1417
  # For high availability controllers, terminate the deployment
@@ -1175,16 +1442,11 @@ def get_cluster_info(
1175
1442
 
1176
1443
  running_pods = kubernetes_utils.filter_pods(
1177
1444
  namespace, context, ray_tag_filter(cluster_name_on_cloud), ['Running'])
1445
+ logger.debug(f'Running pods: {list(running_pods.keys())}')
1178
1446
 
1179
1447
  pods: Dict[str, List[common.InstanceInfo]] = {}
1180
1448
  head_pod_name = None
1181
1449
 
1182
- port_forward_mode = kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD
1183
- network_mode_str = skypilot_config.get_nested(('kubernetes', 'networking'),
1184
- port_forward_mode.value)
1185
- network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
1186
- network_mode_str)
1187
- external_ip = kubernetes_utils.get_external_ip(network_mode, context)
1188
1450
  port = 22
1189
1451
  if not provider_config.get('use_internal_ips', False):
1190
1452
  port = kubernetes_utils.get_head_ssh_port(cluster_name_on_cloud,
@@ -1198,10 +1460,12 @@ def get_cluster_info(
1198
1460
  common.InstanceInfo(
1199
1461
  instance_id=pod_name,
1200
1462
  internal_ip=internal_ip,
1201
- external_ip=(None if network_mode == port_forward_mode else
1202
- external_ip),
1463
+ external_ip=None,
1203
1464
  ssh_port=port,
1204
1465
  tags=pod.metadata.labels,
1466
+ # TODO(hailong): `cluster.local` may need to be configurable
1467
+ # Service name is same as the pod name for now.
1468
+ internal_svc=f'{pod_name}.{namespace}.svc.cluster.local',
1205
1469
  )
1206
1470
  ]
1207
1471
  if _is_head(pod):
@@ -1210,10 +1474,16 @@ def get_cluster_info(
1210
1474
  assert head_spec is not None, pod
1211
1475
  cpu_request = head_spec.containers[0].resources.requests['cpu']
1212
1476
 
1213
- assert cpu_request is not None, 'cpu_request should not be None'
1477
+ if cpu_request is None:
1478
+ raise RuntimeError(f'Pod {cluster_name_on_cloud}-head not found'
1479
+ ' or not Running, check the Pod status')
1214
1480
 
1215
1481
  ssh_user = 'sky'
1216
- get_k8s_ssh_user_cmd = 'echo $(whoami)'
1482
+ # Use pattern matching to extract SSH user, handling MOTD contamination.
1483
+ # Some container images (like CUDA-Q) print MOTD when login shells start,
1484
+ # which can contaminate command output. We use a unique pattern to extract
1485
+ # the actual username reliably.
1486
+ get_k8s_ssh_user_cmd = 'echo "SKYPILOT_SSH_USER: $(whoami)"'
1217
1487
  assert head_pod_name is not None
1218
1488
  runner = command_runner.KubernetesCommandRunner(
1219
1489
  ((namespace, context), head_pod_name))
@@ -1223,10 +1493,24 @@ def get_cluster_info(
1223
1493
  stream_logs=False)
1224
1494
  _raise_command_running_error('get ssh user', get_k8s_ssh_user_cmd,
1225
1495
  head_pod_name, rc, stdout + stderr)
1226
- ssh_user = stdout.strip()
1496
+
1497
+ # Extract SSH user using pattern matching
1498
+ ssh_user_match = _SSH_USER_PATTERN.search(stdout)
1499
+ if ssh_user_match:
1500
+ ssh_user = ssh_user_match.group(1)
1501
+ else:
1502
+ raise ValueError('Failed to find SSH user identifier: '
1503
+ f'{stdout + stderr}')
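The definition of _SSH_USER_PATTERN sits outside this hunk; a plausible form, consistent with the SKYPILOT_SSH_USER marker emitted above, and a demonstration of why the marker survives MOTD noise:

import re

# Assumed shape of the module-level pattern; the actual definition is not
# shown in this diff hunk.
_SSH_USER_PATTERN = re.compile(r'SKYPILOT_SSH_USER: (\S+)')

stdout = ('Welcome to the NVIDIA CUDA-Q container!\n'  # MOTD noise
          'SKYPILOT_SSH_USER: sky\n')
match = _SSH_USER_PATTERN.search(stdout)
print(match.group(1) if match else None)  # sky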
1227
1504
  logger.debug(
1228
1505
  f'Using ssh user {ssh_user} for cluster {cluster_name_on_cloud}')
1229
1506
 
1507
+ # cpu_request may be a string like `100m`; parse it and convert to a float.
1508
+ num_cpus = kubernetes_utils.parse_cpu_or_gpu_resource_to_float(cpu_request)
1509
+ # 'num-cpus' for ray must be an integer, but we should not set it to 0 if
1510
+ # cpus is <1.
1511
+ # Keep consistent with the logic in clouds/kubernetes.py
1512
+ str_cpus = str(max(int(num_cpus), 1))
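parse_cpu_or_gpu_resource_to_float lives in kubernetes_utils and is not shown here; a simplified stand-in for the millicore case illustrates why the max(..., 1) clamp is needed:

def ray_num_cpus(cpu_request: str) -> str:
    # Kubernetes CPU quantities may be millicores, e.g. '100m' == 0.1 CPU.
    cpus = (float(cpu_request[:-1]) / 1000
            if cpu_request.endswith('m') else float(cpu_request))
    # Ray's num-cpus must be an integer, and reporting 0 for a fractional
    # request would make the node unschedulable, so clamp to at least 1.
    return str(max(int(cpus), 1))


assert ray_num_cpus('100m') == '1'
assert ray_num_cpus('8') == '8'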
1513
+
1230
1514
  return common.ClusterInfo(
1231
1515
  instances=pods,
1232
1516
  head_instance_id=head_pod_name,
@@ -1236,56 +1520,410 @@ def get_cluster_info(
1236
1520
  # problems for other pods.
1237
1521
  custom_ray_options={
1238
1522
  'object-store-memory': 500000000,
1239
- 'num-cpus': cpu_request,
1523
+ 'num-cpus': str_cpus,
1240
1524
  },
1241
1525
  provider_name='kubernetes',
1242
1526
  provider_config=provider_config)
1243
1527
 
1244
1528
 
1529
+ def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
1530
+ """Get pod termination reason and write to cluster events.
1531
+
1532
+ Checks both pod conditions (for preemption/disruption) and
1533
+ container statuses (for exit codes/errors).
1534
+ """
1535
+ latest_timestamp = pod.status.start_time or datetime.datetime.min
1536
+ ready_state = 'Unknown'
1537
+ termination_reason = 'Terminated unexpectedly'
1538
+ container_reasons = []
1539
+
1540
+ # Check pod status conditions for high level overview.
1541
+ # No need to sort, as each condition.type will only appear once.
1542
+ for condition in pod.status.conditions:
1543
+ reason = condition.reason or 'Unknown reason'
1544
+ message = condition.message or ''
1545
+
1546
+ # Get last known readiness state.
1547
+ if condition.type == 'Ready':
1548
+ ready_state = f'{reason} ({message})' if message else reason
1549
+ # Kueue preemption, as defined in:
1550
+ # https://pkg.go.dev/sigs.k8s.io/kueue/pkg/controller/jobs/pod#pkg-constants
1551
+ elif condition.type == 'TerminationTarget':
1552
+ termination_reason = f'Preempted by Kueue: {reason}'
1553
+ if message:
1554
+ termination_reason += f' ({message})'
1555
+ # Generic disruption.
1556
+ elif condition.type == 'DisruptionTarget':
1557
+ termination_reason = f'Disrupted: {reason}'
1558
+ if message:
1559
+ termination_reason += f' ({message})'
1560
+
1561
+ if condition.last_transition_time is not None:
1562
+ latest_timestamp = max(latest_timestamp,
1563
+ condition.last_transition_time)
1564
+
1565
+ pod_reason = (f'{termination_reason}.\n'
1566
+ f'Last known state: {ready_state}.')
1567
+
1568
+ # Check container statuses for exit codes/errors
1569
+ if pod.status and pod.status.container_statuses:
1570
+ for container_status in pod.status.container_statuses:
1571
+ terminated = container_status.state.terminated
1572
+ if terminated:
1573
+ exit_code = terminated.exit_code
1574
+ reason = terminated.reason
1575
+ if exit_code == 0:
1576
+ # skip exit 0 (non-failed) just for sanity
1577
+ logger.debug(f'{pod.metadata.name}/{container_status.name} '
1578
+ 'had exit code 0. Skipping.')
1579
+ continue
1580
+ if reason is None:
1581
+ # In case reason is None, fall back to the exit code for debugging.
1582
+ reason = f'exit({exit_code})'
1583
+ container_reasons.append(reason)
1584
+ latest_timestamp = max(latest_timestamp, terminated.finished_at)
1585
+
1586
+ # TODO (kyuds): later, if needed, query `last_state` too.
1587
+
1588
+ # Normally there is a single container per pod for SkyPilot,
1589
+ # but handle multiple containers just in case.
1590
+ if container_reasons:
1591
+ pod_reason += f'\nContainer errors: {" | ".join(container_reasons)}'
1592
+
1593
+ global_user_state.add_cluster_event(
1594
+ cluster_name,
1595
+ None,
1596
+ f'[kubernetes pod {pod.metadata.name} terminated] {pod_reason}',
1597
+ global_user_state.ClusterEventType.DEBUG,
1598
+ transitioned_at=int(latest_timestamp.timestamp()),
1599
+ )
1600
+ return pod_reason
1601
+
1602
+
1603
+ def _get_pod_events(context: Optional[str], namespace: str,
1604
+ pod_name: str) -> List[Any]:
1605
+ """Get the events for a pod, sorted by timestamp, most recent first."""
1606
+ pod_field_selector = (
1607
+ f'involvedObject.kind=Pod,involvedObject.name={pod_name}')
1608
+ pod_events = kubernetes.core_api(context).list_namespaced_event(
1609
+ namespace,
1610
+ field_selector=pod_field_selector,
1611
+ _request_timeout=kubernetes.API_TIMEOUT).items
1612
+ return sorted(
1613
+ pod_events,
1614
+ key=lambda event: event.metadata.creation_timestamp,
1615
+ # latest event appears first
1616
+ reverse=True)
1617
+
1618
+
1619
+ def _get_pod_pending_reason(context: Optional[str], namespace: str,
1620
+ pod_name: str) -> Optional[Tuple[str, str]]:
1621
+ """Get the reason why a pod is pending from its events.
1622
+
1623
+ Returns a (reason, message) tuple about why the pod is pending (e.g.,
1624
+ ("FailedMount", "hostPath type check failed")) or None if no reason found.
1625
+ """
1626
+ try:
1627
+ pod_events = _get_pod_events(context, namespace, pod_name)
1628
+ except Exception as e: # pylint: disable=broad-except
1629
+ logger.debug(f'Failed to get events for pod {pod_name}: {e}')
1630
+ return None
1631
+
1632
+ if not pod_events:
1633
+ return None
1634
+
1635
+ for event in pod_events:
1636
+ # Omit common events that do not indicate a pending reason.
1637
+ # We could also filter by event type 'Warning' or 'Error',
1638
+ # but there might be useful 'Normal' events such as pulling
1639
+ # image that we want to surface to the user.
1640
+ if event.reason not in COMMON_NON_PENDING_EVENT_REASONS:
1641
+ reason = event.reason or 'Unknown'
1642
+ message = event.message or ''
1643
+ return reason, message
1644
+
1645
+ return None
1646
+
1647
+
1648
+ def _get_pod_missing_reason(context: Optional[str], namespace: str,
1649
+ cluster_name: str, pod_name: str) -> Optional[str]:
1650
+ """Get events for missing pod and write to cluster events."""
1651
+ logger.debug(f'Analyzing events for pod {pod_name}')
1652
+ pod_events = _get_pod_events(context, namespace, pod_name)
1653
+ last_scheduled_node = None
1654
+ insert_new_pod_event = True
1655
+ new_event_inserted = False
1656
+ inserted_pod_events = 0
1657
+
1658
+ for event in pod_events:
1659
+ if event.reason == 'Scheduled':
1660
+ pattern = r'Successfully assigned (\S+) to (\S+)'
1661
+ match = re.search(pattern, event.message)
1662
+ if match:
1663
+ scheduled_node = match.group(2)
1664
+ last_scheduled_node = scheduled_node
1665
+ if insert_new_pod_event:
1666
+ # Try inserting the latest events first. If the event is a
1667
+ # duplicate, it means the event (and any previous events) have
1668
+ # already been inserted - so do not insert further events.
1669
+ try:
1670
+ global_user_state.add_cluster_event(
1671
+ cluster_name,
1672
+ None, f'[kubernetes pod {pod_name}] '
1673
+ f'{event.reason} {event.message}',
1674
+ global_user_state.ClusterEventType.DEBUG,
1675
+ transitioned_at=int(
1676
+ event.metadata.creation_timestamp.timestamp()),
1677
+ expose_duplicate_error=True)
1678
+ logger.debug(f'[pod {pod_name}] encountered new pod event: '
1679
+ f'{event.metadata.creation_timestamp} '
1680
+ f'{event.reason} {event.message}')
1681
+ except db_utils.UniqueConstraintViolationError:
1682
+ insert_new_pod_event = False
1683
+ else:
1684
+ new_event_inserted = True
1685
+ inserted_pod_events += 1
1686
+
1687
+ logger.debug(f'[pod {pod_name}] processed {len(pod_events)} pod events and '
1688
+ f'inserted {inserted_pod_events} new pod events '
1689
+ 'previously unseen')
1690
+
1691
+ if last_scheduled_node is not None:
1692
+ node_field_selector = ('involvedObject.kind=Node,'
1693
+ f'involvedObject.name={last_scheduled_node}')
1694
+ node_events = kubernetes.core_api(context).list_namespaced_event(
1695
+ namespace,
1696
+ field_selector=node_field_selector,
1697
+ _request_timeout=kubernetes.API_TIMEOUT).items
1698
+ node_events = sorted(
1699
+ node_events,
1700
+ key=lambda event: event.metadata.creation_timestamp,
1701
+ # latest event appears first
1702
+ reverse=True)
1703
+ insert_new_node_event = True
1704
+ inserted_node_events = 0
1705
+ for event in node_events:
1706
+ if insert_new_node_event:
1707
+ # Try inserting the latest events first. If the event is a
1708
+ # duplicate, it means the event (and any previous events) have
1709
+ # already been inserted - so do not insert further events.
1710
+ try:
1711
+ global_user_state.add_cluster_event(
1712
+ cluster_name,
1713
+ None, f'[kubernetes node {last_scheduled_node}] '
1714
+ f'{event.reason} {event.message}',
1715
+ global_user_state.ClusterEventType.DEBUG,
1716
+ transitioned_at=int(
1717
+ event.metadata.creation_timestamp.timestamp()),
1718
+ expose_duplicate_error=True)
1719
+ logger.debug(
1720
+ f'[pod {pod_name}] encountered new node event: '
1721
+ f'{event.metadata.creation_timestamp} '
1722
+ f'{event.reason} {event.message}')
1723
+ except db_utils.UniqueConstraintViolationError:
1724
+ insert_new_node_event = False
1725
+ else:
1726
+ new_event_inserted = True
1727
+ inserted_node_events += 1
1728
+
1729
+ logger.debug(f'[pod {pod_name}: node {last_scheduled_node}] '
1730
+ f'processed {len(node_events)} node events and '
1731
+ f'inserted {inserted_node_events} new node events '
1732
+ 'previously unseen')
1733
+ else:
1734
+ logger.debug(f'[pod {pod_name}] could not determine the node '
1735
+ 'the pod was scheduled to')
1736
+
1737
+ if not new_event_inserted:
1738
+ # If new event is not inserted, there is no useful information to
1739
+ # return. Return None.
1740
+ return None
1741
+
1742
+ # Analyze the events for failure
1743
+ failure_reason = None
1744
+ failure_decisiveness = 0
1745
+
1746
+ def _record_failure_reason(reason: str, decisiveness: int):
1747
+ nonlocal failure_reason, failure_decisiveness
1748
+ if decisiveness > failure_decisiveness:
1749
+ failure_reason = reason
1750
+ failure_decisiveness = decisiveness
1751
+
1752
+ cluster_events = global_user_state.get_cluster_events(
1753
+ cluster_name, None, global_user_state.ClusterEventType.DEBUG)
1754
+ for event in cluster_events:
1755
+ if event.startswith('[kubernetes pod'):
1756
+ event = event.split(']')[1].strip()
1757
+ elif event.startswith('[kubernetes node'):
1758
+ event = event.split(']')[1].strip()
1759
+
1760
+ if event.startswith('NodeNotReady '):
1761
+ _record_failure_reason(event[len('NodeNotReady '):], 1)
1762
+ elif event.startswith('TaintManagerEviction '):
1763
+ # usually the event message for TaintManagerEviction is not useful
1764
+ # so we record a more generic message.
1765
+ _record_failure_reason('pod was evicted by taint manager', 2)
1766
+ elif event.startswith('DeletingNode '):
1767
+ _record_failure_reason(event[len('DeletingNode '):], 3)
1768
+ return failure_reason
1769
+
1770
+
1771
+ def list_namespaced_pod(context: Optional[str], namespace: str,
1772
+ cluster_name_on_cloud: str, is_ssh: bool, identity: str,
1773
+ label_selector: str) -> List[Any]:
1774
+ # Get all the pods with the label skypilot-cluster-name: <cluster_name>
1775
+ try:
1776
+ # log the query parameters we pass to the k8s api
1777
+ logger.debug(f'Querying k8s api for pods:\n'
1778
+ f'context: {context}\n'
1779
+ f'namespace: {namespace}\n'
1780
+ f'label selector:`{label_selector}`.')
1781
+
1782
+ response = kubernetes.core_api(context).list_namespaced_pod(
1783
+ namespace,
1784
+ label_selector=label_selector,
1785
+ _request_timeout=kubernetes.API_TIMEOUT)
1786
+
1787
+ # log PodList response info
1788
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
1789
+ logger.debug(f'k8s api response for `{label_selector}`:\n'
1790
+ f'apiVersion={response.api_version}, '
1791
+ f'kind={response.kind},\n'
1792
+ f'metadata={response.metadata}')
1793
+
1794
+ pods = response.items
1795
+
1796
+ # log detailed Pod info
1797
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
1798
+ logger.debug(f'k8s api response for `{label_selector}`: '
1799
+ f'len(pods)={len(pods)}')
1800
+ for pod in pods:
1801
+ logger.debug(f'k8s pod info for `{label_selector}`: '
1802
+ f'pod.apiVersion={pod.api_version}, '
1803
+ f'pod.kind={pod.kind}, \n'
1804
+ f'pod.name={pod.metadata.name}, '
1805
+ f'pod.namespace={pod.metadata.namespace}, \n'
1806
+ f'pod.labels={pod.metadata.labels}, \n'
1807
+ f'pod.annotations={pod.metadata.annotations}, \n'
1808
+ 'pod.creationTimestamp='
1809
+ f'{pod.metadata.creation_timestamp}, '
1810
+ 'pod.deletionTimestamp='
1811
+ f'{pod.metadata.deletion_timestamp}, \n'
1812
+ f'pod.status={pod.status}')
1813
+ return pods
1814
+
1815
+ except kubernetes.max_retry_error():
1816
+ with ux_utils.print_exception_no_traceback():
1817
+ if is_ssh:
1818
+ node_pool = common_utils.removeprefix(context,
1819
+ 'ssh-') if context else ''
1820
+ msg = (
1821
+ f'Cannot connect to SSH Node Pool {node_pool}. '
1822
+ 'Please check if the SSH Node Pool is up and accessible. '
1823
+ 'To debug, run `sky check ssh` to check the status of '
1824
+ 'the SSH Node Pool.')
1825
+ else:
1826
+ ctx = kubernetes_utils.get_current_kube_config_context_name()
1827
+ msg = (f'Network error - check if the {identity} in '
1828
+ f'context {ctx} is up and accessible.')
1829
+ raise exceptions.ClusterStatusFetchingError(
1830
+ f'Failed to query cluster {cluster_name_on_cloud!r} status. ' +
1831
+ msg) from None
1832
+ except Exception as e: # pylint: disable=broad-except
1833
+ with ux_utils.print_exception_no_traceback():
1834
+ raise exceptions.ClusterStatusFetchingError(
1835
+ f'Failed to query {identity} {cluster_name_on_cloud!r} '
1836
+ f'status: {common_utils.format_exception(e)}')
1837
+
1838
+
1245
1839
  def query_instances(
1840
+ cluster_name: str,
1246
1841
  cluster_name_on_cloud: str,
1247
1842
  provider_config: Optional[Dict[str, Any]] = None,
1248
- non_terminated_only: bool = True
1249
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
1843
+ non_terminated_only: bool = True,
1844
+ retry_if_missing: bool = False,
1845
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
1846
+ # Mapping from pod phase to skypilot status. These are the only valid pod
1847
+ # phases.
1848
+ # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
1250
1849
  status_map = {
1251
1850
  'Pending': status_lib.ClusterStatus.INIT,
1252
1851
  'Running': status_lib.ClusterStatus.UP,
1253
- 'Failed': None,
1852
+ 'Failed': status_lib.ClusterStatus.INIT,
1254
1853
  'Unknown': None,
1255
1854
  'Succeeded': None,
1256
- 'Terminating': None,
1257
1855
  }
1258
1856
 
1259
1857
  assert provider_config is not None
1260
1858
  namespace = kubernetes_utils.get_namespace_from_config(provider_config)
1261
1859
  context = kubernetes_utils.get_context_from_config(provider_config)
1262
-
1263
- # Get all the pods with the label skypilot-cluster: <cluster_name>
1264
- try:
1265
- pods = kubernetes.core_api(context).list_namespaced_pod(
1266
- namespace,
1267
- label_selector=f'skypilot-cluster={cluster_name_on_cloud}',
1268
- _request_timeout=kubernetes.API_TIMEOUT).items
1269
- except kubernetes.max_retry_error():
1270
- with ux_utils.print_exception_no_traceback():
1271
- ctx = kubernetes_utils.get_current_kube_config_context_name()
1272
- raise exceptions.ClusterStatusFetchingError(
1273
- f'Failed to query cluster {cluster_name_on_cloud!r} status. '
1274
- 'Network error - check if the Kubernetes cluster in '
1275
- f'context {ctx} is up and accessible.') from None
1276
- except Exception as e: # pylint: disable=broad-except
1277
- with ux_utils.print_exception_no_traceback():
1278
- raise exceptions.ClusterStatusFetchingError(
1279
- f'Failed to query Kubernetes cluster {cluster_name_on_cloud!r} '
1280
- f'status: {common_utils.format_exception(e)}')
1860
+ is_ssh = context.startswith('ssh-') if context else False
1861
+ identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'
1862
+ label_selector = (f'{constants.TAG_SKYPILOT_CLUSTER_NAME}='
1863
+ f'{cluster_name_on_cloud}')
1864
+
1865
+ attempts = 0
1866
+ pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
1867
+ is_ssh, identity, label_selector)
1868
+ # When we see no pods returned from the k8s api, we assume the pods have
1869
+ # been terminated by the user directly and mark the cluster as terminated
1870
+ # in the global user state.
1871
+ # We add retry logic here as an attempt to mitigate a leak caused by the
1872
+ # kubernetes api returning no pods despite the pods actually existing.
1873
+ while (retry_if_missing and not pods and
1874
+ attempts < _MAX_QUERY_INSTANCES_RETRIES):
1875
+ logger.debug(f'Retrying query of the k8s api for {cluster_name_on_cloud} '
1876
+ f'({attempts}/{_MAX_QUERY_INSTANCES_RETRIES} retries) '
1877
+ f'after {_QUERY_INSTANCES_RETRY_INTERVAL} seconds.')
1878
+ time.sleep(_QUERY_INSTANCES_RETRY_INTERVAL)
1879
+ attempts += 1
1880
+ pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
1881
+ is_ssh, identity, label_selector)
1882
+ if len(pods) > 0:
1883
+ logger.info(f'Found {len(pods)} pods for {label_selector} after '
1884
+ f'{attempts} retries.')
1281
1885
 
1282
1886
  # Check if the pods are running or pending
1283
- cluster_status = {}
1887
+ cluster_status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
1888
+ Optional[str]]] = {}
1284
1889
  for pod in pods:
1285
- pod_status = status_map[pod.status.phase]
1890
+ phase = pod.status.phase
1891
+ is_terminating = pod.metadata.deletion_timestamp is not None
1892
+ pod_status = status_map[phase]
1893
+ reason = None
1894
+ if phase in ('Failed', 'Unknown') or is_terminating:
1895
+ reason = _get_pod_termination_reason(pod, cluster_name)
1896
+ logger.debug(f'Pod Status ({phase}) Reason(s): {reason}')
1286
1897
  if non_terminated_only and pod_status is None:
1898
+ logger.debug(f'Pod {pod.metadata.name} is terminated, but '
1899
+ 'query_instances is called with '
1900
+ f'non_terminated_only=True. Phase: {phase}')
1287
1901
  continue
1288
- cluster_status[pod.metadata.name] = pod_status
1902
+ pod_name = pod.metadata.name
1903
+ reason = f'{pod_name}: {reason}' if reason is not None else None
1904
+ cluster_status[pod_name] = (pod_status, reason)
1905
+
1906
+ # Find the list of pod names that should be there
1907
+ # from k8s services. Filter duplicates as -ssh service
1908
+ # creates a duplicate entry.
1909
+ target_pod_names = list(
1910
+ set([
1911
+ service['spec']['selector']['component']
1912
+ for service in provider_config.get('services', [])
1913
+ ]))
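Assuming the head pod exposes both a plain service and an '-ssh' service with the same spec.selector.component (as the comment above notes), the set() collapses the duplicate:

services = [
    {'spec': {'selector': {'component': 'sky-abc123-head'}}},
    {'spec': {'selector': {'component': 'sky-abc123-head'}}},  # '-ssh' twin
]
target_pod_names = list(
    {s['spec']['selector']['component'] for s in services})
print(target_pod_names)  # ['sky-abc123-head']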
1914
+
1915
+ for target_pod_name in target_pod_names:
1916
+ if target_pod_name not in cluster_status:
1917
+ # If the pod is not in the cluster_status, it means it's not
1918
+ # running.
1919
+ # Analyze what happened to the pod based on events.
1920
+ reason = _get_pod_missing_reason(context, namespace, cluster_name,
1921
+ target_pod_name)
1922
+ reason = (f'{target_pod_name}: {reason}'
1923
+ if reason is not None else None)
1924
+ if not non_terminated_only:
1925
+ cluster_status[target_pod_name] = (None, reason)
1926
+
1289
1927
  return cluster_status
1290
1928
 
1291
1929
 
@@ -1307,7 +1945,8 @@ def get_command_runners(
1307
1945
 
1308
1946
  # Try to get deployment name from label first
1309
1947
  head_instance_info = instances[pod_name][0]
1310
- deployment = head_instance_info.tags.get(TAG_SKYPILOT_DEPLOYMENT_NAME)
1948
+ deployment = head_instance_info.tags.get(
1949
+ k8s_constants.TAG_SKYPILOT_DEPLOYMENT_NAME)
1311
1950
 
1312
1951
  node_list = [((namespace, context), pod_name)]
1313
1952
  head_runner = command_runner.KubernetesCommandRunner(