skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
@@ -3,8 +3,9 @@ import copy
  import datetime
  import json
  import re
+ import sys
  import time
- from typing import Any, Callable, Dict, List, Optional, Tuple, Union
+ from typing import Any, Dict, List, Optional, Tuple, Union

  from sky import exceptions
  from sky import global_user_state
@@ -16,13 +17,13 @@ from sky.provision import constants
  from sky.provision import docker_utils
  from sky.provision.kubernetes import config as config_lib
  from sky.provision.kubernetes import constants as k8s_constants
- from sky.provision.kubernetes import network_utils
  from sky.provision.kubernetes import utils as kubernetes_utils
  from sky.provision.kubernetes import volume
  from sky.utils import command_runner
  from sky.utils import common_utils
  from sky.utils import config_utils
  from sky.utils import kubernetes_enums
+ from sky.utils import rich_utils
  from sky.utils import status_lib
  from sky.utils import subprocess_utils
  from sky.utils import timeline
@@ -32,8 +33,18 @@ from sky.utils.db import db_utils
  POLL_INTERVAL = 2
  _TIMEOUT_FOR_POD_TERMINATION = 60 # 1 minutes
  _MAX_RETRIES = 3
+ _MAX_MISSING_PODS_RETRIES = 5
+ _MAX_QUERY_INSTANCES_RETRIES = 5
+ _QUERY_INSTANCES_RETRY_INTERVAL = .5
  _NUM_THREADS = subprocess_utils.get_parallel_threads('kubernetes')

+ COMMON_NON_PENDING_EVENT_REASONS = {
+ 'Scheduled', 'Created', 'Started', 'Failed', 'Pulled'
+ }
+
+ # Pattern to extract SSH user from command output, handling MOTD contamination
+ _SSH_USER_PATTERN = re.compile(r'SKYPILOT_SSH_USER: ([^\s\n]+)')
+
  logger = sky_logging.init_logger(__name__)


@@ -77,7 +88,7 @@ def is_high_availability_cluster_by_kubectl(
  context).list_namespaced_deployment(
  namespace,
  label_selector=
- f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
+ f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
  except kubernetes.api_exception():
  return False
  # It is a high availability cluster if there is at least one deployment
@@ -191,14 +202,20 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
  break
  if event_message is not None:
  if pod_status == 'Pending':
- logger.info(event_message)
+ out_of = {}
+ # key: resource name, value: (extra message, nice name)
  if 'Insufficient cpu' in event_message:
- raise config_lib.KubernetesError(
- _lack_resource_msg('CPU', pod, details=event_message))
+ out_of['CPU'] = (': Run \'kubectl get nodes -o '
+ 'custom-columns=NAME:.metadata.name,'
+ 'CPU:.status.allocatable.cpu\' to check '
+ 'the available CPUs on the node.', 'CPUs')
  if 'Insufficient memory' in event_message:
- raise config_lib.KubernetesError(
- _lack_resource_msg('memory', pod,
- details=event_message))
+ out_of['memory'] = (': Run \'kubectl get nodes -o '
+ 'custom-columns=NAME:.metadata.name,'
+ 'MEMORY:.status.allocatable.memory\' '
+ 'to check the available memory on the '
+ 'node.', 'Memory')
+
  # TODO(aylei): after switching from smarter-device-manager to
  # fusermount-server, we need a new way to check whether the
  # fusermount-server daemonset is ready.
@@ -206,41 +223,77 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
  key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
  for key in lf.get_label_keys()
  ]
- if pod.spec.node_selector:
- for label_key in pod.spec.node_selector.keys():
- if label_key in gpu_lf_keys:
- # TODO(romilb): We may have additional node
- # affinity selectors in the future - in that
- # case we will need to update this logic.
- # TODO(Doyoung): Update the error message raised
- # with the multi-host TPU support.
- gpu_resource_key = kubernetes_utils.get_gpu_resource_key(context) # pylint: disable=line-too-long
- if 'Insufficient google.com/tpu' in event_message:
- extra_msg = (
- f'Verify if '
- f'{pod.spec.node_selector[label_key]}'
- ' is available in the cluster. Note '
- 'that multi-host TPU podslices are '
- 'currently not unsupported.')
- raise config_lib.KubernetesError(
- _lack_resource_msg('TPU',
- pod,
- extra_msg,
- details=event_message))
- elif ((f'Insufficient {gpu_resource_key}'
- in event_message) or
- ('didn\'t match Pod\'s node affinity/selector'
- in event_message)):
- extra_msg = (
- f'Verify if any node matching label '
- f'{pod.spec.node_selector[label_key]} and '
- f'sufficient resource {gpu_resource_key} '
- f'is available in the cluster.')
- raise config_lib.KubernetesError(
- _lack_resource_msg('GPU',
- pod,
- extra_msg,
- details=event_message))
+ for label_key in gpu_lf_keys:
+ # TODO(romilb): We may have additional node
+ # affinity selectors in the future - in that
+ # case we will need to update this logic.
+ # TODO(Doyoung): Update the error message raised
+ # with the multi-host TPU support.
+ gpu_resource_key = kubernetes_utils.get_gpu_resource_key(
+ context) # pylint: disable=line-too-long
+ if ((f'Insufficient {gpu_resource_key}' in event_message) or
+ ('didn\'t match Pod\'s node affinity/selector'
+ in event_message) and pod.spec.node_selector):
+ if 'gpu' in gpu_resource_key.lower():
+ info_msg = (
+ ': Run \'sky show-gpus --infra kubernetes\' to '
+ 'see the available GPUs.')
+ else:
+ info_msg = ': '
+ if (pod.spec.node_selector and
+ label_key in pod.spec.node_selector):
+ extra_msg = (
+ f'Verify if any node matching label '
+ f'{pod.spec.node_selector[label_key]} and '
+ f'sufficient resource {gpu_resource_key} '
+ f'is available in the cluster.')
+ extra_msg = info_msg + ' ' + extra_msg
+ else:
+ extra_msg = info_msg
+ if gpu_resource_key not in out_of or len(
+ out_of[gpu_resource_key][0]) < len(extra_msg):
+ out_of[f'{gpu_resource_key}'] = (extra_msg, 'GPUs')
+
+ if len(out_of) > 0:
+ # We are out of some resources. We should raise an error.
+ rsrc_err_msg = 'Insufficient resource capacity on the '
+ rsrc_err_msg += 'cluster:\n'
+ out_of_keys = list(out_of.keys())
+ for i in range(len(out_of_keys)):
+ rsrc = out_of_keys[i]
+ (extra_msg, nice_name) = out_of[rsrc]
+ extra_msg = extra_msg if extra_msg else ''
+ if i == len(out_of_keys) - 1:
+ indent = '└──'
+ else:
+ indent = '├──'
+ rsrc_err_msg += (f'{indent} Cluster does not have '
+ f'sufficient {nice_name} for your request'
+ f'{extra_msg}')
+ if i != len(out_of_keys) - 1:
+ rsrc_err_msg += '\n'
+
+ # Emit the error message without logging prefixes for better UX.
+ tmp_handler = sky_logging.EnvAwareHandler(sys.stdout)
+ tmp_handler.flush = sys.stdout.flush # type: ignore
+ tmp_handler.setFormatter(sky_logging.NO_PREFIX_FORMATTER)
+ tmp_handler.setLevel(sky_logging.ERROR)
+ prev_propagate = logger.propagate
+ try:
+ logger.addHandler(tmp_handler)
+ logger.propagate = False
+ logger.error(ux_utils.error_message(f'{rsrc_err_msg}'))
+ finally:
+ logger.removeHandler(tmp_handler)
+ logger.propagate = prev_propagate
+ nice_names = [out_of[rsrc][1] for rsrc in out_of_keys]
+ raise config_lib.KubernetesError(
+ f'{timeout_err_msg} '
+ f'Pod status: {pod_status} '
+ f'Details: \'{event_message}\' ',
+ insufficent_resources=nice_names,
+ )
+
  raise config_lib.KubernetesError(f'{timeout_err_msg} '
  f'Pod status: {pod_status} '
  f'Details: \'{event_message}\' ')
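The temporary-handler block added above is a general Python logging pattern: attach a bare-format handler, silence propagation so the parent handler's prefixes stay out, emit once, then restore. A minimal stdlib-only sketch of the same idea (`EnvAwareHandler` and `NO_PREFIX_FORMATTER` are SkyPilot internals, so plain `StreamHandler` and `Formatter` stand in here):

```python
import logging
import sys

logging.basicConfig(format='%(levelname)s:%(name)s: %(message)s')
logger = logging.getLogger('demo')
logger.setLevel(logging.INFO)

def log_without_prefix(message: str) -> None:
    """Emit one message with no prefix, then restore normal behavior."""
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter('%(message)s'))  # bare message
    prev_propagate = logger.propagate
    try:
        logger.addHandler(handler)
        logger.propagate = False  # keep the root handler's prefix out
        logger.error(message)
    finally:
        logger.removeHandler(handler)
        logger.propagate = prev_propagate

logger.info('prefixed as usual')
log_without_prefix('bare error text, no prefix')
```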
@@ -256,8 +309,89 @@ def _raise_command_running_error(message: str, command: str, pod_name: str,
  f'code {rc}: {command!r}\nOutput: {stdout}.')


+ def _detect_cluster_event_reason_occurred(namespace, context, search_start,
+ reason) -> bool:
+
+ def _convert_to_utc(timestamp):
+ if timestamp.tzinfo is None:
+ return timestamp.replace(tzinfo=datetime.timezone.utc)
+ return timestamp.astimezone(datetime.timezone.utc)
+
+ def _get_event_timestamp(event):
+ if event.last_timestamp:
+ return event.last_timestamp
+ elif event.metadata.creation_timestamp:
+ return event.metadata.creation_timestamp
+ return None
+
+ events = kubernetes.core_api(context).list_namespaced_event(
+ namespace=namespace, field_selector=f'reason={reason}')
+ for event in events.items:
+ ts = _get_event_timestamp(event)
+ if ts and _convert_to_utc(ts) > search_start:
+ return True
+ return False
+
+
+ def _cluster_had_autoscale_event(namespace, context, search_start) -> bool:
+ """Detects whether the cluster had a autoscaling event after a
+ specified datetime. This only works when using cluster-autoscaler.
+
+ Args:
+ namespace: kubernetes namespace
+ context: kubernetes context
+ search_start (datetime.datetime): filter for events that occurred
+ after search_start
+
+ Returns:
+ A boolean whether the cluster has an autoscaling event or not.
+ """
+ assert namespace is not None
+
+ try:
+ return _detect_cluster_event_reason_occurred(namespace, context,
+ search_start,
+ 'TriggeredScaleUp')
+ except Exception as e: # pylint: disable=broad-except
+ logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
+ return False
+
+
+ def _cluster_maybe_autoscaling(namespace, context, search_start) -> bool:
+ """Detects whether a kubernetes cluster may have an autoscaling event.
+
+ This is not a definitive detection. FailedScheduling, which is an
+ event that can occur when not enough resources are present in the cluster,
+ which is a trigger for cluster autoscaling. However, FailedScheduling may
+ have occurred due to other reasons (cluster itself is abnormal).
+
+ Hence, this should only be used for autoscalers that don't emit the
+ TriggeredScaleUp event, e.g.: Karpenter.
+
+ Args:
+ namespace: kubernetes namespace
+ context: kubernetes context
+ search_start (datetime.datetime): filter for events that occurred
+ after search_start
+
+ Returns:
+ A boolean whether the cluster has an autoscaling event or not.
+ """
+ assert namespace is not None
+
+ try:
+ return _detect_cluster_event_reason_occurred(namespace, context,
+ search_start,
+ 'FailedScheduling')
+ except Exception as e: # pylint: disable=broad-except
+ logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
+ return False
+
+
  @timeline.event
- def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
+ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,
+ cluster_name: str,
+ create_pods_start: datetime.datetime):
  """Wait for all pods to be scheduled.

  Wait for all pods including jump pod to be scheduled, and if it
@@ -266,6 +400,9 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
  allocated and we can exit.

  If timeout is set to a negative value, this method will wait indefinitely.
+
+ Will update the spinner message to indicate autoscaling if autoscaling
+ is happening.
  """
  # Create a set of pod names we're waiting for
  if not new_nodes:
@@ -273,6 +410,18 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,
  expected_pod_names = {node.metadata.name for node in new_nodes}
  start_time = time.time()

+ # Variables for autoscaler detection
+ autoscaler_type = skypilot_config.get_effective_region_config(
+ cloud='kubernetes',
+ region=context,
+ keys=('autoscaler',),
+ default_value=None)
+ autoscaler_is_set = autoscaler_type is not None
+ use_heuristic_detection = (autoscaler_is_set and
+ not kubernetes_enums.KubernetesAutoscalerType(
+ autoscaler_type).emits_autoscale_event())
+ is_autoscaling = False
+
  def _evaluate_timeout() -> bool:
  # If timeout is negative, retry indefinitely.
  if timeout < 0:
@@ -282,12 +431,13 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,
  while _evaluate_timeout():
  # Get all pods in a single API call using the cluster name label
  # which all pods in new_nodes should share
- cluster_name = new_nodes[0].metadata.labels[
- k8s_constants.TAG_SKYPILOT_CLUSTER_NAME]
+ cluster_name_on_cloud = new_nodes[0].metadata.labels[
+ constants.TAG_SKYPILOT_CLUSTER_NAME]
  pods = kubernetes.core_api(context).list_namespaced_pod(
  namespace,
  label_selector=
- f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
+ f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
+ ).items

  # Get the set of found pod names and check if we have all expected pods
  found_pod_names = {pod.metadata.name for pod in pods}
@@ -311,6 +461,26 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,

  if all_scheduled:
  return
+
+ # Check if cluster is autoscaling and update spinner message.
+ # Minor optimization to not query k8s api after autoscaling
+ # event was detected. This is useful because there isn't any
+ # autoscaling complete event.
+ if autoscaler_is_set and not is_autoscaling:
+ if use_heuristic_detection:
+ is_autoscaling = _cluster_maybe_autoscaling(
+ namespace, context, create_pods_start)
+ msg = 'Kubernetes cluster may be scaling up'
+ else:
+ is_autoscaling = _cluster_had_autoscale_event(
+ namespace, context, create_pods_start)
+ msg = 'Kubernetes cluster is autoscaling'
+
+ if is_autoscaling:
+ rich_utils.force_update_status(
+ ux_utils.spinner_message(f'Launching ({msg})',
+ cluster_name=cluster_name))
+
  time.sleep(1)

  # Handle pod scheduling errors
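The autoscaling hints above rely on `_cluster_had_autoscale_event` / `_cluster_maybe_autoscaling`, which reduce to listing namespaced events filtered by reason and comparing UTC-normalized timestamps. A self-contained sketch of that pattern using the official `kubernetes` Python client (the namespace, reason, and lookback window are illustrative):

```python
import datetime
from kubernetes import client, config

def cluster_event_seen_since(namespace: str, reason: str,
                             since: datetime.datetime) -> bool:
    """Return True if any event with `reason` occurred after `since` (UTC)."""
    config.load_kube_config()
    v1 = client.CoreV1Api()
    events = v1.list_namespaced_event(namespace=namespace,
                                      field_selector=f'reason={reason}')
    for event in events.items:
        ts = event.last_timestamp or event.metadata.creation_timestamp
        if ts is None:
            continue
        if ts.tzinfo is None:  # treat naive timestamps as UTC
            ts = ts.replace(tzinfo=datetime.timezone.utc)
        if ts.astimezone(datetime.timezone.utc) > since:
            return True
    return False

# e.g. detect a cluster-autoscaler scale-up within the last 10 minutes:
start = (datetime.datetime.now(datetime.timezone.utc) -
         datetime.timedelta(minutes=10))
print(cluster_event_seen_since('default', 'TriggeredScaleUp', start))
```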
@@ -326,17 +496,17 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,


  @timeline.event
- def _wait_for_pods_to_run(namespace, context, new_nodes):
+ def _wait_for_pods_to_run(namespace, context, cluster_name, new_pods):
  """Wait for pods and their containers to be ready.

  Pods may be pulling images or may be in the process of container
  creation.
  """
- if not new_nodes:
+ if not new_pods:
  return

  # Create a set of pod names we're waiting for
- expected_pod_names = {node.metadata.name for node in new_nodes}
+ expected_pod_names = {pod.metadata.name for pod in new_pods}

  def _check_init_containers(pod):
  # Check if any of the init containers failed
@@ -363,39 +533,40 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
  'Failed to create init container for pod '
  f'{pod.metadata.name}. Error details: {msg}.')

- while True:
- # Get all pods in a single API call
- cluster_name = new_nodes[0].metadata.labels[
- k8s_constants.TAG_SKYPILOT_CLUSTER_NAME]
- all_pods = kubernetes.core_api(context).list_namespaced_pod(
- namespace,
- label_selector=
- f'{k8s_constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
-
- # Get the set of found pod names and check if we have all expected pods
- found_pod_names = {pod.metadata.name for pod in all_pods}
- missing_pods = expected_pod_names - found_pod_names
- if missing_pods:
- logger.info('Retrying running pods check: '
- f'Missing pods: {missing_pods}')
- time.sleep(0.5)
- continue
-
- all_pods_running = True
- for pod in all_pods:
- if pod.metadata.name not in expected_pod_names:
- continue
- # Continue if pod and all the containers within the
- # pod are successfully created and running.
- if pod.status.phase == 'Running' and all(
- container.state.running
- for container in pod.status.container_statuses):
- continue
-
- all_pods_running = False
- if pod.status.phase == 'Pending':
- # Iterate over each container in pod to check their status
- for container_status in pod.status.container_statuses:
+ def _inspect_pod_status(pod):
+ # Check if pod is terminated/preempted/failed.
+ if (pod.metadata.deletion_timestamp is not None or
+ pod.status.phase == 'Failed'):
+ # Get the reason and write to cluster events before
+ # the pod gets completely deleted from the API.
+ termination_reason = _get_pod_termination_reason(pod, cluster_name)
+ logger.warning(
+ f'Pod {pod.metadata.name} terminated: {termination_reason}')
+ raise config_lib.KubernetesError(
+ f'Pod {pod.metadata.name} has terminated or failed '
+ f'unexpectedly. Run `sky logs --provision {cluster_name}` '
+ 'for more details.')
+
+ container_statuses = pod.status.container_statuses
+ # Continue if pod and all the containers within the
+ # pod are successfully created and running.
+ if (pod.status.phase == 'Running' and container_statuses is not None and
+ all(container.state.running
+ for container in container_statuses)):
+ return True, None
+
+ reason: Optional[str] = None
+ if pod.status.phase == 'Pending':
+ pending_reason = _get_pod_pending_reason(context, namespace,
+ pod.metadata.name)
+ if pending_reason is not None:
+ reason, message = pending_reason
+ logger.debug(f'Pod {pod.metadata.name} is pending: '
+ f'{reason}: {message}')
+
+ # Iterate over each container in pod to check their status
+ if container_statuses is not None:
+ for container_status in container_statuses:
  # If the container wasn't in 'ContainerCreating'
  # state, then we know pod wasn't scheduled or
  # had some other error, such as image pull error.
@@ -406,43 +577,86 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
  if waiting.reason == 'PodInitializing':
  _check_init_containers(pod)
  elif waiting.reason != 'ContainerCreating':
- msg = waiting.message if waiting.message else str(
- waiting)
+ msg = waiting.message if (
+ waiting.message) else str(waiting)
  raise config_lib.KubernetesError(
  'Failed to create container while launching '
  f'the node. Error details: {msg}.')
- # Reaching this point means that one of the pods had an issue,
- # so break out of the loop, and wait until next second.
- break
+ return False, reason
+
+ missing_pods_retry = 0
+ last_status_msg: Optional[str] = None
+ while True:
+ # Get all pods in a single API call
+ cluster_name_on_cloud = new_pods[0].metadata.labels[
+ constants.TAG_SKYPILOT_CLUSTER_NAME]
+ all_pods = kubernetes.core_api(context).list_namespaced_pod(
+ namespace,
+ label_selector=
+ f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
+ ).items
+
+ # Get the set of found pod names and check if we have all expected pods
+ found_pod_names = {pod.metadata.name for pod in all_pods}
+ missing_pod_names = expected_pod_names - found_pod_names
+ if missing_pod_names:
+ # In _wait_for_pods_to_schedule, we already wait for all pods to go
+ # from pending to scheduled. So if a pod is missing here, it means
+ # something unusual must have happened, and so should be treated as
+ # an exception.
+ # It is also only in _wait_for_pods_to_schedule that
+ # provision_timeout is used.
+ # TODO(kevin): Should we take provision_timeout into account here,
+ # instead of hardcoding the number of retries?
+ if missing_pods_retry >= _MAX_MISSING_PODS_RETRIES:
+ for pod_name in missing_pod_names:
+ reason = _get_pod_missing_reason(context, namespace,
+ cluster_name, pod_name)
+ logger.warning(f'Pod {pod_name} missing: {reason}')
+ raise config_lib.KubernetesError(
+ f'Failed to get all pods after {missing_pods_retry} '
+ f'retries. Some pods may have been terminated or failed '
+ f'unexpectedly. Run `sky logs --provision {cluster_name}` '
+ 'for more details.')
+ logger.info('Retrying running pods check: '
+ f'Missing pods: {missing_pod_names}')
+ time.sleep(0.5)
+ missing_pods_retry += 1
+ continue
+
+ pods_to_check = [
+ pod for pod in all_pods if pod.metadata.name in expected_pod_names
+ ]
+ pod_statuses = subprocess_utils.run_in_parallel(_inspect_pod_status,
+ pods_to_check,
+ _NUM_THREADS)
+
+ all_pods_running = True
+ pending_reasons_count: Dict[str, int] = {}
+ for is_running, pending_reason in pod_statuses:
+ if not is_running:
+ all_pods_running = False
+ if pending_reason is not None:
+ pending_reasons_count[pending_reason] = (
+ pending_reasons_count.get(pending_reason, 0) + 1)

  if all_pods_running:
  break
- time.sleep(1)

-
- def _run_function_with_retries(func: Callable,
- operation_name: str,
- max_retries: int = _MAX_RETRIES,
- retry_delay: int = 5) -> Any:
- """Runs a function with retries on Kubernetes errors.
- Args:
- func: Function to retry
- operation_name: Name of the operation for logging
- max_retries: Maximum number of retry attempts
- retry_delay: Delay between retries in seconds
- Raises:
- The last exception encountered if all retries fail.
- """
- for attempt in range(max_retries + 1):
- try:
- return func()
- except config_lib.KubernetesError:
- if attempt < max_retries:
- logger.warning(f'Failed to {operation_name} - '
- f'retrying in {retry_delay} seconds.')
- time.sleep(retry_delay)
- else:
- raise
+ if pending_reasons_count:
+ msg = ', '.join([
+ f'{count} pod(s) pending due to {reason}'
+ for reason, count in sorted(pending_reasons_count.items())
+ ])
+ status_text = f'Launching ({msg})'
+ else:
+ status_text = 'Launching'
+ new_status_msg = ux_utils.spinner_message(status_text,
+ cluster_name=cluster_name)
+ if new_status_msg != last_status_msg:
+ rich_utils.force_update_status(new_status_msg)
+ last_status_msg = new_status_msg
+ time.sleep(1)


  @timeline.event
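The pending-reason tally in `_wait_for_pods_to_run` above is a plain dict count feeding the spinner text; an equivalent sketch with `collections.Counter` shows the shape of the message it produces (the pod statuses here are fabricated):

```python
from collections import Counter
from typing import List, Optional, Tuple

# (is_running, pending_reason) per pod -- fabricated sample data.
pod_statuses: List[Tuple[bool, Optional[str]]] = [
    (False, 'ImagePullBackOff'),
    (False, 'ContainerCreating'),
    (False, 'ContainerCreating'),
    (True, None),
]

# Count only pods that reported a pending reason.
pending = Counter(r for _, r in pod_statuses if r is not None)
if pending:
    msg = ', '.join(f'{count} pod(s) pending due to {reason}'
                    for reason, count in sorted(pending.items()))
    status_text = f'Launching ({msg})'
else:
    status_text = 'Launching'
print(status_text)
# Launching (2 pod(s) pending due to ContainerCreating,
#            1 pod(s) pending due to ImagePullBackOff)
```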
@@ -683,7 +897,7 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
  def _wait_for_deployment_pod(context,
  namespace,
  deployment,
- timeout=60) -> List:
+ timeout=300) -> List:
  label_selector = ','.join([
  f'{key}={value}'
  for key, value in deployment.spec.selector.match_labels.items()
@@ -715,13 +929,14 @@ def _wait_for_deployment_pod(context,


  @timeline.event
- def _create_pods(region: str, cluster_name_on_cloud: str,
+ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
  config: common.ProvisionConfig) -> common.ProvisionRecord:
  """Create pods based on the config."""
  provider_config = config.provider_config
  namespace = kubernetes_utils.get_namespace_from_config(provider_config)
  context = kubernetes_utils.get_context_from_config(provider_config)
  pod_spec = copy.deepcopy(config.node_config)
+ create_pods_start = datetime.datetime.now(datetime.timezone.utc)

  to_create_deployment = 'deployment_spec' in pod_spec
  if to_create_deployment:
@@ -738,7 +953,26 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  else:
  pod_spec['metadata']['labels'] = tags
  pod_spec['metadata']['labels'].update(
- {k8s_constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
+ {constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
+
+ ephemeral_volumes = provider_config.get('ephemeral_volume_infos')
+ if ephemeral_volumes:
+ for ephemeral_volume in ephemeral_volumes:
+ # Update the volumes and volume mounts in the pod spec
+ if 'volumes' not in pod_spec['spec']:
+ pod_spec['spec']['volumes'] = []
+ pod_spec['spec']['volumes'].append({
+ 'name': ephemeral_volume.name,
+ 'persistentVolumeClaim': {
+ 'claimName': ephemeral_volume.volume_name_on_cloud,
+ },
+ })
+ if 'volumeMounts' not in pod_spec['spec']['containers'][0]:
+ pod_spec['spec']['containers'][0]['volumeMounts'] = []
+ pod_spec['spec']['containers'][0]['volumeMounts'].append({
+ 'name': ephemeral_volume.name,
+ 'mountPath': ephemeral_volume.path,
+ })

  terminating_pods = kubernetes_utils.filter_pods(namespace, context, tags,
  ['Terminating'])
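The ephemeral-volume branch above mutates the pod spec dict in place, wiring each PVC into `spec.volumes` and the first container's `volumeMounts`. A reduced sketch of that transformation (the `EphemeralVolume` dataclass and all names are stand-ins, not SkyPilot's actual types):

```python
import dataclasses
from typing import List

@dataclasses.dataclass
class EphemeralVolume:  # hypothetical stand-in for the provider-config record
    name: str
    volume_name_on_cloud: str
    path: str

def attach_ephemeral_volumes(pod_spec: dict,
                             volumes: List[EphemeralVolume]) -> dict:
    """Add PVC-backed volumes and mount them into the first container."""
    for vol in volumes:
        pod_spec['spec'].setdefault('volumes', []).append({
            'name': vol.name,
            'persistentVolumeClaim': {'claimName': vol.volume_name_on_cloud},
        })
        container = pod_spec['spec']['containers'][0]
        container.setdefault('volumeMounts', []).append({
            'name': vol.name,
            'mountPath': vol.path,
        })
    return pod_spec

spec = {'spec': {'containers': [{'name': 'ray-node', 'image': 'alpine'}]}}
attach_ephemeral_volumes(
    spec, [EphemeralVolume('scratch', 'pvc-scratch-0', '/scratch')])
print(spec['spec']['volumes'])
print(spec['spec']['containers'][0]['volumeMounts'])
```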
@@ -770,8 +1004,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  running_pods = kubernetes_utils.filter_pods(namespace, context, tags,
  ['Pending', 'Running'])
  head_pod_name = _get_head_pod_name(running_pods)
+ running_pod_statuses = [{
+ pod.metadata.name: pod.status.phase
+ } for pod in running_pods.values()]
  logger.debug(f'Found {len(running_pods)} existing pods: '
- f'{list(running_pods.keys())}')
+ f'{running_pod_statuses}')

  to_start_count = config.count - len(running_pods)
  if to_start_count < 0:
@@ -787,7 +1024,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  nvidia_runtime_exists = False
  try:
  nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class(
- context)
+ context=context)
  except kubernetes.kubernetes.client.ApiException as e:
  logger.warning('run_instances: Error occurred while checking for '
  f'nvidia RuntimeClass - '
@@ -817,12 +1054,19 @@ def _create_pods(region: str, cluster_name_on_cloud: str,

  def _create_resource_thread(i: int):
  pod_spec_copy = copy.deepcopy(pod_spec)
- if head_pod_name is None and i == 0:
- # First pod should be head if no head exists
- pod_spec_copy['metadata']['labels'].update(constants.HEAD_NODE_TAGS)
- head_selector = _head_service_selector(cluster_name_on_cloud)
- pod_spec_copy['metadata']['labels'].update(head_selector)
- pod_spec_copy['metadata']['name'] = f'{cluster_name_on_cloud}-head'
+ # 0 is for head pod, while 1+ is for worker pods.
+ if i == 0:
+ if head_pod_name is None:
+ # First pod should be head if no head exists
+ pod_spec_copy['metadata']['labels'].update(
+ constants.HEAD_NODE_TAGS)
+ head_selector = _head_service_selector(cluster_name_on_cloud)
+ pod_spec_copy['metadata']['labels'].update(head_selector)
+ pod_spec_copy['metadata'][
+ 'name'] = f'{cluster_name_on_cloud}-head'
+ else:
+ # If head pod already exists, we skip creating it.
+ return
  else:
  # Worker pods
  pod_spec_copy['metadata']['labels'].update(
@@ -868,7 +1112,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  'podAffinityTerm': {
  'labelSelector': {
  'matchExpressions': [{
- 'key': k8s_constants.TAG_SKYPILOT_CLUSTER_NAME,
+ 'key': constants.TAG_SKYPILOT_CLUSTER_NAME,
  'operator': 'In',
  'values': [cluster_name_on_cloud]
  }]
@@ -963,9 +1207,16 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  'and then up the cluster again.')
  raise exceptions.InconsistentHighAvailabilityError(message)

- # Create pods in parallel
- created_resources = subprocess_utils.run_in_parallel(
- _create_resource_thread, list(range(to_start_count)), _NUM_THREADS)
+ created_resources = []
+ if to_start_count > 0:
+ # Create pods in parallel.
+ # Use `config.count` instead of `to_start_count` to keep the index of
+ # the Pods consistent especially for the case where some Pods are down
+ # due to node failure or manual termination, etc. and then launch
+ # again to create the Pods back.
+ # The existing Pods will be skipped in _create_resource_thread.
+ created_resources = subprocess_utils.run_in_parallel(
+ _create_resource_thread, list(range(config.count)), _NUM_THREADS)


  if to_create_deployment:
@@ -978,20 +1229,22 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  pods = created_resources

  created_pods = {}
+ valid_pods = []
  for pod in pods:
+ # In case Pod is not created
+ if pod is None:
+ continue
+ valid_pods.append(pod)
  created_pods[pod.metadata.name] = pod
  if head_pod_name is None and _is_head(pod):
  head_pod_name = pod.metadata.name
+ pods = valid_pods
+
+ # The running_pods may include Pending Pods, so we add them to the pods
+ # list to wait for scheduling and running
+ if running_pods:
+ pods = pods + list(running_pods.values())

- networking_mode = network_utils.get_networking_mode(
- config.provider_config.get('networking_mode'), context)
- if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
- # Adding the jump pod to the new_nodes list as well so it can be
- # checked if it's scheduled and running along with other pods.
- ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
- jump_pod = kubernetes.core_api(context).read_namespaced_pod(
- ssh_jump_pod_name, namespace)
- pods.append(jump_pod)
  provision_timeout = provider_config['timeout']

  wait_str = ('indefinitely'
@@ -1001,12 +1254,21 @@ def _create_pods(region: str, cluster_name_on_cloud: str,

  # Wait until the pods are scheduled and surface cause for error
  # if there is one
- _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout)
+ _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout,
+ cluster_name, create_pods_start)
+ # Reset spinner message here because it might have hinted autoscaling
+ # while waiting for pods to schedule.
+ rich_utils.force_update_status(
+ ux_utils.spinner_message('Launching', cluster_name=cluster_name))
  # Wait until the pods and their containers are up and running, and
  # fail early if there is an error
- logger.debug(f'run_instances: waiting for pods to be running (pulling '
- f'images): {[pod.metadata.name for pod in pods]}')
- _wait_for_pods_to_run(namespace, context, pods)
+ logger.debug(f'run_instances: waiting for pods to be running: '
+ f'{[pod.metadata.name for pod in pods]}')
+ _wait_for_pods_to_run(namespace, context, cluster_name, pods)
+ # Reset spinner message here because it might have hinted the reason
+ # pods were pending.
+ rich_utils.force_update_status(
+ ux_utils.spinner_message('Launching', cluster_name=cluster_name))
  logger.debug(f'run_instances: all pods are scheduled and running: '
  f'{[pod.metadata.name for pod in pods]}')

@@ -1022,11 +1284,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  )


- def run_instances(region: str, cluster_name_on_cloud: str,
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
  config: common.ProvisionConfig) -> common.ProvisionRecord:
  """Runs instances for the given cluster."""
  try:
- return _create_pods(region, cluster_name_on_cloud, config)
+ return _create_pods(region, cluster_name, cluster_name_on_cloud, config)
  except (kubernetes.api_exception(), config_lib.KubernetesError) as e:
  e_msg = common_utils.format_exception(e).replace('\n', ' ')
  logger.warning('run_instances: Error occurred when creating pods: '
@@ -1150,18 +1412,6 @@ def terminate_instances(
  ray_tag_filter(cluster_name_on_cloud),
  None)

- # Clean up the SSH jump pod if in use
- networking_mode = network_utils.get_networking_mode(
- provider_config.get('networking_mode'), context)
- if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
- pod_name = list(pods.keys())[0]
- try:
- kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context,
- pod_name)
- except Exception as e: # pylint: disable=broad-except
- logger.warning('terminate_instances: Error occurred when analyzing '
- f'SSH Jump pod: {e}')
-
  if is_high_availability_cluster_by_kubectl(cluster_name_on_cloud, context,
  namespace):
  # For high availability controllers, terminate the deployment
@@ -1192,19 +1442,11 @@ def get_cluster_info(

  running_pods = kubernetes_utils.filter_pods(
  namespace, context, ray_tag_filter(cluster_name_on_cloud), ['Running'])
+ logger.debug(f'Running pods: {list(running_pods.keys())}')

  pods: Dict[str, List[common.InstanceInfo]] = {}
  head_pod_name = None

- port_forward_mode = kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD
- network_mode_str = skypilot_config.get_effective_region_config(
- cloud='kubernetes',
- region=context,
- keys=('networking_mode',),
- default_value=port_forward_mode.value)
- network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
- network_mode_str)
- external_ip = kubernetes_utils.get_external_ip(network_mode, context)
  port = 22
  if not provider_config.get('use_internal_ips', False):
  port = kubernetes_utils.get_head_ssh_port(cluster_name_on_cloud,
@@ -1218,10 +1460,12 @@ def get_cluster_info(
              common.InstanceInfo(
                  instance_id=pod_name,
                  internal_ip=internal_ip,
-                 external_ip=(None if network_mode == port_forward_mode else
-                              external_ip),
+                 external_ip=None,
                  ssh_port=port,
                  tags=pod.metadata.labels,
+                 # TODO(hailong): `cluster.local` may need to be configurable
+                 # Service name is the same as the pod name for now.
+                 internal_svc=f'{pod_name}.{namespace}.svc.cluster.local',
              )
          ]
          if _is_head(pod):
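The `internal_svc` value added above follows the standard Kubernetes DNS convention for Services, `<service>.<namespace>.svc.<cluster-domain>`; since the per-pod Service is named after the pod, the pod name doubles as the service name. A worked example (the helper name is illustrative):

    # `cluster.local` is the default cluster domain; as the TODO notes, a
    # cluster may be configured with a different domain.
    def internal_svc_fqdn(pod_name: str,
                          namespace: str,
                          cluster_domain: str = 'cluster.local') -> str:
        return f'{pod_name}.{namespace}.svc.{cluster_domain}'

    assert (internal_svc_fqdn('mycluster-head', 'default') ==
            'mycluster-head.default.svc.cluster.local')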
@@ -1230,10 +1474,16 @@ def get_cluster_info(
      assert head_spec is not None, pod
      cpu_request = head_spec.containers[0].resources.requests['cpu']

-     assert cpu_request is not None, 'cpu_request should not be None'
+     if cpu_request is None:
+         raise RuntimeError(f'Pod {cluster_name_on_cloud}-head not found'
+                            ' or not Running, check the Pod status')

      ssh_user = 'sky'
-     get_k8s_ssh_user_cmd = 'echo $(whoami)'
+     # Use pattern matching to extract SSH user, handling MOTD contamination.
+     # Some container images (like CUDA-Q) print MOTD when login shells start,
+     # which can contaminate command output. We use a unique pattern to extract
+     # the actual username reliably.
+     get_k8s_ssh_user_cmd = 'echo "SKYPILOT_SSH_USER: $(whoami)"'
      assert head_pod_name is not None
      runner = command_runner.KubernetesCommandRunner(
          ((namespace, context), head_pod_name))
@@ -1243,10 +1493,24 @@ def get_cluster_info(
                                      stream_logs=False)
      _raise_command_running_error('get ssh user', get_k8s_ssh_user_cmd,
                                   head_pod_name, rc, stdout + stderr)
-     ssh_user = stdout.strip()
+
+     # Extract SSH user using pattern matching
+     ssh_user_match = _SSH_USER_PATTERN.search(stdout)
+     if ssh_user_match:
+         ssh_user = ssh_user_match.group(1)
+     else:
+         raise ValueError('Failed to find SSH user identifier: '
+                          f'{stdout + stderr}')
      logger.debug(
          f'Using ssh user {ssh_user} for cluster {cluster_name_on_cloud}')

+     # cpu_request may be a string like `100m`, need to parse and convert
+     num_cpus = kubernetes_utils.parse_cpu_or_gpu_resource_to_float(cpu_request)
+     # 'num-cpus' for ray must be an integer, but we should not set it to 0 if
+     # cpus is <1.
+     # Keep consistent with the logic in clouds/kubernetes.py
+     str_cpus = str(max(int(num_cpus), 1))
+
      return common.ClusterInfo(
          instances=pods,
          head_instance_id=head_pod_name,
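`_SSH_USER_PATTERN` is defined elsewhere in this file; a sketch of what it plausibly looks like, given the `echo "SKYPILOT_SSH_USER: $(whoami)"` command added in the previous hunk (the exact regex is an assumption):

    import re

    # Assumed pattern: capture the token following the unique marker, so MOTD
    # noise printed by the container's login shell cannot contaminate it.
    _SSH_USER_PATTERN = re.compile(r'SKYPILOT_SSH_USER: (\S+)')

    stdout = 'Welcome to the CUDA-Q container!\nSKYPILOT_SSH_USER: sky\n'
    match = _SSH_USER_PATTERN.search(stdout)
    assert match is not None and match.group(1) == 'sky'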
@@ -1256,16 +1520,52 @@ def get_cluster_info(
          # problems for other pods.
          custom_ray_options={
              'object-store-memory': 500000000,
-             'num-cpus': cpu_request,
+             'num-cpus': str_cpus,
          },
          provider_name='kubernetes',
          provider_config=provider_config)


  def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
-     """Get pod termination reason and write to cluster events."""
-     reasons = []
+     """Get pod termination reason and write to cluster events.
+
+     Checks both pod conditions (for preemption/disruption) and
+     container statuses (for exit codes/errors).
+     """
      latest_timestamp = pod.status.start_time or datetime.datetime.min
+     ready_state = 'Unknown'
+     termination_reason = 'Terminated unexpectedly'
+     container_reasons = []
+
+     # Check pod status conditions for high level overview.
+     # No need to sort, as each condition.type will only appear once.
+     for condition in pod.status.conditions:
+         reason = condition.reason or 'Unknown reason'
+         message = condition.message or ''
+
+         # Get last known readiness state.
+         if condition.type == 'Ready':
+             ready_state = f'{reason} ({message})' if message else reason
+         # Kueue preemption, as defined in:
+         # https://pkg.go.dev/sigs.k8s.io/kueue/pkg/controller/jobs/pod#pkg-constants
+         elif condition.type == 'TerminationTarget':
+             termination_reason = f'Preempted by Kueue: {reason}'
+             if message:
+                 termination_reason += f' ({message})'
+         # Generic disruption.
+         elif condition.type == 'DisruptionTarget':
+             termination_reason = f'Disrupted: {reason}'
+             if message:
+                 termination_reason += f' ({message})'
+
+         if condition.last_transition_time is not None:
+             latest_timestamp = max(latest_timestamp,
+                                    condition.last_transition_time)
+
+     pod_reason = (f'{termination_reason}.\n'
+                   f'Last known state: {ready_state}.')
+
+     # Check container statuses for exit codes/errors
      if pod.status and pod.status.container_statuses:
          for container_status in pod.status.container_statuses:
              terminated = container_status.state.terminated
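The `num-cpus` conversion above can be traced by hand. Kubernetes CPU quantities may carry an `m` (millicore) suffix, so `100m` means 0.1 CPU; Ray needs an integer, and 0 would make the node unschedulable, hence the clamp to at least 1. A sketch of the conversion (SkyPilot's `parse_cpu_or_gpu_resource_to_float` is assumed to behave like this for CPU values):

    # Sketch under the standard Kubernetes quantity rules: 'm' = millicores.
    def parse_cpu_to_float(cpu_request: str) -> float:
        if cpu_request.endswith('m'):
            return int(cpu_request[:-1]) / 1000
        return float(cpu_request)

    # '100m' -> 0.1 -> int() gives 0 -> clamped to 1; '2500m' -> 2.5 -> '2'.
    for req, expected in [('100m', '1'), ('2', '2'), ('2500m', '2')]:
        num_cpus = parse_cpu_to_float(req)
        assert str(max(int(num_cpus), 1)) == expected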
@@ -1280,18 +1580,15 @@ def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
                  if reason is None:
                      # just in case reason is None, have default for debugging
                      reason = f'exit({exit_code})'
-                 reasons.append(reason)
-                 if terminated.finished_at > latest_timestamp:
-                     latest_timestamp = terminated.finished_at
+                 container_reasons.append(reason)
+                 latest_timestamp = max(latest_timestamp, terminated.finished_at)

      # TODO (kyuds): later, if needed, query `last_state` too.

-     if not reasons:
-         return ''
-
      # Normally we will have a single container per pod for skypilot
      # but doing this just in case there are multiple containers.
-     pod_reason = ' | '.join(reasons)
+     if container_reasons:
+         pod_reason += f'\nContainer errors: {" | ".join(container_reasons)}'

      global_user_state.add_cluster_event(
          cluster_name,
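The container-status fields read in this hunk come from the official Python client's `V1ContainerStateTerminated` object (`exit_code`, `reason`, `message`, `finished_at`). A self-contained sketch of collecting per-container reasons the same way (the helper name is illustrative):

    # Field names match the kubernetes Python client; the pod object is
    # whatever list_namespaced_pod / read_namespaced_pod returned.
    from typing import Any, List

    def container_termination_reasons(pod: Any) -> List[str]:
        reasons = []
        for status in (pod.status.container_statuses or []):
            terminated = status.state.terminated
            if terminated is None:
                continue
            reason = terminated.reason or f'exit({terminated.exit_code})'
            if terminated.message:
                reason += f': {terminated.message}'
            reasons.append(reason)
        return reasons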
@@ -1303,21 +1600,56 @@ def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
      return pod_reason


- def _get_pod_missing_reason(context: Optional[str], namespace: str,
-                             cluster_name: str, pod_name: str) -> Optional[str]:
-     """Get events for missing pod and write to cluster events."""
-     logger.debug(f'Analyzing events for pod {pod_name}')
+ def _get_pod_events(context: Optional[str], namespace: str,
+                     pod_name: str) -> List[Any]:
+     """Get the events for a pod, sorted by timestamp, most recent first."""
      pod_field_selector = (
          f'involvedObject.kind=Pod,involvedObject.name={pod_name}')
      pod_events = kubernetes.core_api(context).list_namespaced_event(
          namespace,
          field_selector=pod_field_selector,
          _request_timeout=kubernetes.API_TIMEOUT).items
-     pod_events = sorted(
+     return sorted(
          pod_events,
          key=lambda event: event.metadata.creation_timestamp,
          # latest event appears first
          reverse=True)
+
+
+ def _get_pod_pending_reason(context: Optional[str], namespace: str,
+                             pod_name: str) -> Optional[Tuple[str, str]]:
+     """Get the reason why a pod is pending from its events.
+
+     Returns a (reason, message) tuple about why the pod is pending (e.g.,
+     ("FailedMount", "hostPath type check failed")) or None if no reason found.
+     """
+     try:
+         pod_events = _get_pod_events(context, namespace, pod_name)
+     except Exception as e:  # pylint: disable=broad-except
+         logger.debug(f'Failed to get events for pod {pod_name}: {e}')
+         return None
+
+     if not pod_events:
+         return None
+
+     for event in pod_events:
+         # Omit common events that do not indicate a pending reason.
+         # We could also filter by event type 'Warning' or 'Error',
+         # but there might be useful 'Normal' events such as pulling
+         # an image that we want to surface to the user.
+         if event.reason not in COMMON_NON_PENDING_EVENT_REASONS:
+             reason = event.reason or 'Unknown'
+             message = event.message or ''
+             return reason, message
+
+     return None
+
+
+ def _get_pod_missing_reason(context: Optional[str], namespace: str,
+                             cluster_name: str, pod_name: str) -> Optional[str]:
+     """Get events for missing pod and write to cluster events."""
+     logger.debug(f'Analyzing events for pod {pod_name}')
+     pod_events = _get_pod_events(context, namespace, pod_name)
      last_scheduled_node = None
      insert_new_pod_event = True
      new_event_inserted = False
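A standalone sketch of the event query `_get_pod_events` performs, using the official `kubernetes` client directly (`kubernetes.core_api` in the diff is SkyPilot's cached wrapper around `CoreV1Api`; the function name below is illustrative):

    from kubernetes import client, config

    def get_pod_events(namespace: str, pod_name: str) -> list:
        config.load_kube_config()
        v1 = client.CoreV1Api()
        events = v1.list_namespaced_event(
            namespace,
            field_selector=(f'involvedObject.kind=Pod,'
                            f'involvedObject.name={pod_name}')).items
        # Most recent first, matching the sort order in the hunk above.
        return sorted(events,
                      key=lambda e: e.metadata.creation_timestamp,
                      reverse=True)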
@@ -1436,35 +1768,50 @@ def _get_pod_missing_reason(context: Optional[str], namespace: str,
      return failure_reason


- def query_instances(
-     cluster_name: str,
-     cluster_name_on_cloud: str,
-     provider_config: Optional[Dict[str, Any]] = None,
-     non_terminated_only: bool = True
- ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
-     # Mapping from pod phase to skypilot status. These are the only valid pod
-     # phases.
-     # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
-     status_map = {
-         'Pending': status_lib.ClusterStatus.INIT,
-         'Running': status_lib.ClusterStatus.UP,
-         'Failed': status_lib.ClusterStatus.INIT,
-         'Unknown': None,
-         'Succeeded': None,
-     }
-
-     assert provider_config is not None
-     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
-     context = kubernetes_utils.get_context_from_config(provider_config)
-     is_ssh = context.startswith('ssh-') if context else False
-     identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'
-
-     # Get all the pods with the label skypilot-cluster: <cluster_name>
+ def list_namespaced_pod(context: Optional[str], namespace: str,
+                         cluster_name_on_cloud: str, is_ssh: bool, identity: str,
+                         label_selector: str) -> List[Any]:
+     # Get all the pods with the label skypilot-cluster-name: <cluster_name>
      try:
-         pods = kubernetes.core_api(context).list_namespaced_pod(
+         # log the query parameters we pass to the k8s api
+         logger.debug(f'Querying k8s api for pods:\n'
+                      f'context: {context}\n'
+                      f'namespace: {namespace}\n'
+                      f'label selector: `{label_selector}`.')
+
+         response = kubernetes.core_api(context).list_namespaced_pod(
              namespace,
-             label_selector=f'skypilot-cluster={cluster_name_on_cloud}',
-             _request_timeout=kubernetes.API_TIMEOUT).items
+             label_selector=label_selector,
+             _request_timeout=kubernetes.API_TIMEOUT)
+
+         # log PodList response info
+         if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+             logger.debug(f'k8s api response for `{label_selector}`:\n'
+                          f'apiVersion={response.api_version}, '
+                          f'kind={response.kind},\n'
+                          f'metadata={response.metadata}')
+
+         pods = response.items
+
+         # log detailed Pod info
+         if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
+             logger.debug(f'k8s api response for `{label_selector}`: '
+                          f'len(pods)={len(pods)}')
+             for pod in pods:
+                 logger.debug(f'k8s pod info for `{label_selector}`: '
+                              f'pod.apiVersion={pod.api_version}, '
+                              f'pod.kind={pod.kind}, \n'
+                              f'pod.name={pod.metadata.name}, '
+                              f'pod.namespace={pod.metadata.namespace}, \n'
+                              f'pod.labels={pod.metadata.labels}, \n'
+                              f'pod.annotations={pod.metadata.annotations}, \n'
+                              'pod.creationTimestamp='
+                              f'{pod.metadata.creation_timestamp}, '
+                              'pod.deletionTimestamp='
+                              f'{pod.metadata.deletion_timestamp}, \n'
+                              f'pod.status={pod.status}')
+         return pods
+
      except kubernetes.max_retry_error():
          with ux_utils.print_exception_no_traceback():
              if is_ssh:
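The `sky_logging.logging_enabled(...)` guards above avoid building the expensive debug strings (full `pod.status` dumps) unless DEBUG logging is actually on. The same pattern with the standard library, for reference (`sky_logging` is SkyPilot-internal; stdlib `isEnabledFor` is the equivalent check):

    import logging

    logger = logging.getLogger('sky.provision.kubernetes.instance')

    def log_pods_verbose(pods) -> None:
        # Skip the loop (and the large f-strings) entirely unless the
        # DEBUG level is enabled, mirroring the guards in the hunk above.
        if logger.isEnabledFor(logging.DEBUG):
            for pod in pods:
                logger.debug('pod %s status: %s', pod.metadata.name,
                             pod.status)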
@@ -1488,14 +1835,63 @@ def query_instances(
                  f'Failed to query {identity} {cluster_name_on_cloud!r} '
                  f'status: {common_utils.format_exception(e)}')

+
+ def query_instances(
+     cluster_name: str,
+     cluster_name_on_cloud: str,
+     provider_config: Optional[Dict[str, Any]] = None,
+     non_terminated_only: bool = True,
+     retry_if_missing: bool = False,
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
+     # Mapping from pod phase to skypilot status. These are the only valid pod
+     # phases.
+     # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
+     status_map = {
+         'Pending': status_lib.ClusterStatus.INIT,
+         'Running': status_lib.ClusterStatus.UP,
+         'Failed': status_lib.ClusterStatus.INIT,
+         'Unknown': None,
+         'Succeeded': None,
+     }
+
+     assert provider_config is not None
+     namespace = kubernetes_utils.get_namespace_from_config(provider_config)
+     context = kubernetes_utils.get_context_from_config(provider_config)
+     is_ssh = context.startswith('ssh-') if context else False
+     identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'
+     label_selector = (f'{constants.TAG_SKYPILOT_CLUSTER_NAME}='
+                       f'{cluster_name_on_cloud}')
+
+     attempts = 0
+     pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
+                                is_ssh, identity, label_selector)
+     # When we see no pods returned from the k8s api, we assume the pods have
+     # been terminated by the user directly and mark the cluster as terminated
+     # in the global user state.
+     # We add retry logic here as an attempt to mitigate a leak caused by the
+     # kubernetes api returning no pods despite the pods actually existing.
+     while (retry_if_missing and not pods and
+            attempts < _MAX_QUERY_INSTANCES_RETRIES):
+         logger.debug(f'Retrying to query k8s api for {cluster_name_on_cloud} '
+                      f'{attempts}/{_MAX_QUERY_INSTANCES_RETRIES} times, '
+                      f'after {_QUERY_INSTANCES_RETRY_INTERVAL} seconds.')
+         time.sleep(_QUERY_INSTANCES_RETRY_INTERVAL)
+         attempts += 1
+         pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
+                                    is_ssh, identity, label_selector)
+         if len(pods) > 0:
+             logger.info(f'Found {len(pods)} pods for {label_selector} after '
+                         f'{attempts} retries.')
+
      # Check if the pods are running or pending
      cluster_status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
                                      Optional[str]]] = {}
      for pod in pods:
          phase = pod.status.phase
+         is_terminating = pod.metadata.deletion_timestamp is not None
          pod_status = status_map[phase]
          reason = None
-         if phase in ('Failed', 'Unknown'):
+         if phase in ('Failed', 'Unknown') or is_terminating:
              reason = _get_pod_termination_reason(pod, cluster_name)
              logger.debug(f'Pod Status ({phase}) Reason(s): {reason}')
          if non_terminated_only and pod_status is None: