skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
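If you want to reproduce or spot-check a comparison like this locally, the sketch below shows one way to do it with only the Python standard library. It assumes both wheel files have already been downloaded (for example via `pip download skypilot-nightly==<version> --no-deps`); the `wheel_diff` helper and the exact wheel filenames are illustrative, not part of any registry tooling.

import difflib
import zipfile


def wheel_diff(old_wheel: str, new_wheel: str, member: str) -> str:
    """Return a unified diff of one member file between two wheel archives.

    Wheels are ordinary zip files, so zipfile is enough to pull the same
    path out of each archive and compare the two versions line by line.
    """

    def read_member(path: str) -> list:
        with zipfile.ZipFile(path) as zf:
            return zf.read(member).decode('utf-8').splitlines(keepends=True)

    return ''.join(
        difflib.unified_diff(read_member(old_wheel),
                             read_member(new_wheel),
                             fromfile=f'{old_wheel}:{member}',
                             tofile=f'{new_wheel}:{member}'))


# Example: compare the Kubernetes provisioner between the two nightlies.
print(
    wheel_diff('skypilot_nightly-1.0.0.dev20250509-py3-none-any.whl',
               'skypilot_nightly-1.0.0.dev20251107-py3-none-any.whl',
               'sky/provision/kubernetes/instance.py'))

Note this only works for files present in both wheels; renamed files (such as the sky/clouds/service_catalog → sky/catalog moves listed below) have to be read from their old and new paths respectively.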

This release has been flagged as potentially problematic.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/provision/kubernetes/instance.py +789 -247
@@ -1,11 +1,14 @@
  """Kubernetes instance provisioning."""
  import copy
+ import datetime
  import json
+ import re
+ import sys
  import time
- from typing import Any, Callable, Dict, List, Optional, Union
- import uuid
+ from typing import Any, Dict, List, Optional, Tuple, Union

  from sky import exceptions
+ from sky import global_user_state
  from sky import sky_logging
  from sky import skypilot_config
  from sky.adaptors import kubernetes
@@ -13,31 +16,36 @@ from sky.provision import common
  from sky.provision import constants
  from sky.provision import docker_utils
  from sky.provision.kubernetes import config as config_lib
- from sky.provision.kubernetes import network_utils
+ from sky.provision.kubernetes import constants as k8s_constants
  from sky.provision.kubernetes import utils as kubernetes_utils
+ from sky.provision.kubernetes import volume
  from sky.utils import command_runner
  from sky.utils import common_utils
  from sky.utils import config_utils
  from sky.utils import kubernetes_enums
+ from sky.utils import rich_utils
  from sky.utils import status_lib
  from sky.utils import subprocess_utils
  from sky.utils import timeline
  from sky.utils import ux_utils
+ from sky.utils.db import db_utils

  POLL_INTERVAL = 2
  _TIMEOUT_FOR_POD_TERMINATION = 60 # 1 minutes
  _MAX_RETRIES = 3
+ _MAX_MISSING_PODS_RETRIES = 5
+ _MAX_QUERY_INSTANCES_RETRIES = 5
+ _QUERY_INSTANCES_RETRY_INTERVAL = .5
  _NUM_THREADS = subprocess_utils.get_parallel_threads('kubernetes')

+ # Pattern to extract SSH user from command output, handling MOTD contamination
+ _SSH_USER_PATTERN = re.compile(r'SKYPILOT_SSH_USER: ([^\s\n]+)')
+
  logger = sky_logging.init_logger(__name__)
- TAG_RAY_CLUSTER_NAME = 'ray-cluster-name'
- TAG_SKYPILOT_CLUSTER_NAME = 'skypilot-cluster-name'
- TAG_POD_INITIALIZED = 'skypilot-initialized'
- TAG_SKYPILOT_DEPLOYMENT_NAME = 'skypilot-deployment-name'


  def ray_tag_filter(cluster_name: str) -> Dict[str, str]:
- return {TAG_RAY_CLUSTER_NAME: cluster_name}
+ return {k8s_constants.TAG_RAY_CLUSTER_NAME: cluster_name}


  def _is_head(pod) -> bool:
@@ -67,12 +75,16 @@ def is_high_availability_cluster_by_kubectl(
  namespace: Optional[str] = None) -> bool:
  """Check if a cluster is a high availability controller by calling
  `kubectl get deployment`.
+
+ The deployment must have the label `skypilot-cluster-name` set to
+ `cluster_name`.
  """
  try:
  deployment_list = kubernetes.apps_api(
  context).list_namespaced_deployment(
  namespace,
- label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
+ label_selector=
+ f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}')
  except kubernetes.api_exception():
  return False
  # It is a high availability cluster if there is at least one deployment
@@ -186,14 +198,20 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
  break
  if event_message is not None:
  if pod_status == 'Pending':
- logger.info(event_message)
+ out_of = {}
+ # key: resource name, value: (extra message, nice name)
  if 'Insufficient cpu' in event_message:
- raise config_lib.KubernetesError(
- _lack_resource_msg('CPU', pod, details=event_message))
+ out_of['CPU'] = (': Run \'kubectl get nodes -o '
+ 'custom-columns=NAME:.metadata.name,'
+ 'CPU:.status.allocatable.cpu\' to check '
+ 'the available CPUs on the node.', 'CPUs')
  if 'Insufficient memory' in event_message:
- raise config_lib.KubernetesError(
- _lack_resource_msg('memory', pod,
- details=event_message))
+ out_of['memory'] = (': Run \'kubectl get nodes -o '
+ 'custom-columns=NAME:.metadata.name,'
+ 'MEMORY:.status.allocatable.memory\' '
+ 'to check the available memory on the '
+ 'node.', 'Memory')
+
  # TODO(aylei): after switching from smarter-device-manager to
  # fusermount-server, we need a new way to check whether the
  # fusermount-server daemonset is ready.
@@ -201,43 +219,79 @@ def _raise_pod_scheduling_errors(namespace, context, new_nodes):
  key for lf in kubernetes_utils.LABEL_FORMATTER_REGISTRY
  for key in lf.get_label_keys()
  ]
- if pod.spec.node_selector:
- for label_key in pod.spec.node_selector.keys():
- if label_key in gpu_lf_keys:
- # TODO(romilb): We may have additional node
- # affinity selectors in the future - in that
- # case we will need to update this logic.
- # TODO(Doyoung): Update the error message raised
- # with the multi-host TPU support.
- gpu_resource_key = kubernetes_utils.get_gpu_resource_key() # pylint: disable=line-too-long
- if 'Insufficient google.com/tpu' in event_message:
- extra_msg = (
- f'Verify if '
- f'{pod.spec.node_selector[label_key]}'
- ' is available in the cluster. Note '
- 'that multi-host TPU podslices are '
- 'currently not unsupported.')
- raise config_lib.KubernetesError(
- _lack_resource_msg('TPU',
- pod,
- extra_msg,
- details=event_message))
- elif ((f'Insufficient {gpu_resource_key}'
- in event_message) or
- ('didn\'t match Pod\'s node affinity/selector'
- in event_message)):
- extra_msg = (
- f'Verify if any node matching label '
- f'{pod.spec.node_selector[label_key]} and '
- f'sufficient resource {gpu_resource_key} '
- f'is available in the cluster.')
- raise config_lib.KubernetesError(
- _lack_resource_msg('GPU',
- pod,
- extra_msg,
- details=event_message))
+ for label_key in gpu_lf_keys:
+ # TODO(romilb): We may have additional node
+ # affinity selectors in the future - in that
+ # case we will need to update this logic.
+ # TODO(Doyoung): Update the error message raised
+ # with the multi-host TPU support.
+ gpu_resource_key = kubernetes_utils.get_gpu_resource_key(
+ context) # pylint: disable=line-too-long
+ if ((f'Insufficient {gpu_resource_key}' in event_message) or
+ ('didn\'t match Pod\'s node affinity/selector'
+ in event_message) and pod.spec.node_selector):
+ if 'gpu' in gpu_resource_key.lower():
+ info_msg = (
+ ': Run \'sky show-gpus --infra kubernetes\' to '
+ 'see the available GPUs.')
+ else:
+ info_msg = ': '
+ if (pod.spec.node_selector and
+ label_key in pod.spec.node_selector):
+ extra_msg = (
+ f'Verify if any node matching label '
+ f'{pod.spec.node_selector[label_key]} and '
+ f'sufficient resource {gpu_resource_key} '
+ f'is available in the cluster.')
+ extra_msg = info_msg + ' ' + extra_msg
+ else:
+ extra_msg = info_msg
+ if gpu_resource_key not in out_of or len(
+ out_of[gpu_resource_key][0]) < len(extra_msg):
+ out_of[f'{gpu_resource_key}'] = (extra_msg, 'GPUs')
+
+ if len(out_of) > 0:
+ # We are out of some resources. We should raise an error.
+ rsrc_err_msg = 'Insufficient resource capacity on the '
+ rsrc_err_msg += 'cluster:\n'
+ out_of_keys = list(out_of.keys())
+ for i in range(len(out_of_keys)):
+ rsrc = out_of_keys[i]
+ (extra_msg, nice_name) = out_of[rsrc]
+ extra_msg = extra_msg if extra_msg else ''
+ if i == len(out_of_keys) - 1:
+ indent = '└──'
+ else:
+ indent = '├──'
+ rsrc_err_msg += (f'{indent} Cluster does not have '
+ f'sufficient {nice_name} for your request'
+ f'{extra_msg}')
+ if i != len(out_of_keys) - 1:
+ rsrc_err_msg += '\n'
+
+ # Emit the error message without logging prefixes for better UX.
+ tmp_handler = sky_logging.EnvAwareHandler(sys.stdout)
+ tmp_handler.flush = sys.stdout.flush
+ tmp_handler.setFormatter(sky_logging.NO_PREFIX_FORMATTER)
+ tmp_handler.setLevel(sky_logging.ERROR)
+ prev_propagate = logger.propagate
+ try:
+ logger.addHandler(tmp_handler)
+ logger.propagate = False
+ logger.error(ux_utils.error_message(f'{rsrc_err_msg}'))
+ finally:
+ logger.removeHandler(tmp_handler)
+ logger.propagate = prev_propagate
+ nice_names = [out_of[rsrc][1] for rsrc in out_of_keys]
+ raise config_lib.KubernetesError(
+ f'{timeout_err_msg} '
+ f'Pod status: {pod_status} '
+ f'Details: \'{event_message}\' ',
+ insufficent_resources=nice_names,
+ )
+
  raise config_lib.KubernetesError(f'{timeout_err_msg} '
- f'Pod status: {pod_status}'
+ f'Pod status: {pod_status} '
  f'Details: \'{event_message}\' ')
  raise config_lib.KubernetesError(f'{timeout_err_msg}')

@@ -251,8 +305,89 @@ def _raise_command_running_error(message: str, command: str, pod_name: str,
  f'code {rc}: {command!r}\nOutput: {stdout}.')


+ def _detect_cluster_event_reason_occurred(namespace, context, search_start,
+ reason) -> bool:
+
+ def _convert_to_utc(timestamp):
+ if timestamp.tzinfo is None:
+ return timestamp.replace(tzinfo=datetime.timezone.utc)
+ return timestamp.astimezone(datetime.timezone.utc)
+
+ def _get_event_timestamp(event):
+ if event.last_timestamp:
+ return event.last_timestamp
+ elif event.metadata.creation_timestamp:
+ return event.metadata.creation_timestamp
+ return None
+
+ events = kubernetes.core_api(context).list_namespaced_event(
+ namespace=namespace, field_selector=f'reason={reason}')
+ for event in events.items:
+ ts = _get_event_timestamp(event)
+ if ts and _convert_to_utc(ts) > search_start:
+ return True
+ return False
+
+
+ def _cluster_had_autoscale_event(namespace, context, search_start) -> bool:
+ """Detects whether the cluster had a autoscaling event after a
+ specified datetime. This only works when using cluster-autoscaler.
+
+ Args:
+ namespace: kubernetes namespace
+ context: kubernetes context
+ search_start (datetime.datetime): filter for events that occurred
+ after search_start
+
+ Returns:
+ A boolean whether the cluster has an autoscaling event or not.
+ """
+ assert namespace is not None
+
+ try:
+ return _detect_cluster_event_reason_occurred(namespace, context,
+ search_start,
+ 'TriggeredScaleUp')
+ except Exception as e: # pylint: disable=broad-except
+ logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
+ return False
+
+
+ def _cluster_maybe_autoscaling(namespace, context, search_start) -> bool:
+ """Detects whether a kubernetes cluster may have an autoscaling event.
+
+ This is not a definitive detection. FailedScheduling, which is an
+ event that can occur when not enough resources are present in the cluster,
+ which is a trigger for cluster autoscaling. However, FailedScheduling may
+ have occurred due to other reasons (cluster itself is abnormal).
+
+ Hence, this should only be used for autoscalers that don't emit the
+ TriggeredScaleUp event, e.g.: Karpenter.
+
+ Args:
+ namespace: kubernetes namespace
+ context: kubernetes context
+ search_start (datetime.datetime): filter for events that occurred
+ after search_start
+
+ Returns:
+ A boolean whether the cluster has an autoscaling event or not.
+ """
+ assert namespace is not None
+
+ try:
+ return _detect_cluster_event_reason_occurred(namespace, context,
+ search_start,
+ 'FailedScheduling')
+ except Exception as e: # pylint: disable=broad-except
+ logger.debug(f'Error occurred while detecting cluster autoscaler: {e}')
+ return False
+
+
  @timeline.event
- def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
+ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int,
+ cluster_name: str,
+ create_pods_start: datetime.datetime):
  """Wait for all pods to be scheduled.

  Wait for all pods including jump pod to be scheduled, and if it
@@ -261,6 +396,9 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
  allocated and we can exit.

  If timeout is set to a negative value, this method will wait indefinitely.
+
+ Will update the spinner message to indicate autoscaling if autoscaling
+ is happening.
  """
  # Create a set of pod names we're waiting for
  if not new_nodes:
@@ -268,6 +406,18 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
  expected_pod_names = {node.metadata.name for node in new_nodes}
  start_time = time.time()

+ # Variables for autoscaler detection
+ autoscaler_type = skypilot_config.get_effective_region_config(
+ cloud='kubernetes',
+ region=context,
+ keys=('autoscaler',),
+ default_value=None)
+ autoscaler_is_set = autoscaler_type is not None
+ use_heuristic_detection = (autoscaler_is_set and
+ not kubernetes_enums.KubernetesAutoscalerType(
+ autoscaler_type).emits_autoscale_event())
+ is_autoscaling = False
+
  def _evaluate_timeout() -> bool:
  # If timeout is negative, retry indefinitely.
  if timeout < 0:
@@ -277,10 +427,13 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):
  while _evaluate_timeout():
  # Get all pods in a single API call using the cluster name label
  # which all pods in new_nodes should share
- cluster_name = new_nodes[0].metadata.labels[TAG_SKYPILOT_CLUSTER_NAME]
+ cluster_name_on_cloud = new_nodes[0].metadata.labels[
+ constants.TAG_SKYPILOT_CLUSTER_NAME]
  pods = kubernetes.core_api(context).list_namespaced_pod(
  namespace,
- label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
+ label_selector=
+ f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
+ ).items

  # Get the set of found pod names and check if we have all expected pods
  found_pod_names = {pod.metadata.name for pod in pods}
@@ -304,6 +457,26 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):

  if all_scheduled:
  return
+
+ # Check if cluster is autoscaling and update spinner message.
+ # Minor optimization to not query k8s api after autoscaling
+ # event was detected. This is useful because there isn't any
+ # autoscaling complete event.
+ if autoscaler_is_set and not is_autoscaling:
+ if use_heuristic_detection:
+ is_autoscaling = _cluster_maybe_autoscaling(
+ namespace, context, create_pods_start)
+ msg = 'Kubernetes cluster may be scaling up'
+ else:
+ is_autoscaling = _cluster_had_autoscale_event(
+ namespace, context, create_pods_start)
+ msg = 'Kubernetes cluster is autoscaling'
+
+ if is_autoscaling:
+ rich_utils.force_update_status(
+ ux_utils.spinner_message(f'Launching ({msg})',
+ cluster_name=cluster_name))
+
  time.sleep(1)

  # Handle pod scheduling errors
@@ -319,17 +492,17 @@ def _wait_for_pods_to_schedule(namespace, context, new_nodes, timeout: int):


  @timeline.event
- def _wait_for_pods_to_run(namespace, context, new_nodes):
+ def _wait_for_pods_to_run(namespace, context, cluster_name, new_pods):
  """Wait for pods and their containers to be ready.

  Pods may be pulling images or may be in the process of container
  creation.
  """
- if not new_nodes:
+ if not new_pods:
  return

  # Create a set of pod names we're waiting for
- expected_pod_names = {node.metadata.name for node in new_nodes}
+ expected_pod_names = {pod.metadata.name for pod in new_pods}

  def _check_init_containers(pod):
  # Check if any of the init containers failed
@@ -356,26 +529,62 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
  'Failed to create init container for pod '
  f'{pod.metadata.name}. Error details: {msg}.')

+ missing_pods_retry = 0
  while True:
  # Get all pods in a single API call
- cluster_name = new_nodes[0].metadata.labels[TAG_SKYPILOT_CLUSTER_NAME]
+ cluster_name_on_cloud = new_pods[0].metadata.labels[
+ constants.TAG_SKYPILOT_CLUSTER_NAME]
  all_pods = kubernetes.core_api(context).list_namespaced_pod(
  namespace,
- label_selector=f'{TAG_SKYPILOT_CLUSTER_NAME}={cluster_name}').items
+ label_selector=
+ f'{constants.TAG_SKYPILOT_CLUSTER_NAME}={cluster_name_on_cloud}'
+ ).items

  # Get the set of found pod names and check if we have all expected pods
  found_pod_names = {pod.metadata.name for pod in all_pods}
- missing_pods = expected_pod_names - found_pod_names
- if missing_pods:
+ missing_pod_names = expected_pod_names - found_pod_names
+ if missing_pod_names:
+ # In _wait_for_pods_to_schedule, we already wait for all pods to go
+ # from pending to scheduled. So if a pod is missing here, it means
+ # something unusual must have happened, and so should be treated as
+ # an exception.
+ # It is also only in _wait_for_pods_to_schedule that
+ # provision_timeout is used.
+ # TODO(kevin): Should we take provision_timeout into account here,
+ # instead of hardcoding the number of retries?
+ if missing_pods_retry >= _MAX_MISSING_PODS_RETRIES:
+ for pod_name in missing_pod_names:
+ reason = _get_pod_missing_reason(context, namespace,
+ cluster_name, pod_name)
+ logger.warning(f'Pod {pod_name} missing: {reason}')
+ raise config_lib.KubernetesError(
+ f'Failed to get all pods after {missing_pods_retry} '
+ f'retries. Some pods may have been terminated or failed '
+ f'unexpectedly. Run `sky logs --provision {cluster_name}` '
+ 'for more details.')
  logger.info('Retrying running pods check: '
- f'Missing pods: {missing_pods}')
+ f'Missing pods: {missing_pod_names}')
  time.sleep(0.5)
+ missing_pods_retry += 1
  continue

  all_pods_running = True
  for pod in all_pods:
  if pod.metadata.name not in expected_pod_names:
  continue
+
+ # Check if pod is terminated/preempted/failed.
+ if (pod.metadata.deletion_timestamp is not None or
+ pod.status.phase == 'Failed'):
+ # Get the reason and write to cluster events before
+ # the pod gets completely deleted from the API.
+ reason = _get_pod_termination_reason(pod, cluster_name)
+ logger.warning(f'Pod {pod.metadata.name} terminated: {reason}')
+ raise config_lib.KubernetesError(
+ f'Pod {pod.metadata.name} has terminated or failed '
+ f'unexpectedly. Run `sky logs --provision {cluster_name}` '
+ 'for more details.')
+
  # Continue if pod and all the containers within the
  # pod are successfully created and running.
  if pod.status.phase == 'Running' and all(
@@ -411,31 +620,6 @@ def _wait_for_pods_to_run(namespace, context, new_nodes):
  time.sleep(1)


- def _run_function_with_retries(func: Callable,
- operation_name: str,
- max_retries: int = _MAX_RETRIES,
- retry_delay: int = 5) -> Any:
- """Runs a function with retries on Kubernetes errors.
- Args:
- func: Function to retry
- operation_name: Name of the operation for logging
- max_retries: Maximum number of retry attempts
- retry_delay: Delay between retries in seconds
- Raises:
- The last exception encountered if all retries fail.
- """
- for attempt in range(max_retries + 1):
- try:
- return func()
- except config_lib.KubernetesError:
- if attempt < max_retries:
- logger.warning(f'Failed to {operation_name} - '
- f'retrying in {retry_delay} seconds.')
- time.sleep(retry_delay)
- else:
- raise
-
-
  @timeline.event
  def pre_init(namespace: str, context: Optional[str], new_nodes: List) -> None:
  """Pre-initialization step for SkyPilot pods.
@@ -670,26 +854,11 @@ def _create_namespaced_pod_with_retries(namespace: str, pod_spec: dict,
  raise e


- def _create_persistent_volume_claim(namespace: str, context: Optional[str],
- pvc_spec: Dict[str, Any]) -> None:
- """Creates a persistent volume claim for SkyServe controller."""
- try:
- kubernetes.core_api(context).read_namespaced_persistent_volume_claim(
- name=pvc_spec['metadata']['name'], namespace=namespace)
- return
- except kubernetes.api_exception() as e:
- if e.status != 404: # Not found
- raise
-
- kubernetes.core_api(context).create_namespaced_persistent_volume_claim(
- namespace=namespace, body=pvc_spec)
-
-
  @timeline.event
  def _wait_for_deployment_pod(context,
  namespace,
  deployment,
- timeout=60) -> List:
+ timeout=300) -> List:
  label_selector = ','.join([
  f'{key}={value}'
  for key, value in deployment.spec.selector.match_labels.items()
@@ -721,13 +890,14 @@ def _wait_for_deployment_pod(context,


  @timeline.event
- def _create_pods(region: str, cluster_name_on_cloud: str,
+ def _create_pods(region: str, cluster_name: str, cluster_name_on_cloud: str,
  config: common.ProvisionConfig) -> common.ProvisionRecord:
  """Create pods based on the config."""
  provider_config = config.provider_config
  namespace = kubernetes_utils.get_namespace_from_config(provider_config)
  context = kubernetes_utils.get_context_from_config(provider_config)
  pod_spec = copy.deepcopy(config.node_config)
+ create_pods_start = datetime.datetime.now(datetime.timezone.utc)

  to_create_deployment = 'deployment_spec' in pod_spec
  if to_create_deployment:
@@ -744,7 +914,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  else:
  pod_spec['metadata']['labels'] = tags
  pod_spec['metadata']['labels'].update(
- {TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})
+ {constants.TAG_SKYPILOT_CLUSTER_NAME: cluster_name_on_cloud})

  terminating_pods = kubernetes_utils.filter_pods(namespace, context, tags,
  ['Terminating'])
@@ -776,8 +946,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  running_pods = kubernetes_utils.filter_pods(namespace, context, tags,
  ['Pending', 'Running'])
  head_pod_name = _get_head_pod_name(running_pods)
+ running_pod_statuses = [{
+ pod.metadata.name: pod.status.phase
+ } for pod in running_pods.values()]
  logger.debug(f'Found {len(running_pods)} existing pods: '
- f'{list(running_pods.keys())}')
+ f'{running_pod_statuses}')

  to_start_count = config.count - len(running_pods)
  if to_start_count < 0:
@@ -793,7 +966,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  nvidia_runtime_exists = False
  try:
  nvidia_runtime_exists = kubernetes_utils.check_nvidia_runtime_class(
- context)
+ context=context)
  except kubernetes.kubernetes.client.ApiException as e:
  logger.warning('run_instances: Error occurred while checking for '
  f'nvidia RuntimeClass - '
@@ -804,14 +977,18 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  'For more details, refer to https://docs.skypilot.co/en/latest/reference/config.html') # pylint: disable=line-too-long

  needs_gpus = False
+ needs_gpus_nvidia = False
  limits = pod_spec['spec']['containers'][0].get('resources',
  {}).get('limits')
  if limits is not None:
- needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(), 0) > 0
+ needs_gpus = limits.get(kubernetes_utils.get_gpu_resource_key(context),
+ 0) > 0
+ needs_gpus_nvidia = limits.get(
+ kubernetes_utils.SUPPORTED_GPU_RESOURCE_KEYS['nvidia'], 0) > 0

  # TPU pods provisioned on GKE use the default containerd runtime.
  # Reference: https://cloud.google.com/kubernetes-engine/docs/how-to/migrate-containerd#overview # pylint: disable=line-too-long
- if nvidia_runtime_exists and needs_gpus:
+ if nvidia_runtime_exists and needs_gpus_nvidia:
  pod_spec['spec']['runtimeClassName'] = 'nvidia'

  logger.debug(f'run_instances: calling create_namespaced_pod '
@@ -819,19 +996,46 @@ def _create_pods(region: str, cluster_name_on_cloud: str,

  def _create_resource_thread(i: int):
  pod_spec_copy = copy.deepcopy(pod_spec)
- if head_pod_name is None and i == 0:
- # First pod should be head if no head exists
- pod_spec_copy['metadata']['labels'].update(constants.HEAD_NODE_TAGS)
- head_selector = _head_service_selector(cluster_name_on_cloud)
- pod_spec_copy['metadata']['labels'].update(head_selector)
- pod_spec_copy['metadata']['name'] = f'{cluster_name_on_cloud}-head'
+ # 0 is for head pod, while 1+ is for worker pods.
+ if i == 0:
+ if head_pod_name is None:
+ # First pod should be head if no head exists
+ pod_spec_copy['metadata']['labels'].update(
+ constants.HEAD_NODE_TAGS)
+ head_selector = _head_service_selector(cluster_name_on_cloud)
+ pod_spec_copy['metadata']['labels'].update(head_selector)
+ pod_spec_copy['metadata'][
+ 'name'] = f'{cluster_name_on_cloud}-head'
+ else:
+ # If head pod already exists, we skip creating it.
+ return
  else:
  # Worker pods
  pod_spec_copy['metadata']['labels'].update(
  constants.WORKER_NODE_TAGS)
- pod_uuid = str(uuid.uuid4())[:6]
- pod_name = f'{cluster_name_on_cloud}-{pod_uuid}'
- pod_spec_copy['metadata']['name'] = f'{pod_name}-worker'
+ pod_name = f'{cluster_name_on_cloud}-worker{i}'
+ if pod_name in running_pods:
+ # If the pod is already running, we skip creating it.
+ return
+ pod_spec_copy['metadata']['name'] = pod_name
+ pod_spec_copy['metadata']['labels']['component'] = pod_name
+
+ # We need to keep the following fields in the pod spec to be same for
+ # head and worker pods.
+ # So that Kueue can merge them into a single PodSet when creating
+ # ProvisioningRequest to trigger scale up of the cluster autoscaler,
+ # this is especially required for DWS queued provisioning mode in GKE.
+ # spec.containers[*].resources.requests
+ # spec.initContainers[*].resources.requests
+ # spec.resources
+ # spec.nodeSelector
+ # spec.tolerations
+ # spec.affinity
+ # resourceClaims
+ # Refer to the following links for more details:
+ # https://cloud.google.com/kubernetes-engine/docs/how-to/provisioningrequest#define_a_provisioningrequest_object # pylint: disable=line-too-long
+ # https://kueue.sigs.k8s.io/docs/admission-check-controllers/provisioning/#podset-merge-policy # pylint: disable=line-too-long
+ if config.count > 1:
  # For multi-node support, we put a soft-constraint to schedule
  # worker pods on different nodes than the head pod.
  # This is not set as a hard constraint because if different nodes
@@ -850,7 +1054,7 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  'podAffinityTerm': {
  'labelSelector': {
  'matchExpressions': [{
- 'key': TAG_SKYPILOT_CLUSTER_NAME,
+ 'key': constants.TAG_SKYPILOT_CLUSTER_NAME,
  'operator': 'In',
  'values': [cluster_name_on_cloud]
  }]
@@ -883,9 +1087,25 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
  pod_spec_copy['spec']['tolerations'] = existing_tolerations + [
  tpu_toleration
  ]
+ # Add GPU toleration if GPU is requested.
+ # The nodes provisioned by DWS with flex start with queued provisioning
+ # mode have the GPU taint, so we have to add the GPU toleration.
+ # No need to check if DWS is enabled here since this has no side effect
+ # to the non-DWS case.
+ if needs_gpus:
+ gpu_toleration = {
+ 'key': kubernetes_utils.get_gpu_resource_key(context),
+ 'operator': 'Exists',
+ 'effect': 'NoSchedule'
+ }
+ # Preserve existing tolerations if any
+ existing_tolerations = pod_spec_copy['spec'].get('tolerations', [])
+ pod_spec_copy['spec']['tolerations'] = existing_tolerations + [
+ gpu_toleration
+ ]

  if to_create_deployment:
- _create_persistent_volume_claim(namespace, context, pvc_spec)
+ volume.create_persistent_volume_claim(namespace, context, pvc_spec)

  # It's safe to directly modify the template spec in the deployment spec
  # because controller pod is singleton, i in [0].
@@ -893,9 +1113,12 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
893
1113
  # Add the deployment name as a label to the pod spec
894
1114
  deployment_name = deployment_spec['metadata']['name']
895
1115
  pod_spec_copy['metadata']['labels'][
896
- TAG_SKYPILOT_DEPLOYMENT_NAME] = deployment_name
1116
+ k8s_constants.TAG_SKYPILOT_DEPLOYMENT_NAME] = deployment_name
897
1117
  template_pod_spec['metadata'] = pod_spec_copy['metadata']
898
1118
  template_pod_spec['spec'].update(pod_spec_copy['spec'])
1119
+ # Propagate the labels to the deployment for identification.
1120
+ deployment_spec['metadata']['labels'] = pod_spec_copy['metadata'][
1121
+ 'labels']
899
1122
  try:
900
1123
  return kubernetes.apps_api(
901
1124
  context).create_namespaced_deployment(
@@ -904,6 +1127,10 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
904
1127
  print('Deployment failed', e)
905
1128
  raise e
906
1129
 
1130
+ # Check if any PVCs with access mode ReadWriteOnce or ReadWriteOncePod
1131
+ # are used by any pod in the namespace.
1132
+ volume.check_pvc_usage_for_pod(context, namespace, pod_spec_copy)
1133
+
907
1134
  return _create_namespaced_pod_with_retries(namespace, pod_spec_copy,
908
1135
  context)
909
1136
 
@@ -922,9 +1149,16 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
922
1149
  'and then up the cluster again.')
923
1150
  raise exceptions.InconsistentHighAvailabilityError(message)
924
1151
 
925
- # Create pods in parallel
926
- created_resources = subprocess_utils.run_in_parallel(
927
- _create_resource_thread, list(range(to_start_count)), _NUM_THREADS)
1152
+ created_resources = []
1153
+ if to_start_count > 0:
1154
+ # Create pods in parallel.
1155
+ # Use `config.count` instead of `to_start_count` to keep the index of
1156
+ # the Pods consistent especially for the case where some Pods are down
1157
+ # due to node failure or manual termination, etc. and then launch
1158
+ # again to create the Pods back.
1159
+ # The existing Pods will be skipped in _create_resource_thread.
1160
+ created_resources = subprocess_utils.run_in_parallel(
1161
+ _create_resource_thread, list(range(config.count)), _NUM_THREADS)
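The loop above iterates over `range(config.count)` rather than `range(to_start_count)` so that worker indices stay stable across relaunches; pods that already exist are simply skipped inside `_create_resource_thread`. A simplified, sequential sketch of that idea (the `-head`/`-worker{i}` naming mirrors the code above; the head handling is simplified and the threading helper is replaced with a plain loop):

```python
# Sketch: recreate only the missing pods while keeping worker indices stable.
def pods_to_create(cluster_name_on_cloud: str, count: int,
                   running_pods: dict) -> list:
    names = []
    for i in range(count):
        if i == 0:
            name = f'{cluster_name_on_cloud}-head'
        else:
            name = f'{cluster_name_on_cloud}-worker{i}'
        if name in running_pods:
            continue  # Already running; skip, as _create_resource_thread does.
        names.append(name)
    return names


# With worker1 still alive, only the head and worker2 would be recreated.
print(pods_to_create('my-cluster-abcd', 3,
                     {'my-cluster-abcd-worker1': object()}))
```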
928
1162
 
929
1163
  if to_create_deployment:
930
1164
  deployments = copy.deepcopy(created_resources)
@@ -937,20 +1171,22 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
937
1171
  pods = created_resources
938
1172
 
939
1173
  created_pods = {}
1174
+ valid_pods = []
940
1175
  for pod in pods:
1176
+ # In case the Pod was not created
1177
+ if pod is None:
1178
+ continue
1179
+ valid_pods.append(pod)
941
1180
  created_pods[pod.metadata.name] = pod
942
1181
  if head_pod_name is None and _is_head(pod):
943
1182
  head_pod_name = pod.metadata.name
1183
+ pods = valid_pods
1184
+
1185
+ # running_pods may include Pending Pods, so add them to the pods list
1186
+ # to wait for them to be scheduled and running.
1187
+ if running_pods:
1188
+ pods = pods + list(running_pods.values())
944
1189
 
945
- networking_mode = network_utils.get_networking_mode(
946
- config.provider_config.get('networking_mode'))
947
- if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
948
- # Adding the jump pod to the new_nodes list as well so it can be
949
- # checked if it's scheduled and running along with other pods.
950
- ssh_jump_pod_name = pod_spec['metadata']['labels']['skypilot-ssh-jump']
951
- jump_pod = kubernetes.core_api(context).read_namespaced_pod(
952
- ssh_jump_pod_name, namespace)
953
- pods.append(jump_pod)
954
1190
  provision_timeout = provider_config['timeout']
955
1191
 
956
1192
  wait_str = ('indefinitely'
@@ -960,12 +1196,17 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
960
1196
 
961
1197
  # Wait until the pods are scheduled and surface cause for error
962
1198
  # if there is one
963
- _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout)
1199
+ _wait_for_pods_to_schedule(namespace, context, pods, provision_timeout,
1200
+ cluster_name, create_pods_start)
1201
+ # Reset the spinner message here because it might have hinted at
1202
+ # autoscaling while waiting for pods to schedule.
1203
+ rich_utils.force_update_status(
1204
+ ux_utils.spinner_message('Launching', cluster_name=cluster_name))
964
1205
  # Wait until the pods and their containers are up and running, and
965
1206
  # fail early if there is an error
966
1207
  logger.debug(f'run_instances: waiting for pods to be running (pulling '
967
1208
  f'images): {[pod.metadata.name for pod in pods]}')
968
- _wait_for_pods_to_run(namespace, context, pods)
1209
+ _wait_for_pods_to_run(namespace, context, cluster_name, pods)
969
1210
  logger.debug(f'run_instances: all pods are scheduled and running: '
970
1211
  f'{[pod.metadata.name for pod in pods]}')
971
1212
 
@@ -981,11 +1222,11 @@ def _create_pods(region: str, cluster_name_on_cloud: str,
981
1222
  )
982
1223
 
983
1224
 
984
- def run_instances(region: str, cluster_name_on_cloud: str,
1225
+ def run_instances(region: str, cluster_name: str, cluster_name_on_cloud: str,
985
1226
  config: common.ProvisionConfig) -> common.ProvisionRecord:
986
1227
  """Runs instances for the given cluster."""
987
1228
  try:
988
- return _create_pods(region, cluster_name_on_cloud, config)
1229
+ return _create_pods(region, cluster_name, cluster_name_on_cloud, config)
989
1230
  except (kubernetes.api_exception(), config_lib.KubernetesError) as e:
990
1231
  e_msg = common_utils.format_exception(e).replace('\n', ' ')
991
1232
  logger.warning('run_instances: Error occurred when creating pods: '
@@ -1006,42 +1247,10 @@ def stop_instances(
1006
1247
  raise NotImplementedError()
1007
1248
 
1008
1249
 
1009
- def _delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
1010
- resource_name: str) -> None:
1011
- """Helper to delete Kubernetes resources with 404 handling and retries.
1012
-
1013
- Args:
1014
- delete_func: Function to call to delete the resource
1015
- resource_type: Type of resource being deleted (e.g. 'service'),
1016
- used in logging
1017
- resource_name: Name of the resource being deleted, used in logging
1018
- """
1019
- max_retries = 3
1020
- retry_delay = 5 # seconds
1021
-
1022
- for attempt in range(max_retries):
1023
- try:
1024
- delete_func()
1025
- return
1026
- except kubernetes.api_exception() as e:
1027
- if e.status == 404:
1028
- logger.warning(
1029
- f'terminate_instances: Tried to delete {resource_type} '
1030
- f'{resource_name}, but the {resource_type} was not '
1031
- 'found (404).')
1032
- return
1033
- elif attempt < max_retries - 1:
1034
- logger.warning(f'terminate_instances: Failed to delete '
1035
- f'{resource_type} {resource_name} (attempt '
1036
- f'{attempt + 1}/{max_retries}). Error: {e}. '
1037
- f'Retrying in {retry_delay} seconds...')
1038
- time.sleep(retry_delay)
1039
- else:
1040
- raise
1041
-
1042
-
1043
- def _delete_services(name_prefix: str, namespace: str,
1044
- context: Optional[str]) -> None:
1250
+ def _delete_services(name_prefix: str,
1251
+ namespace: str,
1252
+ context: Optional[str],
1253
+ skip_ssh_service: bool = False) -> None:
1045
1254
  """Delete services with the given name prefix.
1046
1255
 
1047
1256
  Args:
@@ -1050,18 +1259,21 @@ def _delete_services(name_prefix: str, namespace: str,
1050
1259
  context: Kubernetes context
1051
1260
  """
1052
1261
  # TODO(andy): We should use tag for the service filter.
1053
- for service_name in [name_prefix, f'{name_prefix}-ssh']:
1262
+ services = ([name_prefix, f'{name_prefix}-ssh']
1263
+ if not skip_ssh_service else [name_prefix])
1264
+ for service_name in services:
1054
1265
  # Since we are not saving this lambda, it's a false positive.
1055
1266
  # TODO(andyl): Wait for
1056
1267
  # https://github.com/pylint-dev/pylint/issues/5263.
1057
1268
  # pylint: disable=cell-var-from-loop
1058
- _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.core_api(
1059
- context).delete_namespaced_service(name=service_name,
1060
- namespace=namespace,
1061
- _request_timeout=config_lib.
1062
- DELETION_TIMEOUT),
1063
- resource_type='service',
1064
- resource_name=service_name)
1269
+ kubernetes_utils.delete_k8s_resource_with_retry(
1270
+ delete_func=lambda: kubernetes.core_api(
1271
+ context).delete_namespaced_service(name=service_name,
1272
+ namespace=namespace,
1273
+ _request_timeout=config_lib.
1274
+ DELETION_TIMEOUT),
1275
+ resource_type='service',
1276
+ resource_name=service_name)
1065
1277
 
1066
1278
 
1067
1279
  def _terminate_node(namespace: str,
@@ -1075,13 +1287,16 @@ def _terminate_node(namespace: str,
1075
1287
  # Delete services for the head pod
1076
1288
  # services are specified in sky/templates/kubernetes-ray.yml.j2
1077
1289
  _delete_services(pod_name, namespace, context)
1290
+ else:
1291
+ # No ssh service is created for worker pods
1292
+ _delete_services(pod_name, namespace, context, skip_ssh_service=True)
1078
1293
 
1079
1294
  # Note - delete pod after all other resources are deleted.
1080
1295
  # This is to ensure there are no leftover resources if this down is run
1081
1296
  # from within the pod, e.g., for autodown.
1082
1297
  # Note - some misbehaving pods may not terminate gracefully if they have
1083
1298
  # open file descriptors. We force delete pods to avoid this.
1084
- _delete_k8s_resource_with_retry(
1299
+ kubernetes_utils.delete_k8s_resource_with_retry(
1085
1300
  delete_func=lambda: kubernetes.core_api(context).delete_namespaced_pod(
1086
1301
  name=pod_name,
1087
1302
  namespace=namespace,
@@ -1099,26 +1314,28 @@ def _terminate_deployment(cluster_name: str, namespace: str,
1099
1314
 
1100
1315
  # Delete deployment
1101
1316
  deployment_name = _get_deployment_name(cluster_name)
1102
- _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.apps_api(
1103
- context).delete_namespaced_deployment(name=deployment_name,
1104
- namespace=namespace,
1105
- _request_timeout=config_lib.
1106
- DELETION_TIMEOUT),
1107
- resource_type='deployment',
1108
- resource_name=deployment_name)
1317
+ kubernetes_utils.delete_k8s_resource_with_retry(
1318
+ delete_func=lambda: kubernetes.apps_api(
1319
+ context).delete_namespaced_deployment(name=deployment_name,
1320
+ namespace=namespace,
1321
+ _request_timeout=config_lib.
1322
+ DELETION_TIMEOUT),
1323
+ resource_type='deployment',
1324
+ resource_name=deployment_name)
1109
1325
 
1110
1326
  # Delete PVCs
1111
1327
  pvc_name = _get_pvc_name(
1112
1328
  cluster_name,
1113
1329
  kubernetes_utils.HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME)
1114
1330
  # pylint: disable=cell-var-from-loop
1115
- _delete_k8s_resource_with_retry(delete_func=lambda: kubernetes.core_api(
1116
- context).delete_namespaced_persistent_volume_claim(
1117
- name=pvc_name,
1118
- namespace=namespace,
1119
- _request_timeout=config_lib.DELETION_TIMEOUT),
1120
- resource_type='pvc',
1121
- resource_name=pvc_name)
1331
+ kubernetes_utils.delete_k8s_resource_with_retry(
1332
+ delete_func=lambda: kubernetes.core_api(
1333
+ context).delete_namespaced_persistent_volume_claim(
1334
+ name=pvc_name,
1335
+ namespace=namespace,
1336
+ _request_timeout=config_lib.DELETION_TIMEOUT),
1337
+ resource_type='pvc',
1338
+ resource_name=pvc_name)
1122
1339
 
1123
1340
 
1124
1341
  def terminate_instances(
@@ -1133,18 +1350,6 @@ def terminate_instances(
1133
1350
  ray_tag_filter(cluster_name_on_cloud),
1134
1351
  None)
1135
1352
 
1136
- # Clean up the SSH jump pod if in use
1137
- networking_mode = network_utils.get_networking_mode(
1138
- provider_config.get('networking_mode'))
1139
- if networking_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
1140
- pod_name = list(pods.keys())[0]
1141
- try:
1142
- kubernetes_utils.clean_zombie_ssh_jump_pod(namespace, context,
1143
- pod_name)
1144
- except Exception as e: # pylint: disable=broad-except
1145
- logger.warning('terminate_instances: Error occurred when analyzing '
1146
- f'SSH Jump pod: {e}')
1147
-
1148
1353
  if is_high_availability_cluster_by_kubectl(cluster_name_on_cloud, context,
1149
1354
  namespace):
1150
1355
  # For high availability controllers, terminate the deployment
@@ -1175,16 +1380,11 @@ def get_cluster_info(
1175
1380
 
1176
1381
  running_pods = kubernetes_utils.filter_pods(
1177
1382
  namespace, context, ray_tag_filter(cluster_name_on_cloud), ['Running'])
1383
+ logger.debug(f'Running pods: {list(running_pods.keys())}')
1178
1384
 
1179
1385
  pods: Dict[str, List[common.InstanceInfo]] = {}
1180
1386
  head_pod_name = None
1181
1387
 
1182
- port_forward_mode = kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD
1183
- network_mode_str = skypilot_config.get_nested(('kubernetes', 'networking'),
1184
- port_forward_mode.value)
1185
- network_mode = kubernetes_enums.KubernetesNetworkingMode.from_str(
1186
- network_mode_str)
1187
- external_ip = kubernetes_utils.get_external_ip(network_mode, context)
1188
1388
  port = 22
1189
1389
  if not provider_config.get('use_internal_ips', False):
1190
1390
  port = kubernetes_utils.get_head_ssh_port(cluster_name_on_cloud,
@@ -1198,10 +1398,12 @@ def get_cluster_info(
1198
1398
  common.InstanceInfo(
1199
1399
  instance_id=pod_name,
1200
1400
  internal_ip=internal_ip,
1201
- external_ip=(None if network_mode == port_forward_mode else
1202
- external_ip),
1401
+ external_ip=None,
1203
1402
  ssh_port=port,
1204
1403
  tags=pod.metadata.labels,
1404
+ # TODO(hailong): `cluster.local` may need to be configurable
1405
+ # Service name is same as the pod name for now.
1406
+ internal_svc=f'{pod_name}.{namespace}.svc.cluster.local',
1205
1407
  )
1206
1408
  ]
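Each pod now also records an `internal_svc` FQDN of the form `<pod>.<namespace>.svc.cluster.local`, which assumes a per-pod Service named after the pod and the default `cluster.local` cluster domain (the TODO above notes the domain may need to become configurable). A tiny sketch of the naming convention:

```python
# Sketch of the in-cluster DNS name used for pod-to-pod access.
def internal_service_dns(pod_name: str, namespace: str,
                         cluster_domain: str = 'cluster.local') -> str:
    # Assumes a Service whose name equals the pod name exists in `namespace`.
    return f'{pod_name}.{namespace}.svc.{cluster_domain}'


assert (internal_service_dns('my-cluster-head', 'default')
        == 'my-cluster-head.default.svc.cluster.local')
```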
1207
1409
  if _is_head(pod):
@@ -1210,10 +1412,16 @@ def get_cluster_info(
1210
1412
  assert head_spec is not None, pod
1211
1413
  cpu_request = head_spec.containers[0].resources.requests['cpu']
1212
1414
 
1213
- assert cpu_request is not None, 'cpu_request should not be None'
1415
+ if cpu_request is None:
1416
+ raise RuntimeError(f'Pod {cluster_name_on_cloud}-head not found'
1417
+ ' or not Running; check the Pod status.')
1214
1418
 
1215
1419
  ssh_user = 'sky'
1216
- get_k8s_ssh_user_cmd = 'echo $(whoami)'
1420
+ # Use pattern matching to extract SSH user, handling MOTD contamination.
1421
+ # Some container images (like CUDA-Q) print MOTD when login shells start,
1422
+ # which can contaminate command output. We use a unique pattern to extract
1423
+ # the actual username reliably.
1424
+ get_k8s_ssh_user_cmd = 'echo "SKYPILOT_SSH_USER: $(whoami)"'
1217
1425
  assert head_pod_name is not None
1218
1426
  runner = command_runner.KubernetesCommandRunner(
1219
1427
  ((namespace, context), head_pod_name))
@@ -1223,10 +1431,24 @@ def get_cluster_info(
1223
1431
  stream_logs=False)
1224
1432
  _raise_command_running_error('get ssh user', get_k8s_ssh_user_cmd,
1225
1433
  head_pod_name, rc, stdout + stderr)
1226
- ssh_user = stdout.strip()
1434
+
1435
+ # Extract SSH user using pattern matching
1436
+ ssh_user_match = _SSH_USER_PATTERN.search(stdout)
1437
+ if ssh_user_match:
1438
+ ssh_user = ssh_user_match.group(1)
1439
+ else:
1440
+ raise ValueError('Failed to find SSH user identifier: '
1441
+ f'{stdout + stderr}')
1227
1442
  logger.debug(
1228
1443
  f'Using ssh user {ssh_user} for cluster {cluster_name_on_cloud}')
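`_SSH_USER_PATTERN` is defined elsewhere in this module; a plausible definition and a quick illustration of why the unique marker survives MOTD noise are sketched below (the exact regex is an assumption, not the module's actual constant):

```python
import re

# Assumed to mirror the module-level _SSH_USER_PATTERN: capture the word that
# follows the unique marker so MOTD lines printed by the image are ignored.
_SSH_USER_PATTERN = re.compile(r'SKYPILOT_SSH_USER: (\w+)')

stdout = (
    'Welcome to the CUDA-Q container!\n'       # MOTD noise from the image
    'Documentation: https://example.invalid\n'
    'SKYPILOT_SSH_USER: sky\n')
match = _SSH_USER_PATTERN.search(stdout)
assert match is not None and match.group(1) == 'sky'
```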
1229
1444
 
1445
+ # cpu_request may be a string like `100m`, so parse and convert it.
1446
+ num_cpus = kubernetes_utils.parse_cpu_or_gpu_resource_to_float(cpu_request)
1447
+ # 'num-cpus' for ray must be an integer, but we should not set it to 0 if
1448
+ # cpus is <1.
1449
+ # Keep consistent with the logic in clouds/kubernetes.py
1450
+ str_cpus = str(max(int(num_cpus), 1))
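`parse_cpu_or_gpu_resource_to_float` lives in `kubernetes_utils`; the sketch below shows the kind of conversion involved for Kubernetes CPU quantities such as `100m`, together with the clamp to at least one CPU for Ray's integer `num-cpus` (a standalone approximation, not the actual helper):

```python
# Approximate the conversion applied above: '100m' -> 0.1 -> num-cpus '1'.
def parse_cpu_quantity(quantity: str) -> float:
    quantity = str(quantity)
    if quantity.endswith('m'):          # millicores, e.g. '100m'
        return int(quantity[:-1]) / 1000.0
    return float(quantity)              # plain cores, e.g. '2' or '0.5'


def ray_num_cpus(cpu_request: str) -> str:
    # Ray's num-cpus must be an integer; never report 0 for sub-core requests.
    return str(max(int(parse_cpu_quantity(cpu_request)), 1))


assert ray_num_cpus('100m') == '1'
assert ray_num_cpus('2') == '2'
```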
1451
+
1230
1452
  return common.ClusterInfo(
1231
1453
  instances=pods,
1232
1454
  head_instance_id=head_pod_name,
@@ -1236,56 +1458,375 @@ def get_cluster_info(
1236
1458
  # problems for other pods.
1237
1459
  custom_ray_options={
1238
1460
  'object-store-memory': 500000000,
1239
- 'num-cpus': cpu_request,
1461
+ 'num-cpus': str_cpus,
1240
1462
  },
1241
1463
  provider_name='kubernetes',
1242
1464
  provider_config=provider_config)
1243
1465
 
1244
1466
 
1467
+ def _get_pod_termination_reason(pod: Any, cluster_name: str) -> str:
1468
+ """Get pod termination reason and write to cluster events.
1469
+
1470
+ Checks both pod conditions (for preemption/disruption) and
1471
+ container statuses (for exit codes/errors).
1472
+ """
1473
+ latest_timestamp = pod.status.start_time or datetime.datetime.min
1474
+ ready_state = 'Unknown'
1475
+ termination_reason = 'Terminated unexpectedly'
1476
+ container_reasons = []
1477
+
1478
+ # Check pod status conditions for a high-level overview.
1479
+ # No need to sort, as each condition.type will only appear once.
1480
+ for condition in pod.status.conditions:
1481
+ reason = condition.reason or 'Unknown reason'
1482
+ message = condition.message or ''
1483
+
1484
+ # Get last known readiness state.
1485
+ if condition.type == 'Ready':
1486
+ ready_state = f'{reason} ({message})' if message else reason
1487
+ # Kueue preemption, as defined in:
1488
+ # https://pkg.go.dev/sigs.k8s.io/kueue/pkg/controller/jobs/pod#pkg-constants
1489
+ elif condition.type == 'TerminationTarget':
1490
+ termination_reason = f'Preempted by Kueue: {reason}'
1491
+ if message:
1492
+ termination_reason += f' ({message})'
1493
+ # Generic disruption.
1494
+ elif condition.type == 'DisruptionTarget':
1495
+ termination_reason = f'Disrupted: {reason}'
1496
+ if message:
1497
+ termination_reason += f' ({message})'
1498
+
1499
+ if condition.last_transition_time is not None:
1500
+ latest_timestamp = max(latest_timestamp,
1501
+ condition.last_transition_time)
1502
+
1503
+ pod_reason = (f'{termination_reason}.\n'
1504
+ f'Last known state: {ready_state}.')
1505
+
1506
+ # Check container statuses for exit codes/errors
1507
+ if pod.status and pod.status.container_statuses:
1508
+ for container_status in pod.status.container_statuses:
1509
+ terminated = container_status.state.terminated
1510
+ if terminated:
1511
+ exit_code = terminated.exit_code
1512
+ reason = terminated.reason
1513
+ if exit_code == 0:
1514
+ # skip exit 0 (non-failed) just for sanity
1515
+ logger.debug(f'{pod.metadata.name}/{container_status.name} '
1516
+ 'had exit code 0. Skipping.')
1517
+ continue
1518
+ if reason is None:
1519
+ # just in case reason is None, provide a default for debugging
1520
+ reason = f'exit({exit_code})'
1521
+ container_reasons.append(reason)
1522
+ latest_timestamp = max(latest_timestamp, terminated.finished_at)
1523
+
1524
+ # TODO (kyuds): later, if needed, query `last_state` too.
1525
+
1526
+ # Normally we will have a single container per pod for skypilot
1527
+ # but doing this just in case there are multiple containers.
1528
+ if container_reasons:
1529
+ pod_reason += f'\nContainer errors: {" | ".join(container_reasons)}'
1530
+
1531
+ global_user_state.add_cluster_event(
1532
+ cluster_name,
1533
+ None,
1534
+ f'[kubernetes pod {pod.metadata.name} terminated] {pod_reason}',
1535
+ global_user_state.ClusterEventType.DEBUG,
1536
+ transitioned_at=int(latest_timestamp.timestamp()),
1537
+ )
1538
+ return pod_reason
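To make the condition handling above concrete, this is roughly the message produced for a Kueue preemption. The objects are hand-built dicts standing in for the Kubernetes client models, and the real function additionally records a cluster event:

```python
# Toy illustration of how pod conditions map to a termination message.
conditions = [
    {'type': 'Ready', 'reason': 'PodFailed', 'message': ''},
    {'type': 'TerminationTarget', 'reason': 'Preempted',
     'message': 'preempted by higher-priority workload'},
]

termination_reason = 'Terminated unexpectedly'
ready_state = 'Unknown'
for condition in conditions:
    reason = condition['reason'] or 'Unknown reason'
    message = condition['message'] or ''
    if condition['type'] == 'Ready':
        ready_state = f'{reason} ({message})' if message else reason
    elif condition['type'] == 'TerminationTarget':  # Kueue preemption
        termination_reason = f'Preempted by Kueue: {reason}'
        if message:
            termination_reason += f' ({message})'

print(f'{termination_reason}.\nLast known state: {ready_state}.')
# Preempted by Kueue: Preempted (preempted by higher-priority workload).
# Last known state: PodFailed.
```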
1539
+
1540
+
1541
+ def _get_pod_missing_reason(context: Optional[str], namespace: str,
1542
+ cluster_name: str, pod_name: str) -> Optional[str]:
1543
+ """Get events for missing pod and write to cluster events."""
1544
+ logger.debug(f'Analyzing events for pod {pod_name}')
1545
+ pod_field_selector = (
1546
+ f'involvedObject.kind=Pod,involvedObject.name={pod_name}')
1547
+ pod_events = kubernetes.core_api(context).list_namespaced_event(
1548
+ namespace,
1549
+ field_selector=pod_field_selector,
1550
+ _request_timeout=kubernetes.API_TIMEOUT).items
1551
+ pod_events = sorted(
1552
+ pod_events,
1553
+ key=lambda event: event.metadata.creation_timestamp,
1554
+ # latest event appears first
1555
+ reverse=True)
1556
+ last_scheduled_node = None
1557
+ insert_new_pod_event = True
1558
+ new_event_inserted = False
1559
+ inserted_pod_events = 0
1560
+
1561
+ for event in pod_events:
1562
+ if event.reason == 'Scheduled':
1563
+ pattern = r'Successfully assigned (\S+) to (\S+)'
1564
+ match = re.search(pattern, event.message)
1565
+ if match:
1566
+ scheduled_node = match.group(2)
1567
+ last_scheduled_node = scheduled_node
1568
+ if insert_new_pod_event:
1569
+ # Try inserting the latest events first. If the event is a
1570
+ # duplicate, it means the event (and any previous events) have
1571
+ # already been inserted - so do not insert further events.
1572
+ try:
1573
+ global_user_state.add_cluster_event(
1574
+ cluster_name,
1575
+ None, f'[kubernetes pod {pod_name}] '
1576
+ f'{event.reason} {event.message}',
1577
+ global_user_state.ClusterEventType.DEBUG,
1578
+ transitioned_at=int(
1579
+ event.metadata.creation_timestamp.timestamp()),
1580
+ expose_duplicate_error=True)
1581
+ logger.debug(f'[pod {pod_name}] encountered new pod event: '
1582
+ f'{event.metadata.creation_timestamp} '
1583
+ f'{event.reason} {event.message}')
1584
+ except db_utils.UniqueConstraintViolationError:
1585
+ insert_new_pod_event = False
1586
+ else:
1587
+ new_event_inserted = True
1588
+ inserted_pod_events += 1
1589
+
1590
+ logger.debug(f'[pod {pod_name}] processed {len(pod_events)} pod events and '
1591
+ f'inserted {inserted_pod_events} new pod events '
1592
+ 'previously unseen')
1593
+
1594
+ if last_scheduled_node is not None:
1595
+ node_field_selector = ('involvedObject.kind=Node,'
1596
+ f'involvedObject.name={last_scheduled_node}')
1597
+ node_events = kubernetes.core_api(context).list_namespaced_event(
1598
+ namespace,
1599
+ field_selector=node_field_selector,
1600
+ _request_timeout=kubernetes.API_TIMEOUT).items
1601
+ node_events = sorted(
1602
+ node_events,
1603
+ key=lambda event: event.metadata.creation_timestamp,
1604
+ # latest event appears first
1605
+ reverse=True)
1606
+ insert_new_node_event = True
1607
+ inserted_node_events = 0
1608
+ for event in node_events:
1609
+ if insert_new_node_event:
1610
+ # Try inserting the latest events first. If the event is a
1611
+ # duplicate, it means the event (and any previous events) have
1612
+ # already been inserted - so do not insert further events.
1613
+ try:
1614
+ global_user_state.add_cluster_event(
1615
+ cluster_name,
1616
+ None, f'[kubernetes node {last_scheduled_node}] '
1617
+ f'{event.reason} {event.message}',
1618
+ global_user_state.ClusterEventType.DEBUG,
1619
+ transitioned_at=int(
1620
+ event.metadata.creation_timestamp.timestamp()),
1621
+ expose_duplicate_error=True)
1622
+ logger.debug(
1623
+ f'[pod {pod_name}] encountered new node event: '
1624
+ f'{event.metadata.creation_timestamp} '
1625
+ f'{event.reason} {event.message}')
1626
+ except db_utils.UniqueConstraintViolationError:
1627
+ insert_new_node_event = False
1628
+ else:
1629
+ new_event_inserted = True
1630
+ inserted_node_events += 1
1631
+
1632
+ logger.debug(f'[pod {pod_name}: node {last_scheduled_node}] '
1633
+ f'processed {len(node_events)} node events and '
1634
+ f'inserted {inserted_node_events} new node events '
1635
+ 'previously unseen')
1636
+ else:
1637
+ logger.debug(f'[pod {pod_name}] could not determine the node '
1638
+ 'the pod was scheduled to')
1639
+
1640
+ if not new_event_inserted:
1641
+ # If new event is not inserted, there is no useful information to
1642
+ # return. Return None.
1643
+ return None
1644
+
1645
+ # Analyze the events for failure
1646
+ failure_reason = None
1647
+ failure_decisiveness = 0
1648
+
1649
+ def _record_failure_reason(reason: str, decisiveness: int):
1650
+ nonlocal failure_reason, failure_decisiveness
1651
+ if decisiveness > failure_decisiveness:
1652
+ failure_reason = reason
1653
+ failure_decisiveness = decisiveness
1654
+
1655
+ cluster_events = global_user_state.get_cluster_events(
1656
+ cluster_name, None, global_user_state.ClusterEventType.DEBUG)
1657
+ for event in cluster_events:
1658
+ if event.startswith('[kubernetes pod'):
1659
+ event = event.split(']')[1].strip()
1660
+ elif event.startswith('[kubernetes node'):
1661
+ event = event.split(']')[1].strip()
1662
+
1663
+ if event.startswith('NodeNotReady '):
1664
+ _record_failure_reason(event[len('NodeNotReady '):], 1)
1665
+ elif event.startswith('TaintManagerEviction '):
1666
+ # usually the event message for TaintManagerEviction is not useful
1667
+ # so we record a more generic message.
1668
+ _record_failure_reason('pod was evicted by taint manager', 2)
1669
+ elif event.startswith('DeletingNode '):
1670
+ _record_failure_reason(event[len('DeletingNode '):], 3)
1671
+ return failure_reason
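The `decisiveness` ranking above simply prefers the most specific explanation seen across events: `DeletingNode` (3) beats `TaintManagerEviction` (2), which beats `NodeNotReady` (1). A compact sketch of the same idea, using module-level state instead of the nested closure in the code above:

```python
# Keep the highest-decisiveness reason seen so far.
failure_reason = None
failure_decisiveness = 0


def record(reason: str, decisiveness: int) -> None:
    global failure_reason, failure_decisiveness
    if decisiveness > failure_decisiveness:
        failure_reason = reason
        failure_decisiveness = decisiveness


record('node not ready', 1)
record('pod was evicted by taint manager', 2)
record('node my-node was deleted', 3)
assert failure_reason == 'node my-node was deleted'
```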
1672
+
1673
+
1674
+ def list_namespaced_pod(context: Optional[str], namespace: str,
1675
+ cluster_name_on_cloud: str, is_ssh: bool, identity: str,
1676
+ label_selector: str) -> List[Any]:
1677
+ # Get all the pods with the label skypilot-cluster-name: <cluster_name>
1678
+ try:
1679
+ # log the query parameters we pass to the k8s api
1680
+ logger.debug(f'Querying k8s api for pods:\n'
1681
+ f'context: {context}\n'
1682
+ f'namespace: {namespace}\n'
1683
+ f'label selector:`{label_selector}`.')
1684
+
1685
+ response = kubernetes.core_api(context).list_namespaced_pod(
1686
+ namespace,
1687
+ label_selector=label_selector,
1688
+ _request_timeout=kubernetes.API_TIMEOUT)
1689
+
1690
+ # log PodList response info
1691
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
1692
+ logger.debug(f'k8s api response for `{label_selector}`:\n'
1693
+ f'apiVersion={response.api_version}, '
1694
+ f'kind={response.kind},\n'
1695
+ f'metadata={response.metadata}')
1696
+
1697
+ pods = response.items
1698
+
1699
+ # log detailed Pod info
1700
+ if sky_logging.logging_enabled(logger, sky_logging.DEBUG):
1701
+ logger.debug(f'k8s api response for `{label_selector}`: '
1702
+ f'len(pods)={len(pods)}')
1703
+ for pod in pods:
1704
+ logger.debug(f'k8s pod info for `{label_selector}`: '
1705
+ f'pod.apiVersion={pod.api_version}, '
1706
+ f'pod.kind={pod.kind}, \n'
1707
+ f'pod.name={pod.metadata.name}, '
1708
+ f'pod.namespace={pod.metadata.namespace}, \n'
1709
+ f'pod.labels={pod.metadata.labels}, \n'
1710
+ f'pod.annotations={pod.metadata.annotations}, \n'
1711
+ 'pod.creationTimestamp='
1712
+ f'{pod.metadata.creation_timestamp}, '
1713
+ 'pod.deletionTimestamp='
1714
+ f'{pod.metadata.deletion_timestamp}, \n'
1715
+ f'pod.status={pod.status}')
1716
+ return pods
1717
+
1718
+ except kubernetes.max_retry_error():
1719
+ with ux_utils.print_exception_no_traceback():
1720
+ if is_ssh:
1721
+ node_pool = common_utils.removeprefix(context,
1722
+ 'ssh-') if context else ''
1723
+ msg = (
1724
+ f'Cannot connect to SSH Node Pool {node_pool}. '
1725
+ 'Please check if the SSH Node Pool is up and accessible. '
1726
+ 'To debug, run `sky check ssh` to check the status of '
1727
+ 'the SSH Node Pool.')
1728
+ else:
1729
+ ctx = kubernetes_utils.get_current_kube_config_context_name()
1730
+ msg = (f'Network error - check if the {identity} in '
1731
+ f'context {ctx} is up and accessible.')
1732
+ raise exceptions.ClusterStatusFetchingError(
1733
+ f'Failed to query cluster {cluster_name_on_cloud!r} status. ' +
1734
+ msg) from None
1735
+ except Exception as e: # pylint: disable=broad-except
1736
+ with ux_utils.print_exception_no_traceback():
1737
+ raise exceptions.ClusterStatusFetchingError(
1738
+ f'Failed to query {identity} {cluster_name_on_cloud!r} '
1739
+ f'status: {common_utils.format_exception(e)}')
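For context, the query wrapped by `list_namespaced_pod` above is the standard Kubernetes Python client API. A minimal standalone equivalent looks like the following; the namespace, label key, and label value are placeholders for illustration:

```python
# Minimal sketch using the official kubernetes client directly.
from kubernetes import client, config

config.load_kube_config()  # or load_incluster_config() inside a pod
v1 = client.CoreV1Api()
pods = v1.list_namespaced_pod(
    'default',
    label_selector='skypilot-cluster-name=my-cluster-abcd',
    _request_timeout=10).items
for pod in pods:
    print(pod.metadata.name, pod.status.phase)
```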
1740
+
1741
+
1245
1742
  def query_instances(
1743
+ cluster_name: str,
1246
1744
  cluster_name_on_cloud: str,
1247
1745
  provider_config: Optional[Dict[str, Any]] = None,
1248
- non_terminated_only: bool = True
1249
- ) -> Dict[str, Optional[status_lib.ClusterStatus]]:
1746
+ non_terminated_only: bool = True,
1747
+ retry_if_missing: bool = False,
1748
+ ) -> Dict[str, Tuple[Optional['status_lib.ClusterStatus'], Optional[str]]]:
1749
+ # Mapping from pod phase to skypilot status. These are the only valid pod
1750
+ # phases.
1751
+ # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
1250
1752
  status_map = {
1251
1753
  'Pending': status_lib.ClusterStatus.INIT,
1252
1754
  'Running': status_lib.ClusterStatus.UP,
1253
- 'Failed': None,
1755
+ 'Failed': status_lib.ClusterStatus.INIT,
1254
1756
  'Unknown': None,
1255
1757
  'Succeeded': None,
1256
- 'Terminating': None,
1257
1758
  }
1258
1759
 
1259
1760
  assert provider_config is not None
1260
1761
  namespace = kubernetes_utils.get_namespace_from_config(provider_config)
1261
1762
  context = kubernetes_utils.get_context_from_config(provider_config)
1262
-
1263
- # Get all the pods with the label skypilot-cluster: <cluster_name>
1264
- try:
1265
- pods = kubernetes.core_api(context).list_namespaced_pod(
1266
- namespace,
1267
- label_selector=f'skypilot-cluster={cluster_name_on_cloud}',
1268
- _request_timeout=kubernetes.API_TIMEOUT).items
1269
- except kubernetes.max_retry_error():
1270
- with ux_utils.print_exception_no_traceback():
1271
- ctx = kubernetes_utils.get_current_kube_config_context_name()
1272
- raise exceptions.ClusterStatusFetchingError(
1273
- f'Failed to query cluster {cluster_name_on_cloud!r} status. '
1274
- 'Network error - check if the Kubernetes cluster in '
1275
- f'context {ctx} is up and accessible.') from None
1276
- except Exception as e: # pylint: disable=broad-except
1277
- with ux_utils.print_exception_no_traceback():
1278
- raise exceptions.ClusterStatusFetchingError(
1279
- f'Failed to query Kubernetes cluster {cluster_name_on_cloud!r} '
1280
- f'status: {common_utils.format_exception(e)}')
1763
+ is_ssh = context.startswith('ssh-') if context else False
1764
+ identity = 'SSH Node Pool' if is_ssh else 'Kubernetes cluster'
1765
+ label_selector = (f'{constants.TAG_SKYPILOT_CLUSTER_NAME}='
1766
+ f'{cluster_name_on_cloud}')
1767
+
1768
+ attempts = 0
1769
+ pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
1770
+ is_ssh, identity, label_selector)
1771
+ # When we see no pods returned from the k8s api, we assume the pods have
1772
+ # been terminated by the user directly and mark the cluster as terminated
1773
+ # in the global user state.
1774
+ # We add retry logic here as an attempt to mitigate a leak caused by the
1775
+ # kubernetes api returning no pods despite the pods actually existing.
1776
+ while (retry_if_missing and not pods and
1777
+ attempts < _MAX_QUERY_INSTANCES_RETRIES):
1778
+ logger.debug(f'Retrying query of k8s api for {cluster_name_on_cloud} '
1779
+ f'({attempts}/{_MAX_QUERY_INSTANCES_RETRIES}) '
1780
+ f'after {_QUERY_INSTANCES_RETRY_INTERVAL} seconds.')
1781
+ time.sleep(_QUERY_INSTANCES_RETRY_INTERVAL)
1782
+ attempts += 1
1783
+ pods = list_namespaced_pod(context, namespace, cluster_name_on_cloud,
1784
+ is_ssh, identity, label_selector)
1785
+ if len(pods) > 0:
1786
+ logger.info(f'Found {len(pods)} pods for {label_selector} after '
1787
+ f'{attempts} retries.')
1281
1788
 
1282
1789
  # Check if the pods are running or pending
1283
- cluster_status = {}
1790
+ cluster_status: Dict[str, Tuple[Optional['status_lib.ClusterStatus'],
1791
+ Optional[str]]] = {}
1284
1792
  for pod in pods:
1285
- pod_status = status_map[pod.status.phase]
1793
+ phase = pod.status.phase
1794
+ is_terminating = pod.metadata.deletion_timestamp is not None
1795
+ pod_status = status_map[phase]
1796
+ reason = None
1797
+ if phase in ('Failed', 'Unknown') or is_terminating:
1798
+ reason = _get_pod_termination_reason(pod, cluster_name)
1799
+ logger.debug(f'Pod Status ({phase}) Reason(s): {reason}')
1286
1800
  if non_terminated_only and pod_status is None:
1801
+ logger.debug(f'Pod {pod.metadata.name} is terminated, but '
1802
+ 'query_instances is called with '
1803
+ f'non_terminated_only=True. Phase: {phase}')
1287
1804
  continue
1288
- cluster_status[pod.metadata.name] = pod_status
1805
+ pod_name = pod.metadata.name
1806
+ reason = f'{pod_name}: {reason}' if reason is not None else None
1807
+ cluster_status[pod_name] = (pod_status, reason)
1808
+
1809
+ # Find the list of pod names that should be there
1810
+ # from k8s services. Filter duplicates as -ssh service
1811
+ # creates a duplicate entry.
1812
+ target_pod_names = list(
1813
+ set([
1814
+ service['spec']['selector']['component']
1815
+ for service in provider_config.get('services', [])
1816
+ ]))
1817
+
1818
+ for target_pod_name in target_pod_names:
1819
+ if target_pod_name not in cluster_status:
1820
+ # If the pod is not in the cluster_status, it means it's not
1821
+ # running.
1822
+ # Analyze what happened to the pod based on events.
1823
+ reason = _get_pod_missing_reason(context, namespace, cluster_name,
1824
+ target_pod_name)
1825
+ reason = (f'{target_pod_name}: {reason}'
1826
+ if reason is not None else None)
1827
+ if not non_terminated_only:
1828
+ cluster_status[target_pod_name] = (None, reason)
1829
+
1289
1830
  return cluster_status
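Putting the pieces of `query_instances` together: a pod's phase is mapped through `status_map`, and a set `deletion_timestamp` marks the pod as terminating even while the phase still reads `Running`. A condensed sketch of that classification, with the status enum stubbed as strings:

```python
# Condensed sketch of the phase classification in query_instances.
from typing import Optional, Tuple

STATUS_MAP = {
    'Pending': 'INIT',
    'Running': 'UP',
    'Failed': 'INIT',    # Failed pods are surfaced as INIT with a reason.
    'Unknown': None,     # None means "treat as terminated".
    'Succeeded': None,
}


def classify(phase: str,
             deletion_timestamp) -> Tuple[Optional[str], Optional[str]]:
    status = STATUS_MAP[phase]
    reason = None
    if phase in ('Failed', 'Unknown') or deletion_timestamp is not None:
        # The real code calls _get_pod_termination_reason() here.
        reason = 'terminating or failed; see cluster events'
    return status, reason


assert classify('Running', None) == ('UP', None)
assert classify('Failed', None)[0] == 'INIT'
assert classify('Succeeded', None)[0] is None
```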
1290
1831
 
1291
1832
 
@@ -1307,7 +1848,8 @@ def get_command_runners(
1307
1848
 
1308
1849
  # Try to get deployment name from label first
1309
1850
  head_instance_info = instances[pod_name][0]
1310
- deployment = head_instance_info.tags.get(TAG_SKYPILOT_DEPLOYMENT_NAME)
1851
+ deployment = head_instance_info.tags.get(
1852
+ k8s_constants.TAG_SKYPILOT_DEPLOYMENT_NAME)
1311
1853
 
1312
1854
  node_list = [((namespace, context), pod_name)]
1313
1855
  head_runner = command_runner.KubernetesCommandRunner(