skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff compares the contents of two publicly released versions of this package as they appear in their respective public registries. It is provided for informational purposes only.

Potentially problematic release: this version of skypilot-nightly has been flagged as possibly problematic.
Files changed (512):
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
The diff body shown on this page is for sky/provision/kubernetes/utils.py (entry 228 above, +1192 -531):

```diff
@@ -1,6 +1,11 @@
 """Kubernetes utilities for SkyPilot."""
+import collections
+import copy
 import dataclasses
+import datetime
+import enum
 import functools
+import hashlib
 import json
 import math
 import os
@@ -9,12 +14,13 @@ import shutil
 import subprocess
 import time
 import typing
-from typing import Any, Dict, List, Optional, Set, Tuple, Union
-from urllib.parse import urlparse
+from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
+
+import ijson

-import sky
 from sky import clouds
 from sky import exceptions
+from sky import global_user_state
 from sky import models
 from sky import sky_logging
 from sky import skypilot_config
@@ -34,6 +40,7 @@ from sky.utils import schemas
 from sky.utils import status_lib
 from sky.utils import timeline
 from sky.utils import ux_utils
+from sky.utils import yaml_utils

 if typing.TYPE_CHECKING:
     import jinja2
@@ -55,6 +62,80 @@ HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
 # and store all data that needs to be persisted in future.
 HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'

+IJSON_BUFFER_SIZE = 64 * 1024  # 64KB, default from ijson
+
+
+class KubernetesHighPerformanceNetworkType(enum.Enum):
+    """Enum for different Kubernetes cluster types with high performance
+    network configurations.
+
+    This enum defines cluster types that support optimized networking for
+    distributed ML workloads:
+    - GCP_TCPX: GKE clusters with GPUDirect-TCPX support
+      (A3 High instances: a3-highgpu-8g)
+    - GCP_TCPXO: GKE clusters with GPUDirect-TCPXO support
+      (A3 Mega instances: a3-megagpu-8g)
+    - GCP_GPUDIRECT_RDMA: GKE clusters with GPUDirect-RDMA support
+      (A4/A3 Ultra instances)
+    - NEBIUS: Nebius clusters with InfiniBand support for high-throughput,
+      low-latency networking
+    - COREWEAVE: CoreWeave clusters with InfiniBand support.
+    - NONE: Standard clusters without specialized networking optimizations
+
+    The network configurations align with corresponding VM-based
+    implementations:
+    - GCP settings match
+      sky.provision.gcp.constants.GPU_DIRECT_TCPX_SPECIFIC_OPTIONS
+    - Nebius settings match the InfiniBand configuration used in Nebius VMs
+    """
+
+    GCP_TCPX = 'gcp_tcpx'
+    GCP_TCPXO = 'gcp_tcpxo'
+    GCP_GPUDIRECT_RDMA = 'gcp_gpudirect_rdma'
+    NEBIUS = 'nebius'
+    COREWEAVE = 'coreweave'
+    NONE = 'none'
+
+    def get_network_env_vars(self) -> Dict[str, str]:
+        """Get network environment variables for this cluster type."""
+        if self == KubernetesHighPerformanceNetworkType.NEBIUS:
+            # Nebius cluster with InfiniBand - use InfiniBand optimizations
+            return {
+                'NCCL_IB_HCA': 'mlx5',
+                'UCX_NET_DEVICES': ('mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,'
+                                    'mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1')
+            }
+        elif self == KubernetesHighPerformanceNetworkType.COREWEAVE:
+            return {
+                'NCCL_SOCKET_IFNAME': 'eth0',
+                'NCCL_IB_HCA': 'ibp',
+                'UCX_NET_DEVICES': ('ibp0:1,ibp1:1,ibp2:1,ibp3:1,'
+                                    'ibp4:1,ibp5:1,ibp6:1,ibp7:1')
+            }
+        else:
+            # GCP clusters and generic clusters - environment variables are
+            # handled directly in the template
+            return {}
+
+    def supports_high_performance_networking(self) -> bool:
+        """Check if this cluster type supports high performance networking."""
+        return self is not KubernetesHighPerformanceNetworkType.NONE
+
+    def supports_gpu_direct(self) -> bool:
+        """Check if this cluster type supports GPUDirect networking."""
+        return self in (KubernetesHighPerformanceNetworkType.GCP_TCPX,
+                        KubernetesHighPerformanceNetworkType.GCP_TCPXO,
+                        KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA)
+
+    def requires_ipc_lock_capability(self) -> bool:
+        """Check if this cluster type requires IPC_LOCK capability."""
+        return self.supports_high_performance_networking()
+
+    def requires_tcpxo_daemon(self) -> bool:
+        """Check if this cluster type requires TCPXO daemon."""
+        return self == KubernetesHighPerformanceNetworkType.GCP_TCPXO
+
+
 # TODO(romilb): Move constants to constants.py
 DEFAULT_NAMESPACE = 'default'

```
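
The new KubernetesHighPerformanceNetworkType enum centralizes the per-fabric NCCL/UCX settings. A minimal usage sketch follows (the import path is assumed from this file's location; the template rendering that consumes these values lives elsewhere in the diff):

```python
# Minimal sketch, assuming the enum is importable from this module.
from sky.provision.kubernetes.utils import KubernetesHighPerformanceNetworkType

net = KubernetesHighPerformanceNetworkType.NEBIUS
if net.supports_high_performance_networking():
    # InfiniBand-backed types return explicit NCCL/UCX device lists;
    # the GCP types return {} because their env vars live in the pod template.
    env_vars = net.get_network_env_vars()
    assert env_vars['NCCL_IB_HCA'] == 'mlx5'
# Only GCP_TCPXO needs the extra TCPXO daemon sidecar:
assert not net.requires_tcpxo_daemon()
```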
```diff
@@ -72,12 +153,14 @@ MEMORY_SIZE_UNITS = {
 # The resource keys used by Kubernetes to track NVIDIA GPUs and Google TPUs on
 # nodes. These keys are typically used in the node's status.allocatable
 # or status.capacity fields to indicate the available resources on the node.
-GPU_RESOURCE_KEY = 'nvidia.com/gpu'
+SUPPORTED_GPU_RESOURCE_KEYS = {'amd': 'amd.com/gpu', 'nvidia': 'nvidia.com/gpu'}
 TPU_RESOURCE_KEY = 'google.com/tpu'

 NO_ACCELERATOR_HELP_MESSAGE = (
     'If your cluster contains GPUs or TPUs, make sure '
-    f'{GPU_RESOURCE_KEY} or {TPU_RESOURCE_KEY} resource is available '
+    f'one of {SUPPORTED_GPU_RESOURCE_KEYS["amd"]}, '
+    f'{SUPPORTED_GPU_RESOURCE_KEYS["nvidia"]} or '
+    f'{TPU_RESOURCE_KEY} resource is available '
     'on the nodes and the node labels for identifying GPUs/TPUs '
     '(e.g., skypilot.co/accelerator) are setup correctly. ')

```
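
Replacing the single GPU_RESOURCE_KEY constant with a vendor-keyed dict means every allocatable-resource check must now consider both amd.com/gpu and nvidia.com/gpu. A hypothetical lookup in that style (the actual selection happens in get_gpu_resource_key(context), whose body this excerpt does not show):

```python
# Hypothetical illustration only; the real get_gpu_resource_key(context) may
# pick the vendor differently (e.g. from per-context configuration).
from typing import Dict, Optional

SUPPORTED_GPU_RESOURCE_KEYS = {'amd': 'amd.com/gpu', 'nvidia': 'nvidia.com/gpu'}

def first_gpu_resource_key(allocatable: Dict[str, str]) -> Optional[str]:
    """Return the first vendor GPU resource key a node actually exposes."""
    for key in SUPPORTED_GPU_RESOURCE_KEYS.values():
        if key in allocatable:
            return key
    return None

assert first_gpu_resource_key({'nvidia.com/gpu': '8',
                               'cpu': '96'}) == 'nvidia.com/gpu'
```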
```diff
@@ -131,6 +214,64 @@ DEFAULT_MAX_RETRIES = 3
 DEFAULT_RETRY_INTERVAL_SECONDS = 1


+def normalize_tpu_accelerator_name(accelerator: str) -> Tuple[str, int]:
+    """Normalize TPU names to the k8s-compatible name and extract count."""
+    # Examples:
+    # 'tpu-v6e-8' -> ('tpu-v6e-slice', 8)
+    # 'tpu-v5litepod-4' -> ('tpu-v5-lite-podslice', 4)
+
+    gcp_to_k8s_patterns = [
+        (r'^tpu-v6e-(\d+)$', 'tpu-v6e-slice'),
+        (r'^tpu-v5p-(\d+)$', 'tpu-v5p-slice'),
+        (r'^tpu-v5litepod-(\d+)$', 'tpu-v5-lite-podslice'),
+        (r'^tpu-v5lite-(\d+)$', 'tpu-v5-lite-device'),
+        (r'^tpu-v4-(\d+)$', 'tpu-v4-podslice'),
+    ]
+
+    for pattern, replacement in gcp_to_k8s_patterns:
+        match = re.match(pattern, accelerator)
+        if match:
+            count = int(match.group(1))
+            return replacement, count
+
+    # Default fallback
+    return accelerator, 1
+
+
+def _is_cloudflare_403_error(exception: Exception) -> bool:
+    """Check if an exception is a transient CloudFlare 403 error.
+
+    CloudFlare proxy 403 errors with CF-specific headers are transient and
+    should be retried, unlike real RBAC 403 errors.
+
+    Args:
+        exception: The exception to check
+
+    Returns:
+        True if this is a CloudFlare 403 error that should be retried
+    """
+    if not isinstance(exception, kubernetes.api_exception()):
+        return False
+
+    # Only check for 403 errors
+    if exception.status != 403:
+        return False
+
+    # Check for CloudFlare-specific headers
+    headers = exception.headers if hasattr(exception, 'headers') else {}
+    if not headers:
+        return False
+
+    # CloudFlare errors have CF-RAY header and/or Server: cloudflare
+    for k, v in headers.items():
+        if 'cf-ray' in k.lower():
+            return True
+        if 'server' in k.lower() and 'cloudflare' in str(v).lower():
+            return True
+
+    return False
+
+
 def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
                     retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
                     resource_type: Optional[str] = None):
```
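
normalize_tpu_accelerator_name maps GCP-style TPU names onto GKE's Kubernetes label vocabulary and splits out the chip count. Its expected behavior, following the regex table above (import path assumed):

```python
# Expected input/output pairs implied by the patterns above.
from sky.provision.kubernetes.utils import normalize_tpu_accelerator_name

assert normalize_tpu_accelerator_name('tpu-v6e-8') == ('tpu-v6e-slice', 8)
assert normalize_tpu_accelerator_name('tpu-v5litepod-4') == (
    'tpu-v5-lite-podslice', 4)
assert normalize_tpu_accelerator_name('tpu-v4-16') == ('tpu-v4-podslice', 16)
# Non-TPU names fall through unchanged with a count of 1:
assert normalize_tpu_accelerator_name('nvidia-h100-80gb') == (
    'nvidia-h100-80gb', 1)
```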
```diff
@@ -165,19 +306,25 @@ def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
                         kubernetes.api_exception(),
                         kubernetes.config_exception()) as e:
                     last_exception = e
+
+                    # Check if this is a CloudFlare transient 403 error
+                    is_cloudflare_403 = _is_cloudflare_403_error(e)
+
                     # Don't retry on permanent errors like 401 (Unauthorized)
-                    # or 403 (Forbidden)
+                    # or 403 (Forbidden), unless it's a CloudFlare transient 403
                     if (isinstance(e, kubernetes.api_exception()) and
-                            e.status in (401, 403)):
+                            e.status in (401, 403) and not is_cloudflare_403):
                         # Raise KubeAPIUnreachableError exception so that the
                         # optimizer/provisioner can failover to other clouds.
                         raise exceptions.KubeAPIUnreachableError(
                             f'Kubernetes API error: {str(e)}') from e
                     if attempt < max_retries - 1:
                         sleep_time = backoff.current_backoff()
-                        logger.debug(f'Kubernetes API call {func.__name__} '
-                                     f'failed with {str(e)}. Retrying in '
-                                     f'{sleep_time:.1f}s...')
+                        error_type = 'CloudFlare 403' if is_cloudflare_403 else 'error'
+                        logger.debug(
+                            f'Kubernetes API call {func.__name__} '
+                            f'failed with {error_type} {str(e)}. Retrying in '
+                            f'{sleep_time:.1f}s...')
                         time.sleep(sleep_time)
                         continue

```
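
The retry decorator now makes one exception to its "never retry 401/403" rule: 403s carrying CloudFlare proxy headers are treated as transient. A condensed, hypothetical restatement of that decision:

```python
# Hypothetical condensation of the branch above: decide whether a failed
# Kubernetes API call should fail over immediately or be retried.
from typing import Dict

def fail_over_immediately(status: int, headers: Dict[str, str]) -> bool:
    """True: raise KubeAPIUnreachableError; False: back off and retry."""
    if status not in (401, 403):
        return False  # generic errors go through the backoff/retry loop
    if status == 403:
        for k, v in (headers or {}).items():
            if 'cf-ray' in k.lower():
                return False  # transient CloudFlare 403: retry
            if 'server' in k.lower() and 'cloudflare' in str(v).lower():
                return False
    return True  # genuine 401 / RBAC 403: fail over to other clouds

assert fail_over_immediately(403, {'Server': 'cloudflare'}) is False
assert fail_over_immediately(403, {}) is True
```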
```diff
@@ -287,8 +434,13 @@ def get_gke_accelerator_name(accelerator: str) -> str:
         # A100-80GB, L4, H100-80GB and H100-MEGA-80GB
         # have a different name pattern.
         return 'nvidia-{}'.format(accelerator.lower())
+    elif accelerator == 'H200':
+        # H200s on GCP use this label format
+        return 'nvidia-h200-141gb'
     elif accelerator.startswith('tpu-'):
         return accelerator
+    elif accelerator.startswith('amd-'):
+        return accelerator
     else:
         return 'nvidia-tesla-{}'.format(accelerator.lower())

@@ -342,6 +494,9 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):

     LABEL_KEY = 'gpu.nvidia.com/class'

+    # TODO (kyuds): fill in more label values for different accelerators.
+    ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}
+
     @classmethod
     def get_label_key(cls, accelerator: Optional[str] = None) -> str:
         return cls.LABEL_KEY
@@ -360,7 +515,8 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):

     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:
-        return value
+        # return original label value if not found in mappings.
+        return cls.ACC_VALUE_MAPPINGS.get(value, value)


 class GKELabelFormatter(GPULabelFormatter):
@@ -425,6 +581,10 @@ class GKELabelFormatter(GPULabelFormatter):

         e.g. tpu-v5-lite-podslice:8 -> '2x4'
         """
+        # If the TPU type is in the GKE_TPU_ACCELERATOR_TO_GENERATION, it means
+        # that it has been normalized before, no need to normalize again.
+        if acc_type not in GKE_TPU_ACCELERATOR_TO_GENERATION:
+            acc_type, acc_count = normalize_tpu_accelerator_name(acc_type)
         count_to_topology = cls.GKE_TPU_TOPOLOGIES.get(acc_type,
                                                        {}).get(acc_count, None)
         if count_to_topology is None:
@@ -452,13 +612,26 @@ class GKELabelFormatter(GPULabelFormatter):
                 # we map H100 ---> H100-80GB and keep H100-MEGA-80GB
                 # to distinguish between a3-high and a3-mega instances
                 return 'H100'
+            elif acc == 'H200-141GB':
+                return 'H200'
             return acc
         elif is_tpu_on_gke(value):
             return value
+        elif value == '':
+            # heterogenous cluster may have empty labels for cpu nodes.
+            return ''
         else:
             raise ValueError(
                 f'Invalid accelerator name in GKE cluster: {value}')

+    @classmethod
+    def validate_label_value(cls, value: str) -> Tuple[bool, str]:
+        try:
+            _ = cls.get_accelerator_from_label_value(value)
+            return True, ''
+        except ValueError as e:
+            return False, str(e)
+

 class GFDLabelFormatter(GPULabelFormatter):
     """GPU Feature Discovery label formatter
@@ -563,17 +736,37 @@ def detect_gpu_label_formatter(
         for label, value in node.metadata.labels.items():
             node_labels[node.metadata.name].append((label, value))

-    label_formatter = None
-
+    invalid_label_values: List[Tuple[str, str, str, str]] = []
     # Check if the node labels contain any of the GPU label prefixes
     for lf in LABEL_FORMATTER_REGISTRY:
+        skip = False
         for _, label_list in node_labels.items():
-            for label, _ in label_list:
+            for label, value in label_list:
                 if lf.match_label_key(label):
-                    label_formatter = lf()
-                    return label_formatter, node_labels
+                    # Skip empty label values
+                    if not value or value.strip() == '':
+                        continue
+                    valid, reason = lf.validate_label_value(value)
+                    if valid:
+                        return lf(), node_labels
+                    else:
+                        invalid_label_values.append(
+                            (label, lf.__name__, value, reason))
+                        skip = True
+                        break
+            if skip:
+                break
+        if skip:
+            continue

-    return label_formatter, node_labels
+    for label, lf_name, value, reason in invalid_label_values:
+        logger.warning(f'GPU label {label} matched for label '
+                       f'formatter {lf_name}, '
+                       f'but has invalid value {value}. '
+                       f'Reason: {reason}. '
+                       'Skipping...')
+
+    return None, node_labels


 class Autoscaler:
```
703
896
  return True
704
897
  return False
705
898
 
899
+ @classmethod
900
+ @annotations.lru_cache(scope='request', maxsize=10)
901
+ def get_available_machine_types(cls, context: str) -> List[str]:
902
+ """Returns the list of machine types that are available in the cluster.
903
+ """
904
+ # Assume context naming convention of
905
+ # gke_PROJECT-ID_LOCATION_CLUSTER-NAME
906
+ valid, project_id, location, cluster_name = cls._validate_context_name(
907
+ context)
908
+ if not valid:
909
+ # Context name is not in the format of
910
+ # gke_PROJECT-ID_LOCATION_CLUSTER-NAME.
911
+ # Cannot determine if the context can autoscale.
912
+ # Return empty list.
913
+ logger.debug(f'Context {context} is not in the format of '
914
+ f'gke_PROJECT-ID_LOCATION_CLUSTER-NAME. '
915
+ 'Returning empty machine type list.')
916
+ return []
917
+ try:
918
+ logger.debug(
919
+ f'Attempting to get information about cluster {cluster_name}')
920
+ container_service = gcp.build('container',
921
+ 'v1',
922
+ credentials=None,
923
+ cache_discovery=False)
924
+ cluster = container_service.projects().locations().clusters().get(
925
+ name=f'projects/{project_id}'
926
+ f'/locations/{location}'
927
+ f'/clusters/{cluster_name}').execute()
928
+ except ImportError:
929
+ # If the gcp module is not installed, return empty list.
930
+ # Remind the user once per day to install the gcp module for better
931
+ # pod scheduling with GKE autoscaler.
932
+ if time.time() - cls._pip_install_gcp_hint_last_sent > 60 * 60 * 24:
933
+ logger.info(
934
+ 'Could not fetch autoscaler information from GKE. '
935
+ 'Run pip install "skypilot[gcp]" for more intelligent pod '
936
+ 'scheduling with GKE autoscaler.')
937
+ cls._pip_install_gcp_hint_last_sent = time.time()
938
+ return []
939
+ except gcp.http_error_exception() as e:
940
+ # Cluster information is not available.
941
+ # Return empty list.
942
+ logger.debug(f'{e.message}', exc_info=True)
943
+ return []
944
+
945
+ machine_types = []
946
+ # Get the list of machine types that are available in the cluster.
947
+ node_pools = cluster.get('nodePools', [])
948
+ for node_pool in node_pools:
949
+ name = node_pool.get('name', '')
950
+ logger.debug(f'Checking if node pool {name} '
951
+ 'has autoscaling enabled.')
952
+ autoscaling_enabled = (node_pool.get('autoscaling',
953
+ {}).get('enabled', False))
954
+ if autoscaling_enabled:
955
+ logger.debug(f'Node pool {name} has autoscaling enabled.')
956
+ try:
957
+ machine_type = node_pool.get('config',
958
+ {}).get('machineType', '')
959
+ if machine_type:
960
+ machine_types.append(machine_type)
961
+ except KeyError:
962
+ logger.debug(f'Encountered KeyError while checking machine '
963
+ f'type of node pool {name}.')
964
+ continue
965
+ return machine_types
966
+
706
967
  @classmethod
707
968
  def _validate_context_name(cls, context: str) -> Tuple[bool, str, str, str]:
708
969
  """Validates the context name is in the format of
```diff
@@ -752,6 +1013,8 @@
                 f'checking {node_pool_name} for TPU {requested_acc_type}:'
                 f'{requested_acc_count}')
             if 'resourceLabels' in node_config:
+                requested_acc_type, requested_acc_count = normalize_tpu_accelerator_name(
+                    requested_acc_type)
                 accelerator_exists = cls._node_pool_has_tpu_capacity(
                     node_config['resourceLabels'], machine_type,
                     requested_acc_type, requested_acc_count)
@@ -801,12 +1064,16 @@
         to fit the instance type.
         """
         for accelerator in node_pool_accelerators:
+            raw_value = accelerator['acceleratorType']
             node_accelerator_type = (
-                GKELabelFormatter.get_accelerator_from_label_value(
-                    accelerator['acceleratorType']))
+                GKELabelFormatter.get_accelerator_from_label_value(raw_value))
+            # handle heterogenous nodes.
+            if not node_accelerator_type:
+                continue
             node_accelerator_count = accelerator['acceleratorCount']
-            if node_accelerator_type == requested_gpu_type and int(
-                    node_accelerator_count) >= requested_gpu_count:
+            viable_names = [node_accelerator_type.lower(), raw_value.lower()]
+            if (requested_gpu_type.lower() in viable_names and
+                    int(node_accelerator_count) >= requested_gpu_count):
                 return True
         return False

@@ -869,6 +1136,14 @@ class KarpenterAutoscaler(Autoscaler):
     can_query_backend: bool = False


+class CoreweaveAutoscaler(Autoscaler):
+    """CoreWeave autoscaler
+    """
+
+    label_formatter: Any = CoreWeaveLabelFormatter
+    can_query_backend: bool = False
+
+
 class GenericAutoscaler(Autoscaler):
     """Generic autoscaler
     """
@@ -881,6 +1156,7 @@ class GenericAutoscaler(Autoscaler):
 AUTOSCALER_TYPE_TO_AUTOSCALER = {
     kubernetes_enums.KubernetesAutoscalerType.GKE: GKEAutoscaler,
     kubernetes_enums.KubernetesAutoscalerType.KARPENTER: KarpenterAutoscaler,
+    kubernetes_enums.KubernetesAutoscalerType.COREWEAVE: CoreweaveAutoscaler,
     kubernetes_enums.KubernetesAutoscalerType.GENERIC: GenericAutoscaler,
 }

@@ -894,10 +1170,10 @@ def detect_accelerator_resource(
         context: Optional[str]) -> Tuple[bool, Set[str]]:
     """Checks if the Kubernetes cluster has GPU/TPU resource.

-    Two types of accelerator resources are available which are each checked
-    with nvidia.com/gpu and google.com/tpu. If nvidia.com/gpu resource is
+    Three types of accelerator resources are available which are each checked
+    with amd.com/gpu, nvidia.com/gpu and google.com/tpu. If amd.com/gpu or nvidia.com/gpu resource is
     missing, that typically means that the Kubernetes cluster does not have
-    GPUs or the nvidia GPU operator and/or device drivers are not installed.
+    GPUs or the amd/nvidia GPU operator and/or device drivers are not installed.

     Returns:
         bool: True if the cluster has GPU_RESOURCE_KEY or TPU_RESOURCE_KEY
@@ -908,15 +1184,57 @@ def detect_accelerator_resource(
     nodes = get_kubernetes_nodes(context=context)
     for node in nodes:
         cluster_resources.update(node.status.allocatable.keys())
-    has_accelerator = (get_gpu_resource_key() in cluster_resources or
+    has_accelerator = (get_gpu_resource_key(context) in cluster_resources or
                        TPU_RESOURCE_KEY in cluster_resources)

     return has_accelerator, cluster_resources


+@dataclasses.dataclass
+class V1ObjectMeta:
+    name: str
+    labels: Dict[str, str]
+    namespace: str = ''  # Used for pods, not nodes
+
+
+@dataclasses.dataclass
+class V1NodeAddress:
+    type: str
+    address: str
+
+
+@dataclasses.dataclass
+class V1NodeStatus:
+    allocatable: Dict[str, str]
+    capacity: Dict[str, str]
+    addresses: List[V1NodeAddress]
+
+
+@dataclasses.dataclass
+class V1Node:
+    metadata: V1ObjectMeta
+    status: V1NodeStatus
+
+    @classmethod
+    def from_dict(cls, data: dict) -> 'V1Node':
+        """Create V1Node from a dictionary."""
+        return cls(metadata=V1ObjectMeta(
+            name=data['metadata']['name'],
+            labels=data['metadata'].get('labels', {}),
+        ),
+                   status=V1NodeStatus(
+                       allocatable=data['status']['allocatable'],
+                       capacity=data['status']['capacity'],
+                       addresses=[
+                           V1NodeAddress(type=addr['type'],
+                                         address=addr['address'])
+                           for addr in data['status'].get('addresses', [])
+                       ]))
+
+
 @annotations.lru_cache(scope='request', maxsize=10)
 @_retry_on_error(resource_type='node')
-def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[Any]:
+def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[V1Node]:
     """Gets the kubernetes nodes in the context.

     If context is None, gets the nodes in the current context.
@@ -924,25 +1242,113 @@ def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[Any]:
     if context is None:
         context = get_current_kube_config_context_name()

-    nodes = kubernetes.core_api(context).list_node(
-        _request_timeout=kubernetes.API_TIMEOUT).items
+    # Return raw urllib3.HTTPResponse object so that we can parse the json
+    # more efficiently.
+    response = kubernetes.core_api(context).list_node(
+        _request_timeout=kubernetes.API_TIMEOUT, _preload_content=False)
+    try:
+        nodes = [
+            V1Node.from_dict(item_dict) for item_dict in ijson.items(
+                response, 'items.item', buf_size=IJSON_BUFFER_SIZE)
+        ]
+    finally:
+        response.release_conn()
+
     return nodes


-@_retry_on_error(resource_type='pod')
-def get_all_pods_in_kubernetes_cluster(*,
-                                       context: Optional[str] = None
-                                      ) -> List[Any]:
-    """Gets pods in all namespaces in kubernetes cluster indicated by context.
+@dataclasses.dataclass
+class V1PodStatus:
+    phase: str
+

-    Used for computing cluster resource usage.
+@dataclasses.dataclass
+class V1ResourceRequirements:
+    requests: Optional[Dict[str, str]]
+
+
+@dataclasses.dataclass
+class V1Container:
+    resources: V1ResourceRequirements
+
+
+@dataclasses.dataclass
+class V1PodSpec:
+    containers: List[V1Container]
+    node_name: Optional[str]
+
+
+@dataclasses.dataclass
+class V1Pod:
+    metadata: V1ObjectMeta
+    status: V1PodStatus
+    spec: V1PodSpec
+
+    @classmethod
+    def from_dict(cls, data: dict) -> 'V1Pod':
+        """Create V1Pod from a dictionary."""
+        return cls(metadata=V1ObjectMeta(
+            name=data['metadata']['name'],
+            labels=data['metadata'].get('labels', {}),
+            namespace=data['metadata'].get('namespace'),
+        ),
+                   status=V1PodStatus(phase=data['status'].get('phase'),),
+                   spec=V1PodSpec(
+                       node_name=data['spec'].get('nodeName'),
+                       containers=[
+                           V1Container(resources=V1ResourceRequirements(
+                               requests=container.get('resources', {}).get(
+                                   'requests') or None))
+                           for container in data['spec'].get('containers', [])
+                       ]))
+
+
+@_retry_on_error(resource_type='pod')
+def get_allocated_gpu_qty_by_node(
+    *,
+    context: Optional[str] = None,
+) -> Dict[str, int]:
+    """Gets allocated GPU quantity by each node by fetching pods in
+    all namespaces in kubernetes cluster indicated by context.
     """
     if context is None:
         context = get_current_kube_config_context_name()
+    non_included_pod_statuses = POD_STATUSES.copy()
+    status_filters = ['Running', 'Pending']
+    if status_filters is not None:
+        non_included_pod_statuses -= set(status_filters)
+    field_selector = ','.join(
+        [f'status.phase!={status}' for status in non_included_pod_statuses])

-    pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
-        _request_timeout=kubernetes.API_TIMEOUT).items
-    return pods
+    # Return raw urllib3.HTTPResponse object so that we can parse the json
+    # more efficiently.
+    response = kubernetes.core_api(context).list_pod_for_all_namespaces(
+        _request_timeout=kubernetes.API_TIMEOUT,
+        _preload_content=False,
+        field_selector=field_selector)
+    try:
+        allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+        for item_dict in ijson.items(response,
+                                     'items.item',
+                                     buf_size=IJSON_BUFFER_SIZE):
+            pod = V1Pod.from_dict(item_dict)
+            if should_exclude_pod_from_gpu_allocation(pod):
+                logger.debug(
+                    f'Excluding pod {pod.metadata.name} from GPU count '
+                    f'calculations on node {pod.spec.node_name}')
+                continue
+            # Iterate over all the containers in the pod and sum the
+            # GPU requests
+            pod_allocated_qty = 0
+            for container in pod.spec.containers:
+                if container.resources.requests:
+                    pod_allocated_qty += get_node_accelerator_count(
+                        context, container.resources.requests)
+            if pod_allocated_qty > 0 and pod.spec.node_name:
+                allocated_qty_by_node[pod.spec.node_name] += pod_allocated_qty
+        return allocated_qty_by_node
+    finally:
+        response.release_conn()


 def check_instance_fits(context: Optional[str],
```
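
Both listing paths now request the raw HTTP response (_preload_content=False) and stream-parse it with ijson, materializing only the slim V1Node/V1Pod dataclasses instead of the Kubernetes client's full object model; on clusters with thousands of nodes or pods this keeps peak memory roughly constant. A self-contained sketch of the same pattern over an in-memory payload:

```python
# Self-contained sketch of the ijson streaming pattern used above.
import io
import json

import ijson  # pip install ijson

payload = json.dumps({
    'items': [{'metadata': {'name': f'node-{i}'}} for i in range(3)]
}).encode()

# 'items.item' yields each element of the top-level "items" array one at a
# time, so a multi-megabyte List response is never decoded in one piece.
names = [
    item['metadata']['name'] for item in ijson.items(
        io.BytesIO(payload), 'items.item', buf_size=64 * 1024)
]
assert names == ['node-0', 'node-1', 'node-2']
```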
@@ -991,7 +1397,7 @@ def check_instance_fits(context: Optional[str],
991
1397
  'Maximum resources found on a single node: '
992
1398
  f'{max_cpu} CPUs, {common_utils.format_float(max_mem)}G Memory')
993
1399
 
994
- def check_tpu_fits(candidate_instance_type: 'KubernetesInstanceType',
1400
+ def check_tpu_fits(acc_type: str, acc_count: int,
995
1401
  node_list: List[Any]) -> Tuple[bool, Optional[str]]:
996
1402
  """Checks if the instance fits on the cluster based on requested TPU.
997
1403
 
@@ -1001,8 +1407,6 @@ def check_instance_fits(context: Optional[str],
1001
1407
  node (node_tpu_chip_count) and the total TPU chips across the entire
1002
1408
  podslice (topology_chip_count) are correctly handled.
1003
1409
  """
1004
- acc_type = candidate_instance_type.accelerator_type
1005
- acc_count = candidate_instance_type.accelerator_count
1006
1410
  tpu_list_in_cluster = []
1007
1411
  for node in node_list:
1008
1412
  if acc_type == node.metadata.labels[
@@ -1053,14 +1457,15 @@ def check_instance_fits(context: Optional[str],
1053
1457
  if is_tpu_on_gke(acc_type):
1054
1458
  # If requested accelerator is a TPU type, check if the cluster
1055
1459
  # has sufficient TPU resource to meet the requirement.
1056
- fits, reason = check_tpu_fits(k8s_instance_type, gpu_nodes)
1460
+ acc_type, acc_count = normalize_tpu_accelerator_name(acc_type)
1461
+ fits, reason = check_tpu_fits(acc_type, acc_count, gpu_nodes)
1057
1462
  if reason is not None:
1058
1463
  return fits, reason
1059
1464
  else:
1060
1465
  # Check if any of the GPU nodes have sufficient number of GPUs.
1061
1466
  gpu_nodes = [
1062
- node for node in gpu_nodes if
1063
- get_node_accelerator_count(node.status.allocatable) >= acc_count
1467
+ node for node in gpu_nodes if get_node_accelerator_count(
1468
+ context, node.status.allocatable) >= acc_count
1064
1469
  ]
1065
1470
  if not gpu_nodes:
1066
1471
  return False, (
@@ -1122,14 +1527,14 @@ def get_accelerator_label_key_values(
1122
1527
  Raises:
1123
1528
  ResourcesUnavailableError: Can be raised from the following conditions:
1124
1529
  - The cluster does not have GPU/TPU resources
1125
- (nvidia.com/gpu, google.com/tpu)
1530
+ (amd.com/gpu, nvidia.com/gpu, google.com/tpu)
1126
1531
  - The cluster has GPU/TPU resources, but no node in the cluster has
1127
1532
  an accelerator label.
1128
1533
  - The cluster has a node with an invalid accelerator label value.
1129
1534
  - The cluster doesn't have any nodes with acc_type GPU/TPU
1130
1535
  """
1131
1536
  # Check if the cluster has GPU resources
1132
- # TODO(romilb): This assumes the accelerator is a nvidia GPU. We
1537
+ # TODO(romilb): This assumes the accelerator is a amd/nvidia GPU. We
1133
1538
  # need to support TPUs and other accelerators as well.
1134
1539
  # TODO(romilb): Currently, we broadly disable all GPU checks if autoscaling
1135
1540
  # is configured in config.yaml since the cluster may be scaling up from
@@ -1137,7 +1542,16 @@ def get_accelerator_label_key_values(
  # support polling the clusters for autoscaling information, such as the
  # node pools configured etc.

- autoscaler_type = get_autoscaler_type()
+ is_ssh_node_pool = context.startswith('ssh-') if context else False
+ cloud_name = 'SSH Node Pool' if is_ssh_node_pool else 'Kubernetes cluster'
+ context_display_name = common_utils.removeprefix(
+ context, 'ssh-') if (context and is_ssh_node_pool) else context
+
+ autoscaler_type = skypilot_config.get_effective_region_config(
+ cloud='kubernetes',
+ region=context,
+ keys=('autoscaler',),
+ default_value=None)
  if autoscaler_type is not None:
  # If autoscaler is set in config.yaml, override the label key and value
  # to the autoscaler's format and bypass the GPU checks.
@@ -1146,7 +1560,8 @@ def get_accelerator_label_key_values(
  # early since we assume the cluster autoscaler will handle GPU
  # node provisioning.
  return None, None, None, None
- autoscaler = AUTOSCALER_TYPE_TO_AUTOSCALER.get(autoscaler_type)
+ autoscaler = AUTOSCALER_TYPE_TO_AUTOSCALER.get(
+ kubernetes_enums.KubernetesAutoscalerType(autoscaler_type))
  assert autoscaler is not None, ('Unsupported autoscaler type:'
  f' {autoscaler_type}')
  formatter = autoscaler.label_formatter
@@ -1176,13 +1591,17 @@ def get_accelerator_label_key_values(
  suffix = ''
  if env_options.Options.SHOW_DEBUG_INFO.get():
  suffix = f' Found node labels: {node_labels}'
- raise exceptions.ResourcesUnavailableError(
- 'Could not detect GPU labels in Kubernetes cluster. '
- 'If this cluster has GPUs, please ensure GPU nodes have '
- 'node labels of either of these formats: '
- f'{supported_formats}. Please refer to '
- 'the documentation on how to set up node labels.'
- f'{suffix}')
+ msg = (f'Could not detect GPU labels in {cloud_name}.')
+ if not is_ssh_node_pool:
+ msg += (' Run `sky check ssh` to debug.')
+ else:
+ msg += (
+ ' If this cluster has GPUs, please ensure GPU nodes have '
+ 'node labels of either of these formats: '
+ f'{supported_formats}. Please refer to '
+ 'the documentation on how to set up node labels.')
+ msg += f'{suffix}'
+ raise exceptions.ResourcesUnavailableError(msg)
  else:
  # Validate the label value on all nodes labels to ensure they are
  # correctly setup and will behave as expected.
@@ -1193,7 +1612,7 @@ def get_accelerator_label_key_values(
  value)
  if not is_valid:
  raise exceptions.ResourcesUnavailableError(
- f'Node {node_name!r} in Kubernetes cluster has '
+ f'Node {node_name!r} in {cloud_name} has '
  f'invalid GPU label: {label}={value}. {reason}')
  if check_mode:
  # If check mode is enabled and we reached so far, we can
@@ -1212,9 +1631,13 @@ def get_accelerator_label_key_values(
  if is_multi_host_tpu(node_metadata_labels):
  continue
  for label, value in label_list:
- if (label_formatter.match_label_key(label) and
- label_formatter.get_accelerator_from_label_value(
- value).lower() == acc_type.lower()):
+ if label_formatter.match_label_key(label):
+ # match either canonicalized name or raw name
+ accelerator = (label_formatter.
+ get_accelerator_from_label_value(value))
+ viable = [value.lower(), accelerator.lower()]
+ if acc_type.lower() not in viable:
+ continue
  if is_tpu_on_gke(acc_type):
  assert isinstance(label_formatter,
  GKELabelFormatter)
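
The loosened match accepts either the raw label value or its canonicalized accelerator name. A small sketch, assuming a GKE-style formatter that canonicalizes `nvidia-tesla-v100` to `V100` (the exact mapping lives in the label formatter classes):

    # Sketch of the two-way match added above; canonicalization is assumed.
    def label_value_matches(acc_type, raw_value, canonical):
        # e.g. raw_value='nvidia-tesla-v100', canonical='V100'
        viable = [raw_value.lower(), canonical.lower()]
        return acc_type.lower() in viable

    assert label_value_matches('V100', 'nvidia-tesla-v100', 'V100')
    assert label_value_matches('nvidia-tesla-v100', 'nvidia-tesla-v100', 'V100')
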
@@ -1257,10 +1680,10 @@ def get_accelerator_label_key_values(
  # TODO(Doyoung): Update the error message raised with the
  # multi-host TPU support.
  raise exceptions.ResourcesUnavailableError(
- 'Could not find any node in the Kubernetes cluster '
+ f'Could not find any node in the {cloud_name} '
  f'with {acc_type}. Please ensure at least one node in the '
  f'cluster has {acc_type} and node labels are setup '
- 'correctly. Please refer to the documentration for more. '
+ 'correctly. Please refer to the documentation for more. '
  f'{suffix}. Note that multi-host TPU podslices are '
  'currently not supported.')
  else:
@@ -1270,15 +1693,27 @@ def get_accelerator_label_key_values(
  if env_options.Options.SHOW_DEBUG_INFO.get():
  suffix = (' Available resources on the cluster: '
  f'{cluster_resources}')
- raise exceptions.ResourcesUnavailableError(
- f'Could not detect GPU/TPU resources ({GPU_RESOURCE_KEY!r} or '
- f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
- ' contains GPUs, please ensure GPU drivers are installed on '
- 'the node. Check if the GPUs are setup correctly by running '
- '`kubectl describe nodes` and looking for the '
- f'{GPU_RESOURCE_KEY!r} or {TPU_RESOURCE_KEY!r} resource. '
- 'Please refer to the documentation on how to set up GPUs.'
- f'{suffix}')
+ if is_ssh_node_pool:
+ msg = (
+ f'Could not detect GPUs in SSH Node Pool '
+ f'\'{context_display_name}\'. If this cluster contains '
+ 'GPUs, please ensure GPU drivers are installed on the node '
+ 'and re-run '
+ f'`sky ssh up --infra {context_display_name}`. {suffix}')
+ else:
+ msg = (
+ f'Could not detect GPU/TPU resources ({SUPPORTED_GPU_RESOURCE_KEYS["amd"]!r}, '
+ f'{SUPPORTED_GPU_RESOURCE_KEYS["nvidia"]!r} or '
+ f'{TPU_RESOURCE_KEY!r}) in Kubernetes cluster. If this cluster'
+ ' contains GPUs, please ensure GPU drivers are installed on '
+ 'the node. Check if the GPUs are setup correctly by running '
+ '`kubectl describe nodes` and looking for the '
+ f'{SUPPORTED_GPU_RESOURCE_KEYS["amd"]!r}, '
+ f'{SUPPORTED_GPU_RESOURCE_KEYS["nvidia"]!r} or '
+ f'{TPU_RESOURCE_KEY!r} resource. '
+ 'Please refer to the documentation on how to set up GPUs.'
+ f'{suffix}')
+ raise exceptions.ResourcesUnavailableError(msg)
  assert False, 'This should not be reached'


@@ -1302,23 +1737,6 @@ def get_port(svc_name: str, namespace: str, context: Optional[str]) -> int:
  return head_service.spec.ports[0].node_port


- def get_external_ip(network_mode: Optional[
- kubernetes_enums.KubernetesNetworkingMode], context: Optional[str]) -> str:
- if network_mode == kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD:
- return '127.0.0.1'
- # Return the IP address of the first node with an external IP
- nodes = kubernetes.core_api(context).list_node().items
- for node in nodes:
- if node.status.addresses:
- for address in node.status.addresses:
- if address.type == 'ExternalIP':
- return address.address
- # If no external IP is found, use the API server IP
- api_host = kubernetes.core_api(context).api_client.configuration.host
- parsed_url = urlparse(api_host)
- return parsed_url.hostname
-
-
  def check_credentials(context: Optional[str],
  timeout: int = kubernetes.API_TIMEOUT,
  run_optional_checks: bool = False) -> \
@@ -1337,7 +1755,10 @@ def check_credentials(context: Optional[str],
  try:
  namespace = get_kube_config_context_namespace(context)
  kubernetes.core_api(context).list_namespaced_pod(
- namespace, _request_timeout=timeout)
+ namespace, limit=1, _request_timeout=timeout)
+ # This call is "free" because this function is a cached call,
+ # and it will not be called again in this function.
+ get_kubernetes_nodes(context=context)
  except ImportError:
  # TODO(romilb): Update these error strs to also include link to docs
  # when docs are ready.
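
The `limit=1` keeps this credential probe cheap: the API server returns at most one pod instead of the full namespace listing. A minimal standalone equivalent using the official client (assumes a loadable kubeconfig; namespace and timeout are illustrative):

    from kubernetes import client, config

    config.load_kube_config()
    v1 = client.CoreV1Api()
    # A one-item page is enough to prove connectivity and list permission.
    v1.list_namespaced_pod('default', limit=1, _request_timeout=10)
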
@@ -1365,7 +1786,7 @@ def check_credentials(context: Optional[str],
  # Check if $KUBECONFIG envvar consists of multiple paths. We run this before
  # optional checks.
  try:
- _ = _get_kubeconfig_path()
+ _ = get_kubeconfig_paths()
  except ValueError as e:
  return False, f'{common_utils.format_exception(e, use_bracket=True)}'

@@ -1423,50 +1844,191 @@ def check_credentials(context: Optional[str],
  return True, None


+ class PodValidator:
+ """Validates Kubernetes pod configs against the OpenAPI spec.
+
+ Adapted from kubernetes.client.ApiClient:
+ https://github.com/kubernetes-client/python/blob/0c56ef1c8c4b50087bc7b803f6af896fb973309e/kubernetes/client/api_client.py#L33
+
+ We needed to adapt it because the original implementation ignores
+ unknown fields, whereas we want to raise an error so that users
+ are aware of the issue.
+ """
+ PRIMITIVE_TYPES = (int, float, bool, str)
+ NATIVE_TYPES_MAPPING = {
+ 'int': int,
+ 'float': float,
+ 'str': str,
+ 'bool': bool,
+ 'date': datetime.date,
+ 'datetime': datetime.datetime,
+ 'object': object,
+ }
+
+ @classmethod
+ def validate(cls, data):
+ return cls.__validate(data, kubernetes.models.V1Pod)
+
+ @classmethod
+ def __validate(cls, data, klass):
+ """Deserializes dict, list, str into an object.
+
+ :param data: dict, list or str.
+ :param klass: class literal, or string of class name.
+
+ :return: object.
+ """
+ if data is None:
+ return None
+
+ if isinstance(klass, str):
+ if klass.startswith('list['):
+ sub_kls = re.match(r'list\[(.*)\]', klass).group(1)
+ return [cls.__validate(sub_data, sub_kls) for sub_data in data]
+
+ if klass.startswith('dict('):
+ sub_kls = re.match(r'dict\(([^,]*), (.*)\)', klass).group(2)
+ return {k: cls.__validate(v, sub_kls) for k, v in data.items()}
+
+ # convert str to class
+ if klass in cls.NATIVE_TYPES_MAPPING:
+ klass = cls.NATIVE_TYPES_MAPPING[klass]
+ else:
+ klass = getattr(kubernetes.models, klass)
+
+ if klass in cls.PRIMITIVE_TYPES:
+ return cls.__validate_primitive(data, klass)
+ elif klass == object:
+ return cls.__validate_object(data)
+ elif klass == datetime.date:
+ return cls.__validate_date(data)
+ elif klass == datetime.datetime:
+ return cls.__validate_datetime(data)
+ else:
+ return cls.__validate_model(data, klass)
+
+ @classmethod
+ def __validate_primitive(cls, data, klass):
+ """Deserializes string to primitive type.
+
+ :param data: str.
+ :param klass: class literal.
+
+ :return: int, long, float, str, bool.
+ """
+ try:
+ return klass(data)
+ except UnicodeEncodeError:
+ return str(data)
+ except TypeError:
+ return data
+
+ @classmethod
+ def __validate_object(cls, value):
+ """Return an original value.
+
+ :return: object.
+ """
+ return value
+
+ @classmethod
+ def __validate_date(cls, string):
+ """Deserializes string to date.
+
+ :param string: str.
+ :return: date.
+ """
+ try:
+ return kubernetes.dateutil_parser.parse(string).date()
+ except ValueError as exc:
+ raise ValueError(
+ f'Failed to parse `{string}` as date object') from exc
+
+ @classmethod
+ def __validate_datetime(cls, string):
+ """Deserializes string to datetime.
+
+ The string should be in iso8601 datetime format.
+
+ :param string: str.
+ :return: datetime.
+ """
+ try:
+ return kubernetes.dateutil_parser.parse(string)
+ except ValueError as exc:
+ raise ValueError(
+ f'Failed to parse `{string}` as datetime object') from exc
+
+ @classmethod
+ def __validate_model(cls, data, klass):
+ """Deserializes list or dict to model.
+
+ :param data: dict, list.
+ :param klass: class literal.
+ :return: model object.
+ """
+
+ if not klass.openapi_types and not hasattr(klass,
+ 'get_real_child_model'):
+ return data
+
+ kwargs = {}
+ try:
+ if (data is not None and klass.openapi_types is not None and
+ isinstance(data, (list, dict))):
+ # attribute_map is a dict that maps field names in snake_case
+ # to camelCase.
+ reverse_attribute_map = {
+ v: k for k, v in klass.attribute_map.items()
+ }
+ for k, v in data.items():
+ field_name = reverse_attribute_map.get(k, None)
+ if field_name is None:
+ raise ValueError(
+ f'Unknown field `{k}`. Please ensure '
+ 'pod_config follows the Kubernetes '
+ 'Pod schema: '
+ 'https://github.com/kubernetes/kubernetes/blob/master/api/openapi-spec/v3/api__v1_openapi.json'
+ )
+ kwargs[field_name] = cls.__validate(
+ v, klass.openapi_types[field_name])
+ except exceptions.KubernetesValidationError as e:
+ raise exceptions.KubernetesValidationError([k] + e.path,
+ str(e)) from e
+ except Exception as e:
+ raise exceptions.KubernetesValidationError([k], str(e)) from e
+
+ instance = klass(**kwargs)
+
+ if hasattr(instance, 'get_real_child_model'):
+ klass_name = instance.get_real_child_model(data)
+ if klass_name:
+ instance = cls.__validate(data, klass_name)
+ return instance
+
  def check_pod_config(pod_config: dict) \
  -> Tuple[bool, Optional[str]]:
- """Check if the pod_config is a valid pod config
+ """Check if the pod_config is a valid pod config.

- Using deserialize api to check the pod_config is valid or not.
+ Uses the deserialize API from the kubernetes client library.
+
+ This is a client-side validation, meant to catch common errors like
+ unknown/misspelled fields, and missing required fields.
+
+ The full validation however is done later on by the Kubernetes API server
+ when the pod creation request is sent.

  Returns:
  bool: True if pod_config is valid.
  str: Error message about why the pod_config is invalid, None otherwise.
  """
- errors = []
- # This api_client won't be used to send any requests, so there is no need to
- # load kubeconfig
- api_client = kubernetes.kubernetes.client.ApiClient()
-
- # Used for kubernetes api_client deserialize function, the function will use
- # data attr, the detail ref:
- # https://github.com/kubernetes-client/python/blob/master/kubernetes/client/api_client.py#L244
- class InnerResponse():
-
- def __init__(self, data: dict):
- self.data = json.dumps(data)
-
  try:
- # Validate metadata if present
- if 'metadata' in pod_config:
- try:
- value = InnerResponse(pod_config['metadata'])
- api_client.deserialize(
- value, kubernetes.kubernetes.client.V1ObjectMeta)
- except ValueError as e:
- errors.append(f'Invalid metadata: {str(e)}')
- # Validate spec if present
- if 'spec' in pod_config:
- try:
- value = InnerResponse(pod_config['spec'])
- api_client.deserialize(value,
- kubernetes.kubernetes.client.V1PodSpec)
- except ValueError as e:
- errors.append(f'Invalid spec: {str(e)}')
- return len(errors) == 0, '.'.join(errors)
+ PodValidator.validate(pod_config)
+ except exceptions.KubernetesValidationError as e:
+ return False, f'Validation error in {".".join(e.path)}: {str(e)}'
  except Exception as e: # pylint: disable=broad-except
- errors.append(f'Validation error: {str(e)}')
- return False, '.'.join(errors)
+ return False, f'Unexpected error: {str(e)}'
+ return True, None
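
Illustrative use of the rewritten check; the exact error wording comes from PodValidator, so the comment below paraphrases it:

    ok, reason = check_pod_config({
        'metadata': {'labels': {'team': 'ml'}},
        'spec': {'containerss': []},  # misspelled 'containers'
    })
    # ok is False; reason names the unknown field, roughly:
    # "Validation error in spec: Unknown field `containerss`. ..."
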
 
  def is_kubeconfig_exec_auth(
@@ -1507,7 +2069,7 @@ def is_kubeconfig_exec_auth(
  return False, None

  # Get active context and user from kubeconfig using k8s api
- all_contexts, current_context = k8s.config.list_kube_config_contexts()
+ all_contexts, current_context = kubernetes.list_kube_config_contexts()
  context_obj = current_context
  if context is not None:
  for c in all_contexts:
@@ -1518,33 +2080,31 @@ def is_kubeconfig_exec_auth(
  raise ValueError(f'Kubernetes context {context!r} not found.')
  target_username = context_obj['context']['user']

- # K8s api does not provide a mechanism to get the user details from the
- # context. We need to load the kubeconfig file and parse it to get the
- # user details.
- kubeconfig_path = _get_kubeconfig_path()
-
- # Load the kubeconfig file as a dictionary
- with open(kubeconfig_path, 'r', encoding='utf-8') as f:
- kubeconfig = yaml.safe_load(f)
+ # Load the kubeconfig for the context
+ kubeconfig_text = _get_kubeconfig_text_for_context(context)
+ kubeconfig = yaml_utils.safe_load(kubeconfig_text)

+ # Get the user details
  user_details = kubeconfig['users']

  # Find user matching the target username
  user_details = next(
  user for user in user_details if user['name'] == target_username)

- remote_identity = skypilot_config.get_nested(
- ('kubernetes', 'remote_identity'),
- schemas.get_default_remote_identity('kubernetes'))
+ remote_identity = skypilot_config.get_effective_region_config(
+ cloud='kubernetes',
+ region=context,
+ keys=('remote_identity',),
+ default_value=schemas.get_default_remote_identity('kubernetes'))
  if ('exec' in user_details.get('user', {}) and remote_identity
  == schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value):
  ctx_name = context_obj['name']
  exec_msg = ('exec-based authentication is used for '
- f'Kubernetes context {ctx_name!r}.'
- ' This may cause issues with autodown or when running '
- 'Managed Jobs or SkyServe controller on Kubernetes. '
- 'To fix, configure SkyPilot to create a service account '
- 'for running pods by setting the following in '
+ f'Kubernetes context {ctx_name!r}. '
+ 'Make sure that the corresponding cloud provider is '
+ 'also enabled through `sky check` (e.g.: GCP for GKE). '
+ 'Alternatively, configure SkyPilot to create a service '
+ 'account for running pods by setting the following in '
  '~/.sky/config.yaml:\n'
  ' kubernetes:\n'
  ' remote_identity: SERVICE_ACCOUNT\n'
@@ -1554,6 +2114,33 @@ def is_kubeconfig_exec_auth(
  return False, None


+ def _get_kubeconfig_text_for_context(context: Optional[str] = None) -> str:
+ """Get the kubeconfig text for the given context.
+
+ The kubeconfig might span multiple files; this function uses kubectl to
+ handle the merging automatically.
+ """
+ command = 'kubectl config view --minify'
+ if context is not None:
+ command += f' --context={context}'
+
+ # Ensure subprocess inherits the current environment properly
+ # This fixes the issue where kubectl can't find ~/.kube/config in API server context
+ env = os.environ.copy()
+
+ proc = subprocess.run(command,
+ shell=True,
+ check=False,
+ env=env,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.PIPE)
+ if proc.returncode != 0:
+ raise RuntimeError(
+ f'Failed to get kubeconfig text for context {context}: {proc.stderr.decode("utf-8")}'
+ )
+ return proc.stdout.decode('utf-8')
+
+
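
A hedged usage sketch: `KUBECONFIG` may point at several files, which kubectl merges before `--minify` trims the result down to the one requested context (paths and context name below are illustrative):

    import os
    os.environ['KUBECONFIG'] = '/home/user/.kube/config:/tmp/extra-config'
    text = _get_kubeconfig_text_for_context('my-context')
    # `text` is a self-contained kubeconfig for just 'my-context',
    # including the matching 'users' entry that the caller parses.
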
  @annotations.lru_cache(scope='request')
  def get_current_kube_config_context_name() -> Optional[str]:
  """Get the current kubernetes context from the kubeconfig file
@@ -1563,7 +2150,7 @@ def get_current_kube_config_context_name() -> Optional[str]:
  """
  k8s = kubernetes.kubernetes
  try:
- _, current_context = k8s.config.list_kube_config_contexts()
+ _, current_context = kubernetes.list_kube_config_contexts()
  return current_context['name']
  except k8s.config.config_exception.ConfigException:
  return None
@@ -1599,7 +2186,7 @@ def get_all_kube_context_names() -> List[str]:
  k8s = kubernetes.kubernetes
  context_names = []
  try:
- all_contexts, _ = k8s.config.list_kube_config_contexts()
+ all_contexts, _ = kubernetes.list_kube_config_contexts()
  # all_contexts will always have at least one context. If kubeconfig
  # does not have any contexts defined, it will raise ConfigException.
  context_names = [context['name'] for context in all_contexts]
@@ -1642,7 +2229,7 @@ def get_kube_config_context_namespace(
  return f.read().strip()
  # If not in-cluster, get the namespace from kubeconfig
  try:
- contexts, current_context = k8s.config.list_kube_config_contexts()
+ contexts, current_context = kubernetes.list_kube_config_contexts()
  if context_name is None:
  context = current_context
  else:
@@ -1659,6 +2246,15 @@ def get_kube_config_context_namespace(
  return DEFAULT_NAMESPACE


+ def parse_cpu_or_gpu_resource_to_float(resource_str: str) -> float:
+ if not resource_str:
+ return 0.0
+ if resource_str[-1] == 'm':
+ return float(resource_str[:-1]) / 1000
+ else:
+ return float(resource_str)
+
+
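
Kubernetes quantities may carry the milli suffix, so '500m' CPUs is half a core. Expected behavior of the helper above:

    assert parse_cpu_or_gpu_resource_to_float('500m') == 0.5
    assert parse_cpu_or_gpu_resource_to_float('2') == 2.0
    assert parse_cpu_or_gpu_resource_to_float('') == 0.0
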
  def parse_cpu_or_gpu_resource(resource_qty_str: str) -> Union[int, float]:
  resource_str = str(resource_qty_str)
  if resource_str[-1] == 'm':
@@ -1736,9 +2332,16 @@ class KubernetesInstanceType:
  @staticmethod
  def is_valid_instance_type(name: str) -> bool:
  """Returns whether the given name is a valid instance type."""
+ # Before https://github.com/skypilot-org/skypilot/pull/4756,
+ # the accelerators are appended with format "--{a}{type}",
+ # e.g. "4CPU--16GB--1V100".
+ # Check both patterns to keep backward compatibility.
+ # TODO(romilb): Backward compatibility, remove after 0.11.0.
+ prev_pattern = re.compile(
+ r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--\d+\S+)?$')
  pattern = re.compile(
  r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--[\w\d-]+:\d+)?$')
- return bool(pattern.match(name))
+ return bool(pattern.match(name)) or bool(prev_pattern.match(name))
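
Both name formats should now validate — the current `type:count` suffix and the older count-prefix form:

    # New format: accelerator suffix is '--<type>:<count>'.
    assert KubernetesInstanceType.is_valid_instance_type('4CPU--16GB--V100:1')
    # Old format (kept for backward compatibility): '--<count><type>'.
    assert KubernetesInstanceType.is_valid_instance_type('4CPU--16GB--1V100')
    # No accelerator suffix is also valid.
    assert KubernetesInstanceType.is_valid_instance_type('2CPU--8GB')
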
 
  @classmethod
  def _parse_instance_type(
@@ -1755,6 +2358,11 @@ class KubernetesInstanceType:
  r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_type>[\w\d-]+):(?P<accelerator_count>\d+))?$' # pylint: disable=line-too-long
  )
  match = pattern.match(name)
+ # TODO(romilb): Backward compatibility, remove after 0.11.0.
+ prev_pattern = re.compile(
+ r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_count>\d+)(?P<accelerator_type>\S+))?$' # pylint: disable=line-too-long
+ )
+ prev_match = prev_pattern.match(name)
  if match:
  cpus = float(match.group('cpus'))
  memory = float(match.group('memory'))
@@ -1762,9 +2370,20 @@ class KubernetesInstanceType:
  accelerator_type = match.group('accelerator_type')
  if accelerator_count:
  accelerator_count = int(accelerator_count)
- # This is to revert the accelerator types with spaces back to
- # the original format.
- accelerator_type = str(accelerator_type).replace('_', ' ')
+ accelerator_type = str(accelerator_type)
+ else:
+ accelerator_count = None
+ accelerator_type = None
+ return cpus, memory, accelerator_count, accelerator_type
+ # TODO(romilb): Backward compatibility, remove after 0.11.0.
+ elif prev_match:
+ cpus = float(prev_match.group('cpus'))
+ memory = float(prev_match.group('memory'))
+ accelerator_count = prev_match.group('accelerator_count')
+ accelerator_type = prev_match.group('accelerator_type')
+ if accelerator_count:
+ accelerator_count = int(accelerator_count)
+ accelerator_type = str(accelerator_type)
  else:
  accelerator_count = None
  accelerator_type = None
@@ -1841,16 +2460,14 @@ def construct_ssh_jump_command(


  def get_ssh_proxy_command(
- k8s_ssh_target: str,
- network_mode: kubernetes_enums.KubernetesNetworkingMode,
+ pod_name: str,
  private_key_path: str,
  context: Optional[str],
  namespace: str,
  ) -> str:
  """Generates the SSH proxy command to connect to the pod.

- Uses a jump pod if the network mode is NODEPORT, and direct port-forwarding
- if the network mode is PORTFORWARD.
+ Uses direct port-forwarding.

  By default, establishing an SSH connection creates a communication
  channel to a remote node by setting up a TCP connection. When a
@@ -1861,17 +2478,8 @@ def get_ssh_proxy_command(
  Pods within a Kubernetes cluster have internal IP addresses that are
  typically not accessible from outside the cluster. Since the default TCP
  connection of SSH won't allow access to these pods, we employ a
- ProxyCommand to establish the required communication channel. We offer this
- in two different networking options: NodePort/port-forward.
+ ProxyCommand to establish the required communication channel.

- With the NodePort networking mode, a NodePort service is launched. This
- service opens an external port on the node which redirects to the desired
- port to a SSH jump pod. When establishing an SSH session in this mode, the
- ProxyCommand makes use of this external port to create a communication
- channel directly to port 22, which is the default port ssh server listens
- on, of the jump pod.
-
- With Port-forward mode, instead of directly exposing an external port,
  'kubectl port-forward' sets up a tunnel between a local port
  (127.0.0.1:23100) and port 22 of the provisioned pod. Then we establish TCP
  connection to the local end of this tunnel, 127.0.0.1:23100, using 'socat'.
@@ -1882,38 +2490,26 @@ def get_ssh_proxy_command(
  the local machine.

  Args:
- k8s_ssh_target: str; The Kubernetes object that will be used as the
- target for SSH. If network_mode is NODEPORT, this is the name of the
- service. If network_mode is PORTFORWARD, this is the pod name.
- network_mode: KubernetesNetworkingMode; networking mode for ssh
- session. It is either 'NODEPORT' or 'PORTFORWARD'
+ pod_name: str; The Kubernetes pod name that will be used as the
+ target for SSH.
  private_key_path: str; Path to the private key to use for SSH.
  This key must be authorized to access the SSH jump pod.
- Required for NODEPORT networking mode.
  namespace: Kubernetes namespace to use.
- Required for NODEPORT networking mode.
  """
- # Fetch IP to connect to for the jump svc
- ssh_jump_ip = get_external_ip(network_mode, context)
+ ssh_jump_ip = '127.0.0.1' # Local end of the port-forward tunnel
  assert private_key_path is not None, 'Private key path must be provided'
- if network_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
- assert namespace is not None, 'Namespace must be provided for NodePort'
- ssh_jump_port = get_port(k8s_ssh_target, namespace, context)
- ssh_jump_proxy_command = construct_ssh_jump_command(
- private_key_path, ssh_jump_ip, ssh_jump_port=ssh_jump_port)
- else:
- ssh_jump_proxy_command_path = create_proxy_command_script()
- ssh_jump_proxy_command = construct_ssh_jump_command(
- private_key_path,
- ssh_jump_ip,
- ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
- proxy_cmd_path=ssh_jump_proxy_command_path,
- proxy_cmd_target_pod=k8s_ssh_target,
- # We embed both the current context and namespace to the SSH proxy
- # command to make sure SSH still works when the current
- # context/namespace is changed by the user.
- current_kube_context=context,
- current_kube_namespace=namespace)
+ ssh_jump_proxy_command_path = create_proxy_command_script()
+ ssh_jump_proxy_command = construct_ssh_jump_command(
+ private_key_path,
+ ssh_jump_ip,
+ ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
+ proxy_cmd_path=ssh_jump_proxy_command_path,
+ proxy_cmd_target_pod=pod_name,
+ # We embed both the current context and namespace to the SSH proxy
+ # command to make sure SSH still works when the current
+ # context/namespace is changed by the user.
+ current_kube_context=context,
+ current_kube_namespace=namespace)
  return ssh_jump_proxy_command

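
Conceptually, the generated ProxyCommand makes ssh dial the local end of a kubectl port-forward tunnel instead of the pod IP. A hedged sketch of the resulting client-side invocation — the script path below is illustrative; the real values come from create_proxy_command_script() and construct_ssh_jump_command():

    # Roughly what the returned proxy command wires up (illustrative):
    proxy = '~/.sky/kubernetes-port-forward-proxy-command.sh my-pod'
    ssh_cmd = f"ssh -o ProxyCommand='{proxy}' sky@my-pod"
    # The proxy script runs `kubectl port-forward pod/my-pod 23100:22`
    # and bridges stdin/stdout to 127.0.0.1:23100 via socat.
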
 
@@ -1945,240 +2541,6 @@ def create_proxy_command_script() -> str:
  return PORT_FORWARD_PROXY_CMD_PATH


- def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
- context: Optional[str],
- service_type: kubernetes_enums.KubernetesServiceType):
- """Sets up Kubernetes service resource to access for SSH jump pod.
-
- This method acts as a necessary complement to be run along with
- setup_ssh_jump_pod(...) method. This service ensures the pod is accessible.
-
- Args:
- ssh_jump_name: Name to use for the SSH jump service
- namespace: Namespace to create the SSH jump service in
- service_type: Networking configuration on either to use NodePort
- or ClusterIP service to ssh in
- """
- # Fill in template - ssh_key_secret and ssh_jump_image are not required for
- # the service spec, so we pass in empty strs.
- content = fill_ssh_jump_template('', '', ssh_jump_name, service_type.value)
-
- # Add custom metadata from config
- merge_custom_metadata(content['service_spec']['metadata'])
-
- # Create service
- try:
- kubernetes.core_api(context).create_namespaced_service(
- namespace, content['service_spec'])
- except kubernetes.api_exception() as e:
- # SSH Jump Pod service already exists.
- if e.status == 409:
- ssh_jump_service = kubernetes.core_api(
- context).read_namespaced_service(name=ssh_jump_name,
- namespace=namespace)
- curr_svc_type = ssh_jump_service.spec.type
- if service_type.value == curr_svc_type:
- # If the currently existing SSH Jump service's type is identical
- # to user's configuration for networking mode
- logger.debug(
- f'SSH Jump Service {ssh_jump_name} already exists in the '
- 'cluster, using it.')
- else:
- # If a different type of service type for SSH Jump pod compared
- # to user's configuration for networking mode exists, we remove
- # existing servie to create a new one following user's config
- kubernetes.core_api(context).delete_namespaced_service(
- name=ssh_jump_name, namespace=namespace)
- kubernetes.core_api(context).create_namespaced_service(
- namespace, content['service_spec'])
- port_forward_mode = (
- kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value)
- nodeport_mode = (
- kubernetes_enums.KubernetesNetworkingMode.NODEPORT.value)
- clusterip_svc = (
- kubernetes_enums.KubernetesServiceType.CLUSTERIP.value)
- nodeport_svc = (
- kubernetes_enums.KubernetesServiceType.NODEPORT.value)
- curr_network_mode = port_forward_mode \
- if curr_svc_type == clusterip_svc else nodeport_mode
- new_network_mode = nodeport_mode \
- if curr_svc_type == clusterip_svc else port_forward_mode
- new_svc_type = nodeport_svc \
- if curr_svc_type == clusterip_svc else clusterip_svc
- logger.info(
- f'Switching the networking mode from '
- f'\'{curr_network_mode}\' to \'{new_network_mode}\' '
- f'following networking configuration. Deleting existing '
- f'\'{curr_svc_type}\' service and recreating as '
- f'\'{new_svc_type}\' service.')
- else:
- raise
- else:
- logger.info(f'Created SSH Jump Service {ssh_jump_name}.')
-
-
- def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
- ssh_key_secret: str, namespace: str,
- context: Optional[str]):
- """Sets up Kubernetes RBAC and pod for SSH jump host.
-
- Our Kubernetes implementation uses a SSH jump pod to reach SkyPilot clusters
- running inside a cluster. This function sets up the resources needed for
- the SSH jump pod. This includes a service account which grants the jump pod
- permission to watch for other SkyPilot pods and terminate itself if there
- are no SkyPilot pods running.
-
- setup_ssh_jump_service must also be run to ensure that the SSH jump pod is
- reachable.
-
- Args:
- ssh_jump_image: Container image to use for the SSH jump pod
- ssh_jump_name: Name to use for the SSH jump pod
- ssh_key_secret: Secret name for the SSH key stored in the cluster
- namespace: Namespace to create the SSH jump pod in
- """
- # Fill in template - service is created separately so service_type is not
- # required, so we pass in empty str.
- content = fill_ssh_jump_template(ssh_key_secret, ssh_jump_image,
- ssh_jump_name, '')
-
- # Add custom metadata to all objects
- for object_type in content.keys():
- merge_custom_metadata(content[object_type]['metadata'])
-
- # ServiceAccount
- try:
- kubernetes.core_api(context).create_namespaced_service_account(
- namespace, content['service_account'])
- except kubernetes.api_exception() as e:
- if e.status == 409:
- logger.info(
- 'SSH Jump ServiceAccount already exists in the cluster, using '
- 'it.')
- else:
- raise
- else:
- logger.info('Created SSH Jump ServiceAccount.')
- # Role
- try:
- kubernetes.auth_api(context).create_namespaced_role(
- namespace, content['role'])
- except kubernetes.api_exception() as e:
- if e.status == 409:
- logger.info(
- 'SSH Jump Role already exists in the cluster, using it.')
- else:
- raise
- else:
- logger.info('Created SSH Jump Role.')
- # RoleBinding
- try:
- kubernetes.auth_api(context).create_namespaced_role_binding(
- namespace, content['role_binding'])
- except kubernetes.api_exception() as e:
- if e.status == 409:
- logger.info(
- 'SSH Jump RoleBinding already exists in the cluster, using '
- 'it.')
- else:
- raise
- else:
- logger.info('Created SSH Jump RoleBinding.')
- # Pod
- try:
- kubernetes.core_api(context).create_namespaced_pod(
- namespace, content['pod_spec'])
- except kubernetes.api_exception() as e:
- if e.status == 409:
- logger.info(
- f'SSH Jump Host {ssh_jump_name} already exists in the cluster, '
- 'using it.')
- else:
- raise
- else:
- logger.info(f'Created SSH Jump Host {ssh_jump_name}.')
-
-
- def clean_zombie_ssh_jump_pod(namespace: str, context: Optional[str],
- node_id: str):
- """Analyzes SSH jump pod and removes if it is in a bad state
-
- Prevents the existence of a dangling SSH jump pod. This could happen
- in case the pod main container did not start properly (or failed). In that
- case, jump pod lifecycle manager will not function properly to
- remove the pod and service automatically, and must be done manually.
-
- Args:
- namespace: Namespace to remove the SSH jump pod and service from
- node_id: Name of head pod
- """
-
- def find(l, predicate):
- """Utility function to find element in given list"""
- results = [x for x in l if predicate(x)]
- return results[0] if results else None
-
- # Get the SSH jump pod name from the head pod
- try:
- pod = kubernetes.core_api(context).read_namespaced_pod(
- node_id, namespace)
- except kubernetes.api_exception() as e:
- if e.status == 404:
- logger.warning(f'Failed to get pod {node_id},'
- ' but the pod was not found (404).')
- raise
- else:
- ssh_jump_name = pod.metadata.labels.get('skypilot-ssh-jump')
- try:
- ssh_jump_pod = kubernetes.core_api(context).read_namespaced_pod(
- ssh_jump_name, namespace)
- cont_ready_cond = find(ssh_jump_pod.status.conditions,
- lambda c: c.type == 'ContainersReady')
- if (cont_ready_cond and cont_ready_cond.status
- == 'False') or ssh_jump_pod.status.phase == 'Pending':
- # Either the main container is not ready or the pod failed
- # to schedule. To be on the safe side and prevent a dangling
- # ssh jump pod, lets remove it and the service. Otherwise, main
- # container is ready and its lifecycle management script takes
- # care of the cleaning.
- kubernetes.core_api(context).delete_namespaced_pod(
- ssh_jump_name, namespace)
- kubernetes.core_api(context).delete_namespaced_service(
- ssh_jump_name, namespace)
- except kubernetes.api_exception() as e:
- # We keep the warning in debug to avoid polluting the `sky launch`
- # output.
- logger.debug(f'Tried to check ssh jump pod {ssh_jump_name},'
- f' but got error {e}\n. Consider running `kubectl '
- f'delete pod {ssh_jump_name} -n {namespace}` to manually '
- 'remove the pod if it has crashed.')
- # We encountered an issue while checking ssh jump pod. To be on
- # the safe side, lets remove its service so the port is freed
- try:
- kubernetes.core_api(context).delete_namespaced_service(
- ssh_jump_name, namespace)
- except kubernetes.api_exception():
- pass
-
-
- def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
- ssh_jump_name: str, service_type: str) -> Dict:
- template_path = os.path.join(sky.__root_dir__, 'templates',
- 'kubernetes-ssh-jump.yml.j2')
- if not os.path.exists(template_path):
- raise FileNotFoundError(
- 'Template "kubernetes-ssh-jump.j2" does not exist.')
- with open(template_path, 'r', encoding='utf-8') as fin:
- template = fin.read()
- j2_template = jinja2.Template(template)
- cont = j2_template.render(name=ssh_jump_name,
- image=ssh_jump_image,
- secret=ssh_key_secret,
- service_type=service_type)
- content = yaml.safe_load(cont)
- return content
-
-
  def check_port_forward_mode_dependencies(
  raise_error: bool = True) -> Optional[List[str]]:
  """Checks if 'socat' and 'nc' are installed
@@ -2256,7 +2618,7 @@ def check_port_forward_mode_dependencies(
  return None


- def get_endpoint_debug_message() -> str:
+ def get_endpoint_debug_message(context: Optional[str] = None) -> str:
  """ Returns a string message for user to debug Kubernetes port opening

  Polls the configured ports mode on Kubernetes to produce an
@@ -2264,7 +2626,7 @@ def get_endpoint_debug_message() -> str:

  Also checks if the
  """
- port_mode = network_utils.get_port_mode()
+ port_mode = network_utils.get_port_mode(None, context)
  if port_mode == kubernetes_enums.KubernetesPortMode.INGRESS:
  endpoint_type = 'Ingress'
  debug_cmd = 'kubectl describe ingress && kubectl describe ingressclass'
@@ -2279,9 +2641,11 @@ def get_endpoint_debug_message() -> str:


  def combine_pod_config_fields(
- cluster_yaml_path: str,
+ cluster_yaml_obj: Dict[str, Any],
  cluster_config_overrides: Dict[str, Any],
- ) -> None:
+ cloud: Optional[clouds.Cloud] = None,
+ context: Optional[str] = None,
+ ) -> Dict[str, Any]:
  """Adds or updates fields in the YAML with fields from the
  ~/.sky/config.yaml's kubernetes.pod_spec dict.
  This can be used to add fields to the YAML that are not supported by
@@ -2320,72 +2684,138 @@ def combine_pod_config_fields(
  - name: my-secret
  ```
  """
- with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
- yaml_content = f.read()
- yaml_obj = yaml.safe_load(yaml_content)
- # We don't use override_configs in `skypilot_config.get_nested`, as merging
+ merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
+ # We don't use override_configs in `get_effective_region_config`, as merging
  # the pod config requires special handling.
- kubernetes_config = skypilot_config.get_nested(('kubernetes', 'pod_config'),
- default_value={},
- override_configs={})
- override_pod_config = (cluster_config_overrides.get('kubernetes', {}).get(
- 'pod_config', {}))
+ if isinstance(cloud, clouds.SSH):
+ kubernetes_config = skypilot_config.get_effective_region_config(
+ cloud='ssh', region=None, keys=('pod_config',), default_value={})
+ override_pod_config = config_utils.get_cloud_config_value_from_dict(
+ dict_config=cluster_config_overrides,
+ cloud='ssh',
+ keys=('pod_config',),
+ default_value={})
+ else:
+ kubernetes_config = skypilot_config.get_effective_region_config(
+ cloud='kubernetes',
+ region=context,
+ keys=('pod_config',),
+ default_value={})
+ override_pod_config = config_utils.get_cloud_config_value_from_dict(
+ dict_config=cluster_config_overrides,
+ cloud='kubernetes',
+ region=context,
+ keys=('pod_config',),
+ default_value={})
  config_utils.merge_k8s_configs(kubernetes_config, override_pod_config)

  # Merge the kubernetes config into the YAML for both head and worker nodes.
  config_utils.merge_k8s_configs(
- yaml_obj['available_node_types']['ray_head_default']['node_config'],
- kubernetes_config)
-
- # Write the updated YAML back to the file
- common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
+ merged_cluster_yaml_obj['available_node_types']['ray_head_default']
+ ['node_config'], kubernetes_config)
+ return merged_cluster_yaml_obj


- def combine_metadata_fields(cluster_yaml_path: str) -> None:
+ def combine_metadata_fields(cluster_yaml_obj: Dict[str, Any],
+ cluster_config_overrides: Dict[str, Any],
+ context: Optional[str] = None) -> Dict[str, Any]:
  """Updates the metadata for all Kubernetes objects created by SkyPilot with
  fields from the ~/.sky/config.yaml's kubernetes.custom_metadata dict.

  Obeys the same add or update semantics as combine_pod_config_fields().
  """
-
- with open(cluster_yaml_path, 'r', encoding='utf-8') as f:
- yaml_content = f.read()
- yaml_obj = yaml.safe_load(yaml_content)
- custom_metadata = skypilot_config.get_nested(
- ('kubernetes', 'custom_metadata'), {})
+ merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
+ # Get custom_metadata from global config
+ custom_metadata = skypilot_config.get_effective_region_config(
+ cloud='kubernetes',
+ region=context,
+ keys=('custom_metadata',),
+ default_value={})
+
+ # Get custom_metadata from task-level config overrides
+ override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
+ dict_config=cluster_config_overrides,
+ cloud='kubernetes',
+ region=context,
+ keys=('custom_metadata',),
+ default_value={})
+
+ # Merge task-level overrides with global config
+ config_utils.merge_k8s_configs(custom_metadata, override_custom_metadata)

  # List of objects in the cluster YAML to be updated
  combination_destinations = [
  # Service accounts
- yaml_obj['provider']['autoscaler_service_account']['metadata'],
- yaml_obj['provider']['autoscaler_role']['metadata'],
- yaml_obj['provider']['autoscaler_role_binding']['metadata'],
- yaml_obj['provider']['autoscaler_service_account']['metadata'],
- # Pod spec
- yaml_obj['available_node_types']['ray_head_default']['node_config']
+ merged_cluster_yaml_obj['provider']['autoscaler_service_account']
+ ['metadata'],
+ merged_cluster_yaml_obj['provider']['autoscaler_role']['metadata'],
+ merged_cluster_yaml_obj['provider']['autoscaler_role_binding']
+ ['metadata'],
+ merged_cluster_yaml_obj['provider']['autoscaler_service_account']
  ['metadata'],
+ # Pod spec
+ merged_cluster_yaml_obj['available_node_types']['ray_head_default']
+ ['node_config']['metadata'],
  # Services for pods
- *[svc['metadata'] for svc in yaml_obj['provider']['services']]
+ *[
+ svc['metadata']
+ for svc in merged_cluster_yaml_obj['provider']['services']
+ ]
  ]

  for destination in combination_destinations:
  config_utils.merge_k8s_configs(destination, custom_metadata)

- # Write the updated YAML back to the file
- common_utils.dump_yaml(cluster_yaml_path, yaml_obj)
+ return merged_cluster_yaml_obj
+

+ def combine_pod_config_fields_and_metadata(
+ cluster_yaml_obj: Dict[str, Any],
+ cluster_config_overrides: Dict[str, Any],
+ cloud: Optional[clouds.Cloud] = None,
+ context: Optional[str] = None) -> Dict[str, Any]:
+ """Combines pod config fields and metadata fields"""
+ combined_yaml_obj = combine_pod_config_fields(cluster_yaml_obj,
+ cluster_config_overrides,
+ cloud, context)
+ combined_yaml_obj = combine_metadata_fields(combined_yaml_obj,
+ cluster_config_overrides,
+ context)
+ return combined_yaml_obj

- def merge_custom_metadata(original_metadata: Dict[str, Any]) -> None:
+
+ def merge_custom_metadata(
+ original_metadata: Dict[str, Any],
+ context: Optional[str] = None,
+ cluster_config_overrides: Optional[Dict[str, Any]] = None) -> None:
  """Merges original metadata with custom_metadata from config

  Merge is done in-place, so return is not required
  """
- custom_metadata = skypilot_config.get_nested(
- ('kubernetes', 'custom_metadata'), {})
+ # Get custom_metadata from global config
+ custom_metadata = skypilot_config.get_effective_region_config(
+ cloud='kubernetes',
+ region=context,
+ keys=('custom_metadata',),
+ default_value={})
+
+ # Get custom_metadata from task-level config overrides if available
+ if cluster_config_overrides is not None:
+ override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
+ dict_config=cluster_config_overrides,
+ cloud='kubernetes',
+ region=context,
+ keys=('custom_metadata',),
+ default_value={})
+ # Merge task-level overrides with global config
+ config_utils.merge_k8s_configs(custom_metadata,
+ override_custom_metadata)
+
  config_utils.merge_k8s_configs(original_metadata, custom_metadata)

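
A small example of the merge semantics these helpers rely on — merge_k8s_configs is assumed to deep-merge nested dicts with override values winning (list handling follows config_utils' own rules):

    base = {'metadata': {'labels': {'app': 'sky'}}}
    override = {'metadata': {'labels': {'team': 'ml'},
                             'annotations': {'owner': 'alice'}}}
    config_utils.merge_k8s_configs(base, override)
    # base is now:
    # {'metadata': {'labels': {'app': 'sky', 'team': 'ml'},
    #               'annotations': {'owner': 'alice'}}}
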
 
- def check_nvidia_runtime_class(context: Optional[str] = None) -> bool:
+ @_retry_on_error(resource_type='runtimeclass')
+ def check_nvidia_runtime_class(*, context: Optional[str] = None) -> bool:
  """Checks if the 'nvidia' RuntimeClass exists in the cluster"""
  # Fetch the list of available RuntimeClasses
  runtime_classes = kubernetes.node_api(context).list_runtime_class()
@@ -2435,7 +2865,7 @@ def create_namespace(namespace: str, context: Optional[str]) -> None:
  return

  ns_metadata = dict(name=namespace, labels={'parent': 'skypilot'})
- merge_custom_metadata(ns_metadata)
+ merge_custom_metadata(ns_metadata, context)
  namespace_obj = kubernetes_client.V1Namespace(metadata=ns_metadata)
  try:
  kubernetes.core_api(context).create_namespace(namespace_obj)
@@ -2461,15 +2891,14 @@ def get_head_pod_name(cluster_name_on_cloud: str):
  return f'{cluster_name_on_cloud}-head'


- def get_autoscaler_type(
- ) -> Optional[kubernetes_enums.KubernetesAutoscalerType]:
- """Returns the autoscaler type by reading from config"""
- autoscaler_type = skypilot_config.get_nested(('kubernetes', 'autoscaler'),
- None)
- if autoscaler_type is not None:
- autoscaler_type = kubernetes_enums.KubernetesAutoscalerType(
- autoscaler_type)
- return autoscaler_type
+ def get_custom_config_k8s_contexts() -> List[str]:
+ """Returns the list of context names from the config"""
+ contexts = skypilot_config.get_effective_region_config(
+ cloud='kubernetes',
+ region=None,
+ keys=('context_configs',),
+ default_value={})
+ return [*contexts] or []


  # Mapping of known spot label keys and values for different cluster types
@@ -2481,6 +2910,21 @@ SPOT_LABEL_MAP = {
  }


+ def get_autoscaler_type(
+ context: Optional[str] = None
+ ) -> Optional[kubernetes_enums.KubernetesAutoscalerType]:
+ """Returns the autoscaler type by reading from config"""
+ autoscaler_type = skypilot_config.get_effective_region_config(
+ cloud='kubernetes',
+ region=context,
+ keys=('autoscaler',),
+ default_value=None)
+ if autoscaler_type is not None:
+ autoscaler_type = kubernetes_enums.KubernetesAutoscalerType(
+ autoscaler_type)
+ return autoscaler_type
+
+
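
With the context-aware lookup, the autoscaler can be configured globally or per context. A hedged sketch of the corresponding ~/.sky/config.yaml shape — the nesting under context_configs mirrors get_custom_config_k8s_contexts above, but the exact YAML layout is an assumption:

    # Assuming a config like (illustrative):
    #   kubernetes:
    #     autoscaler: gke
    #     context_configs:
    #       my-gke-context:
    #         autoscaler: gke
    autoscaler = get_autoscaler_type(context='my-gke-context')
    # -> kubernetes_enums.KubernetesAutoscalerType.GKE, or None if unset.
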
  def get_spot_label(
  context: Optional[str] = None) -> Tuple[Optional[str], Optional[str]]:
  """Get the spot label key and value for using spot instances, if supported.
@@ -2504,7 +2948,7 @@ def get_spot_label(

  # Check if autoscaler is configured. Allow spot instances if autoscaler type
  # is known to support spot instances.
- autoscaler_type = get_autoscaler_type()
+ autoscaler_type = get_autoscaler_type(context=context)
  if autoscaler_type == kubernetes_enums.KubernetesAutoscalerType.GKE:
  return SPOT_LABEL_MAP[autoscaler_type.value]

@@ -2546,7 +2990,7 @@ def get_unlabeled_accelerator_nodes(context: Optional[str] = None) -> List[Any]:
  nodes = get_kubernetes_nodes(context=context)
  nodes_with_accelerator = []
  for node in nodes:
- if get_gpu_resource_key() in node.status.capacity:
+ if get_gpu_resource_key(context) in node.status.capacity:
  nodes_with_accelerator.append(node)

  label_formatter, _ = detect_gpu_label_formatter(context)
@@ -2590,14 +3034,6 @@ def get_kubernetes_node_info(
  information.
  """
  nodes = get_kubernetes_nodes(context=context)
- # Get the pods to get the real-time resource usage
- try:
- pods = get_all_pods_in_kubernetes_cluster(context=context)
- except kubernetes.api_exception() as e:
- if e.status == 403:
- pods = None
- else:
- raise

  lf, _ = detect_gpu_label_formatter(context)
  if not lf:
@@ -2605,6 +3041,29 @@ def get_kubernetes_node_info(
  else:
  label_keys = lf.get_label_keys()

+ # Check if all nodes have no accelerators to avoid fetching pods
+ has_accelerator_nodes = False
+ for node in nodes:
+ accelerator_count = get_node_accelerator_count(context,
+ node.status.allocatable)
+ if accelerator_count > 0:
+ has_accelerator_nodes = True
+ break
+
+ # Get the allocated GPU quantity by each node
+ allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+ error_on_get_allocated_gpu_qty_by_node = False
+ if has_accelerator_nodes:
+ try:
+ allocated_qty_by_node = get_allocated_gpu_qty_by_node(
+ context=context)
+ except kubernetes.api_exception() as e:
+ if e.status == 403:
+ error_on_get_allocated_gpu_qty_by_node = True
+ pass
+ else:
+ raise
+
  node_info_dict: Dict[str, models.KubernetesNodeInfo] = {}
  has_multi_host_tpu = False

@@ -2619,24 +3078,36 @@ def get_kubernetes_node_info(
  node.metadata.labels.get(label_key))
  break

- allocated_qty = 0
- accelerator_count = get_node_accelerator_count(node.status.allocatable)
+ # Extract IP address from node addresses (prefer external, fallback to internal)
+ node_ip = None
+ if node.status.addresses:
+ # First try to find external IP
+ for address in node.status.addresses:
+ if address.type == 'ExternalIP':
+ node_ip = address.address
+ break
+ # If no external IP, try to find internal IP
+ if node_ip is None:
+ for address in node.status.addresses:
+ if address.type == 'InternalIP':
+ node_ip = address.address
+ break
+
+ accelerator_count = get_node_accelerator_count(context,
+ node.status.allocatable)
+ if accelerator_count == 0:
+ node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
+ name=node.metadata.name,
+ accelerator_type=accelerator_name,
+ total={'accelerator_count': 0},
+ free={'accelerators_available': 0},
+ ip_address=node_ip)
+ continue

- if pods is None:
+ if not has_accelerator_nodes or error_on_get_allocated_gpu_qty_by_node:
  accelerators_available = -1
-
  else:
- for pod in pods:
- # Get all the pods running on the node
- if (pod.spec.node_name == node.metadata.name and
- pod.status.phase in ['Running', 'Pending']):
- # Iterate over all the containers in the pod and sum the
- # GPU requests
- for container in pod.spec.containers:
- if container.resources.requests:
- allocated_qty += get_node_accelerator_count(
- container.resources.requests)
-
+ allocated_qty = allocated_qty_by_node[node.metadata.name]
  accelerators_available = accelerator_count - allocated_qty

  # Exclude multi-host TPUs from being processed.
@@ -2650,7 +3121,8 @@ def get_kubernetes_node_info(
  name=node.metadata.name,
  accelerator_type=accelerator_name,
  total={'accelerator_count': int(accelerator_count)},
- free={'accelerators_available': int(accelerators_available)})
+ free={'accelerators_available': int(accelerators_available)},
+ ip_address=node_ip)
  hint = ''
  if has_multi_host_tpu:
  hint = ('(Note: Multi-host TPUs are detected and excluded from the '
@@ -2767,7 +3239,7 @@ def set_autodown_annotations(handle: 'backends.CloudVmRayResourceHandle',
  tags = {
  provision_constants.TAG_RAY_CLUSTER_NAME: handle.cluster_name_on_cloud,
  }
- ray_config = common_utils.read_yaml(handle.cluster_yaml)
+ ray_config = global_user_state.get_cluster_yaml_dict(handle.cluster_yaml)
  provider_config = ray_config['provider']
  namespace = get_namespace_from_config(provider_config)
  context = get_context_from_config(provider_config)
@@ -2809,8 +3281,8 @@ def get_context_from_config(provider_config: Dict[str, Any]) -> Optional[str]:
  context = provider_config.get('context',
  get_current_kube_config_context_name())
  if context == kubernetes.in_cluster_context_name():
- # If the context (also used as the region) is in-cluster, we need to
- # we need to use in-cluster auth by setting the context to None.
+ # If the context (also used as the region) is in-cluster, we need
+ # to use in-cluster auth by setting the context to None.
  context = None
  return context

@@ -2829,23 +3301,27 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
 
     try:
         pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
-            label_selector='skypilot-cluster',
+            label_selector=provision_constants.TAG_SKYPILOT_CLUSTER_NAME,
             _request_timeout=kubernetes.API_TIMEOUT).items
     except kubernetes.max_retry_error():
         raise exceptions.ResourcesUnavailableError(
             'Timed out trying to get SkyPilot pods from Kubernetes cluster. '
             'Please check if the cluster is healthy and retry. To debug, run: '
-            'kubectl get pods --selector=skypilot-cluster --all-namespaces'
+            'kubectl get pods --selector=skypilot-cluster-name --all-namespaces'
         ) from None
     return pods
 
 
-def is_tpu_on_gke(accelerator: str) -> bool:
+def is_tpu_on_gke(accelerator: str, normalize: bool = True) -> bool:
     """Determines if the given accelerator is a TPU supported on GKE."""
+    if normalize:
+        normalized, _ = normalize_tpu_accelerator_name(accelerator)
+        return normalized in GKE_TPU_ACCELERATOR_TO_GENERATION
     return accelerator in GKE_TPU_ACCELERATOR_TO_GENERATION
 
 
-def get_node_accelerator_count(attribute_dict: dict) -> int:
+def get_node_accelerator_count(context: Optional[str],
+                               attribute_dict: dict) -> int:
     """Retrieves the count of accelerators from a node's resource dictionary.
 
     This method checks the node's allocatable resources or the accelerators
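The selector change above works because a key-only label selector matches any pod that carries the label, whatever its value. A minimal sketch using the official kubernetes Python client, assuming a reachable cluster and that the tag constant resolves to 'skypilot-cluster-name' (as the kubectl hint in the error message suggests):

    from kubernetes import client, config

    config.load_kube_config()  # or config.load_incluster_config() inside a pod
    v1 = client.CoreV1Api()
    # A selector consisting of just a key matches pods that have the label
    # at all, regardless of its value.
    pods = v1.list_pod_for_all_namespaces(
        label_selector='skypilot-cluster-name').items
    for pod in pods:
        print(pod.metadata.namespace, pod.metadata.name)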
@@ -2860,7 +3336,7 @@ def get_node_accelerator_count(attribute_dict: dict) -> int:
         Number of accelerators allocated or available from the node. If no
         resource is found, it returns 0.
     """
-    gpu_resource_name = get_gpu_resource_key()
+    gpu_resource_name = get_gpu_resource_key(context)
     assert not (gpu_resource_name in attribute_dict and
                 TPU_RESOURCE_KEY in attribute_dict)
     if gpu_resource_name in attribute_dict:
@@ -2968,7 +3444,8 @@ def process_skypilot_pods(
     serve_controllers: List[KubernetesSkyPilotClusterInfo] = []
 
     for pod in pods:
-        cluster_name_on_cloud = pod.metadata.labels.get('skypilot-cluster')
+        cluster_name_on_cloud = pod.metadata.labels.get(
+            provision_constants.TAG_SKYPILOT_CLUSTER_NAME)
         cluster_name = cluster_name_on_cloud.rsplit(
             '-', 1
         )[0]  # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
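A one-liner check of the user-hash stripping performed on the label value (pure Python, no cluster needed):

    cluster_name_on_cloud = 'mycluster-2ea4'
    # rsplit('-', 1) splits once from the right, dropping the trailing hash.
    assert cluster_name_on_cloud.rsplit('-', 1)[0] == 'mycluster'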
@@ -2986,7 +3463,7 @@ def process_skypilot_pods(
             unit='G')
         gpu_count = parse_cpu_or_gpu_resource(
             pod.spec.containers[0].resources.requests.get(
-                'nvidia.com/gpu', '0'))
+                get_gpu_resource_key(context), '0'))
         gpu_name = None
         if gpu_count > 0:
             label_formatter, _ = (detect_gpu_label_formatter(context))
@@ -2995,9 +3472,20 @@ def process_skypilot_pods(
                     f'requesting GPUs: {pod.metadata.name}')
             gpu_label = label_formatter.get_label_key()
             # Get GPU name from pod node selector
-            if pod.spec.node_selector is not None:
-                gpu_name = label_formatter.get_accelerator_from_label_value(
-                    pod.spec.node_selector.get(gpu_label))
+            node_selector_terms = (
+                pod.spec.affinity.node_affinity.
+                required_during_scheduling_ignored_during_execution.
+                node_selector_terms)
+            if node_selector_terms is not None:
+                expressions = []
+                for term in node_selector_terms:
+                    if term.match_expressions:
+                        expressions.extend(term.match_expressions)
+                for expression in expressions:
+                    if expression.key == gpu_label and expression.operator == 'In':
+                        gpu_name = label_formatter.get_accelerator_from_label_value(
+                            expression.values[0])
+                        break
 
         resources = resources_lib.Resources(
             cloud=clouds.Kubernetes(),
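The new lookup walks required node-affinity terms instead of `pod.spec.node_selector`. A minimal sketch with stand-in objects shaped like the Kubernetes client's V1NodeSelectorTerm/V1NodeSelectorRequirement; the label key is illustrative:

    from types import SimpleNamespace

    gpu_label = 'skypilot.co/accelerator'  # illustrative label key
    terms = [SimpleNamespace(match_expressions=[
        SimpleNamespace(key=gpu_label, operator='In', values=['h100'])])]

    gpu_value = None
    expressions = []
    for term in terms:
        if term.match_expressions:
            expressions.extend(term.match_expressions)
    for expr in expressions:
        # Only 'In' expressions carry a concrete accelerator value.
        if expr.key == gpu_label and expr.operator == 'In':
            gpu_value = expr.values[0]
            break
    assert gpu_value == 'h100'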
@@ -3041,33 +3529,206 @@ def process_skypilot_pods(
     return list(clusters.values()), jobs_controllers, serve_controllers
 
 
-def get_gpu_resource_key():
-    """Get the GPU resource name to use in kubernetes.
-    The function first checks for an environment variable.
-    If defined, it uses its value; otherwise, it returns the default value.
-    Args:
-        name (str): Default GPU resource name, default is "nvidia.com/gpu".
+def _gpu_resource_key_helper(context: Optional[str]) -> str:
+    """Helper function to get the GPU resource key."""
+    gpu_resource_key = SUPPORTED_GPU_RESOURCE_KEYS['nvidia']
+    try:
+        nodes = kubernetes.core_api(context).list_node().items
+        for gpu_key in SUPPORTED_GPU_RESOURCE_KEYS.values():
+            if any(gpu_key in node.status.capacity for node in nodes):
+                return gpu_key
+    except Exception as e:  # pylint: disable=broad-except
+        logger.warning(f'Failed to load kube config or query nodes: {e}. '
+                       'Falling back to default GPU resource key.')
+    return gpu_resource_key
+
+
+@annotations.lru_cache(scope='request')
+def get_gpu_resource_key(context: Optional[str] = None) -> str:
+    """Get the GPU resource name to use in Kubernetes.
+
+    The function auto-detects the GPU resource key by querying the
+    Kubernetes node API.
+    If detection fails, it falls back to a default value.
+    An environment variable can override the detected or default value.
+
     Returns:
         str: The selected GPU resource name.
     """
-    # Retrieve GPU resource name from environment variable, if set.
-    # Else use default.
-    # E.g., can be nvidia.com/gpu-h100, amd.com/gpu etc.
-    return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=GPU_RESOURCE_KEY)
+    gpu_resource_key = _gpu_resource_key_helper(context)
+    return os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=gpu_resource_key)
 
 
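Detection order in the new helper: scan node capacities for a known vendor key, fall back to the NVIDIA key, then let the env var override the result. A minimal sketch; the contents of SUPPORTED_GPU_RESOURCE_KEYS shown here are an assumption, not taken from this diff:

    import os
    from typing import Dict, List

    # Assumed shape of SUPPORTED_GPU_RESOURCE_KEYS; the real map lives in
    # the module and may differ.
    SUPPORTED_GPU_RESOURCE_KEYS = {'nvidia': 'nvidia.com/gpu',
                                   'amd': 'amd.com/gpu'}

    def detect_gpu_key(node_capacities: List[Dict[str, str]]) -> str:
        """Return the first vendor key present in any node's capacity,
        else the NVIDIA default (mirrors the helper's fallback)."""
        for gpu_key in SUPPORTED_GPU_RESOURCE_KEYS.values():
            if any(gpu_key in cap for cap in node_capacities):
                return gpu_key
        return SUPPORTED_GPU_RESOURCE_KEYS['nvidia']

    detected = detect_gpu_key([{'amd.com/gpu': '4', 'cpu': '64'}])
    assert detected == 'amd.com/gpu'
    # The env var still wins, matching the os.getenv(...) override above.
    final = os.getenv('CUSTOM_GPU_RESOURCE_KEY', default=detected)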
-def _get_kubeconfig_path() -> str:
-    """Get the path to the kubeconfig file.
+def get_kubeconfig_paths() -> List[str]:
+    """Get the paths to the kubeconfig files.
 
     Parses `KUBECONFIG` env var if present, else uses the default path.
-    Currently, specifying multiple KUBECONFIG paths in the envvar is not
-    allowed, hence will raise a ValueError.
     """
-    kubeconfig_path = os.path.expanduser(
-        os.getenv(
-            'KUBECONFIG', kubernetes.kubernetes.config.kube_config.
-            KUBE_CONFIG_DEFAULT_LOCATION))
-    if len(kubeconfig_path.split(os.pathsep)) > 1:
-        raise ValueError('SkyPilot currently only supports one '
-                         'config file path with $KUBECONFIG. Current '
-                         f'path(s) are {kubeconfig_path}.')
-    return kubeconfig_path
+    # We should always use the latest KUBECONFIG environment variable to
+    # make sure env var overrides get respected.
+    paths = os.getenv('KUBECONFIG', kubernetes.DEFAULT_KUBECONFIG_PATH)
+    expanded = []
+    for path in paths.split(kubernetes.ENV_KUBECONFIG_PATH_SEPARATOR):
+        expanded.append(os.path.expanduser(path))
+    return expanded
+
+
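The replacement accepts multiple separator-joined paths instead of raising a ValueError. A minimal sketch of the parsing, assuming ENV_KUBECONFIG_PATH_SEPARATOR behaves like the OS path-list separator (':' on POSIX):

    import os
    from typing import List

    def kubeconfig_paths(env_value: str,
                         default: str = '~/.kube/config',
                         sep: str = os.pathsep) -> List[str]:
        """Split a KUBECONFIG-style value into expanded paths (illustrative)."""
        paths = env_value if env_value else default
        return [os.path.expanduser(p) for p in paths.split(sep)]

    print(kubeconfig_paths('~/.kube/config:/etc/kube/alt.yaml', sep=':'))
    # e.g. ['/home/alice/.kube/config', '/etc/kube/alt.yaml'], depending on $HOME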
+def format_kubeconfig_exec_auth(config: Any,
+                                output_path: str,
+                                inject_wrapper: bool = True) -> bool:
+    """Reformat the kubeconfig so that exec-based authentication can be
+    used with SkyPilot. A new kubeconfig file is always written to
+    <output_path>, regardless of whether a change has been made.
+
+    kubectl internally strips all environment variables except for system
+    defaults. If `inject_wrapper` is true, a wrapper executable is applied
+    to inject the relevant PATH information before exec-auth is executed.
+
+    Contents of sky-kube-exec-wrapper:
+
+        #!/bin/bash
+        export PATH="$HOME/skypilot-runtime/bin:$HOME/google-cloud-sdk:$PATH"
+        exec "$@"
+
+    Refer to `skylet/constants.py` for more information.
+
+    Args:
+        config (dict): kubeconfig parsed by yaml.safe_load
+        output_path (str): Path where the potentially modified kubeconfig
+            file will be saved
+        inject_wrapper (bool): Whether to inject the wrapper script
+    Returns: whether the config was updated, for logging purposes
+    """
+    updated = False
+    for user in config.get('users', []):
+        exec_info = user.get('user', {}).get('exec', {})
+        current_command = exec_info.get('command', '')
+
+        if current_command:
+            # Strip the path and keep only the executable name.
+            executable = os.path.basename(current_command)
+            if executable == kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER:
+                # We don't want this happening recursively.
+                continue
+
+            if inject_wrapper:
+                exec_info[
+                    'command'] = kubernetes_constants.SKY_K8S_EXEC_AUTH_WRAPPER
+                if exec_info.get('args') is None:
+                    exec_info['args'] = []
+                exec_info['args'].insert(0, executable)
+                updated = True
+            elif executable != current_command:
+                exec_info['command'] = executable
+                updated = True
+
+            # Handle Nebius kubeconfigs: change --profile to 'sky'.
+            if executable == 'nebius':
+                args = exec_info.get('args', [])
+                if args and '--profile' in args:
+                    try:
+                        profile_index = args.index('--profile')
+                        if profile_index + 1 < len(args):
+                            old_profile = args[profile_index + 1]
+                            if old_profile != 'sky':
+                                args[profile_index + 1] = 'sky'
+                                updated = True
+                    except ValueError:
+                        pass
+
+    os.makedirs(os.path.dirname(os.path.expanduser(output_path)),
+                exist_ok=True)
+    with open(output_path, 'w', encoding='utf-8') as file:
+        yaml.safe_dump(config, file)
+
+    return updated
+
+
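To make the inject_wrapper branch concrete, here is a minimal sketch of the rewrite applied to one `users` entry of a parsed kubeconfig; the plugin name is illustrative, and the wrapper name follows the docstring above:

    import copy
    import os

    user = {'user': {'exec': {
        'command': '/usr/local/bin/gke-gcloud-auth-plugin',  # illustrative
        'args': None}}}

    rewritten = copy.deepcopy(user)
    exec_info = rewritten['user']['exec']
    executable = os.path.basename(exec_info['command'])
    exec_info['command'] = 'sky-kube-exec-wrapper'  # wrapper from docstring
    exec_info['args'] = [executable]                # original exe as argv[0]

    # Before: exec runs the plugin directly; after: the wrapper fixes PATH
    # and then exec's the plugin.
    assert rewritten['user']['exec']['args'] == ['gke-gcloud-auth-plugin']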
+def format_kubeconfig_exec_auth_with_cache(kubeconfig_path: str) -> str:
+    """Reformat the kubeconfig file, or retrieve it from the cache if it
+    has already been formatted before. Store it in the cache directory if
+    necessary.
+
+    Caching avoids repeated reformatting when users spawn a large number
+    of jobs concurrently.
+
+    Args:
+        kubeconfig_path (str): kubeconfig path
+    Returns: updated kubeconfig path
+    """
+    # TODO(kyuds): GC cache files
+    with open(kubeconfig_path, 'r', encoding='utf-8') as file:
+        config = yaml_utils.safe_load(file)
+    normalized = yaml.dump(config, sort_keys=True)
+    hashed = hashlib.sha1(normalized.encode('utf-8')).hexdigest()
+    path = os.path.expanduser(
+        f'{kubernetes_constants.SKY_K8S_EXEC_AUTH_KUBECONFIG_CACHE}/{hashed}.yaml'
+    )
+
+    # If we have already converted the same kubeconfig before, just return.
+    if os.path.isfile(path):
+        return path
+
+    try:
+        format_kubeconfig_exec_auth(config, path)
+        return path
+    except Exception as e:  # pylint: disable=broad-except
+        # The kubeconfig may be malformed, but the user may not actually
+        # be using Kubernetes (or SSH Node Pools).
+        logger.warning(
+            f'Failed to format kubeconfig at {kubeconfig_path}. '
+            'Please check if the kubeconfig is valid. This may cause '
+            'problems when Kubernetes infra is used. '
+            f'Reason: {common_utils.format_exception(e)}')
+        return kubeconfig_path
+
+
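The cache key is a content hash of the normalized kubeconfig, so semantically identical configs (after key-sorted YAML serialization) share one cache file. A minimal sketch of the key derivation; the cache directory shown is a placeholder for SKY_K8S_EXEC_AUTH_KUBECONFIG_CACHE, whose value is not shown in this diff:

    import hashlib
    import yaml  # PyYAML

    config = {'apiVersion': 'v1', 'kind': 'Config', 'users': []}
    # Sorting keys normalizes equivalent configs to the same byte string.
    normalized = yaml.dump(config, sort_keys=True)
    hashed = hashlib.sha1(normalized.encode('utf-8')).hexdigest()
    cache_path = f'~/.sky/kubeconfig_cache/{hashed}.yaml'  # placeholder dir
    print(cache_path)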
+def delete_k8s_resource_with_retry(delete_func: Callable, resource_type: str,
+                                   resource_name: str) -> None:
+    """Helper to delete Kubernetes resources with 404 handling and retries.
+
+    Args:
+        delete_func: Function to call to delete the resource
+        resource_type: Type of resource being deleted (e.g. 'service'),
+            used in logging
+        resource_name: Name of the resource being deleted, used in logging
+    """
+    max_retries = 3
+    retry_delay = 5  # seconds
+
+    for attempt in range(max_retries):
+        try:
+            delete_func()
+            return
+        except kubernetes.api_exception() as e:
+            if e.status == 404:
+                logger.warning(
+                    f'terminate_instances: Tried to delete {resource_type} '
+                    f'{resource_name}, but the {resource_type} was not '
+                    'found (404).')
+                return
+            elif attempt < max_retries - 1:
+                logger.warning(f'terminate_instances: Failed to delete '
+                               f'{resource_type} {resource_name} (attempt '
+                               f'{attempt + 1}/{max_retries}). Error: {e}. '
+                               f'Retrying in {retry_delay} seconds...')
+                time.sleep(retry_delay)
+            else:
+                raise
+
+
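The retry pattern above generalizes to any zero-argument delete callable. A self-contained sketch of the same retry shape, with a generic RuntimeError standing in for kubernetes.api_exception() and the 404 short-circuit omitted for brevity:

    import time

    def delete_with_retry(delete_func, max_retries=3, retry_delay=0.1):
        """Generic mirror of the retry pattern above (illustrative)."""
        for attempt in range(max_retries):
            try:
                delete_func()
                return
            except RuntimeError:
                if attempt < max_retries - 1:
                    time.sleep(retry_delay)  # back off before retrying
                else:
                    raise

    calls = {'n': 0}
    def flaky_delete():
        calls['n'] += 1
        if calls['n'] < 2:  # fail once, then succeed
            raise RuntimeError('transient API error')

    delete_with_retry(flaky_delete)
    assert calls['n'] == 2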
+def should_exclude_pod_from_gpu_allocation(pod) -> bool:
+    """Check if a pod should be excluded from GPU count calculations.
+
+    Some cloud providers run low priority test/verification pods that request
+    GPUs but should not count against real GPU availability since they are
+    designed to be evicted when higher priority workloads need resources.
+
+    Args:
+        pod: Kubernetes pod object
+
+    Returns:
+        bool: True if the pod should be excluded from GPU count calculations.
+    """
+    # CoreWeave HPC verification pods - identified by namespace.
+    if (hasattr(pod.metadata, 'namespace') and
+            pod.metadata.namespace == 'cw-hpc-verification'):
+        return True
+
+    return False
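In allocation accounting, this predicate filters pods before their GPU requests are summed. A minimal sketch with stand-in pod objects:

    from types import SimpleNamespace

    def should_exclude(pod) -> bool:
        # Mirrors the namespace check above.
        return getattr(pod.metadata, 'namespace', None) == 'cw-hpc-verification'

    pods = [
        SimpleNamespace(metadata=SimpleNamespace(namespace='default')),
        SimpleNamespace(metadata=SimpleNamespace(namespace='cw-hpc-verification')),
    ]
    countable = [p for p in pods if not should_exclude(p)]
    assert len(countable) == 1  # the verification pod is ignored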