skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/provision/kubernetes/utils.py
@@ -1,4 +1,5 @@
 """Kubernetes utilities for SkyPilot."""
+import collections
 import copy
 import dataclasses
 import datetime
@@ -13,8 +14,10 @@ import shutil
 import subprocess
 import time
 import typing
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
-from urllib.parse import urlparse
+from typing import (Any, Callable, Dict, List, Literal, Optional, Set, Tuple,
+                    Union)
+
+import ijson
 
 from sky import clouds
 from sky import exceptions
@@ -32,7 +35,6 @@ from sky.skylet import constants
 from sky.utils import annotations
 from sky.utils import common_utils
 from sky.utils import config_utils
-from sky.utils import directory_utils
 from sky.utils import env_options
 from sky.utils import kubernetes_enums
 from sky.utils import schemas
@@ -61,6 +63,8 @@ HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
 # and store all data that needs to be persisted in future.
 HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'
 
+IJSON_BUFFER_SIZE = 64 * 1024  # 64KB, default from ijson
+
 
 class KubernetesHighPerformanceNetworkType(enum.Enum):
     """Enum for different Kubernetes cluster types with high performance
@@ -106,8 +110,9 @@ class KubernetesHighPerformanceNetworkType(enum.Enum):
             return {
                 'NCCL_SOCKET_IFNAME': 'eth0',
                 'NCCL_IB_HCA': 'ibp',
-                'UCX_NET_DEVICES': ('ibp0:1,ibp1:1,ibp2:1,ibp3:1,'
-                                    'ibp4:1,ibp5:1,ibp6:1,ibp7:1')
+                # Restrict UCX to TCP to avoid unnecessary errors. NCCL doesn't use UCX.
+                'UCX_TLS': 'tcp',
+                'UCX_NET_DEVICES': 'eth0',
             }
         else:
             # GCP clusters and generic clusters - environment variables are
@@ -235,6 +240,40 @@ def normalize_tpu_accelerator_name(accelerator: str) -> Tuple[str, int]:
     return accelerator, 1
 
 
+def _is_cloudflare_403_error(exception: Exception) -> bool:
+    """Check if an exception is a transient CloudFlare 403 error.
+
+    CloudFlare proxy 403 errors with CF-specific headers are transient and
+    should be retried, unlike real RBAC 403 errors.
+
+    Args:
+        exception: The exception to check
+
+    Returns:
+        True if this is a CloudFlare 403 error that should be retried
+    """
+    if not isinstance(exception, kubernetes.api_exception()):
+        return False
+
+    # Only check for 403 errors
+    if exception.status != 403:
+        return False
+
+    # Check for CloudFlare-specific headers
+    headers = exception.headers if hasattr(exception, 'headers') else {}
+    if not headers:
+        return False
+
+    # CloudFlare errors have CF-RAY header and/or Server: cloudflare
+    for k, v in headers.items():
+        if 'cf-ray' in k.lower():
+            return True
+        if 'server' in k.lower() and 'cloudflare' in str(v).lower():
+            return True
+
+    return False
+
+
 def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
                     retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
                     resource_type: Optional[str] = None):
@@ -269,19 +308,25 @@ def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
                         kubernetes.api_exception(),
                         kubernetes.config_exception()) as e:
                     last_exception = e
+
+                    # Check if this is a CloudFlare transient 403 error
+                    is_cloudflare_403 = _is_cloudflare_403_error(e)
+
                     # Don't retry on permanent errors like 401 (Unauthorized)
-                    # or 403 (Forbidden)
+                    # or 403 (Forbidden), unless it's a CloudFlare transient 403
                     if (isinstance(e, kubernetes.api_exception()) and
-                            e.status in (401, 403)):
+                            e.status in (401, 403) and not is_cloudflare_403):
                         # Raise KubeAPIUnreachableError exception so that the
                         # optimizer/provisioner can failover to other clouds.
                         raise exceptions.KubeAPIUnreachableError(
                             f'Kubernetes API error: {str(e)}') from e
                     if attempt < max_retries - 1:
                         sleep_time = backoff.current_backoff()
-                        logger.debug(f'Kubernetes API call {func.__name__} '
-                                     f'failed with {str(e)}. Retrying in '
-                                     f'{sleep_time:.1f}s...')
+                        error_type = 'CloudFlare 403' if is_cloudflare_403 else 'error'
+                        logger.debug(
+                            f'Kubernetes API call {func.__name__} '
+                            f'failed with {error_type} {str(e)}. Retrying in '
+                            f'{sleep_time:.1f}s...')
                         time.sleep(sleep_time)
                         continue
 
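The two hunks above teach the retry decorator to treat proxied CloudFlare 403s as transient while still failing fast on genuine RBAC 401/403s. A minimal, self-contained sketch of that classification, using a stand-in exception class rather than the real `kubernetes.api_exception()` type:

```python
# Sketch only: FakeApiException stands in for the Kubernetes client's
# ApiException, which carries .status and .headers the same way.
from typing import Dict, Optional


class FakeApiException(Exception):

    def __init__(self, status: int, headers: Optional[Dict[str, str]] = None):
        super().__init__(f'HTTP {status}')
        self.status = status
        self.headers = headers or {}


def is_cloudflare_403(exc: FakeApiException) -> bool:
    if exc.status != 403 or not exc.headers:
        return False
    for key, value in exc.headers.items():
        # CloudFlare responses carry a CF-RAY header and/or
        # 'Server: cloudflare'; a bare RBAC 403 carries neither.
        if 'cf-ray' in key.lower():
            return True
        if 'server' in key.lower() and 'cloudflare' in str(value).lower():
            return True
    return False


# A proxied 403 with CloudFlare headers is retried; a plain RBAC 403 is not.
assert is_cloudflare_403(FakeApiException(403, {'CF-RAY': '8a1b2c3d-SJC'}))
assert not is_cloudflare_403(FakeApiException(403))
```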
@@ -451,6 +496,9 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
 
     LABEL_KEY = 'gpu.nvidia.com/class'
 
+    # TODO (kyuds): fill in more label values for different accelerators.
+    ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}
+
     @classmethod
     def get_label_key(cls, accelerator: Optional[str] = None) -> str:
         return cls.LABEL_KEY
@@ -469,7 +517,8 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
 
     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:
-        return value
+        # return original label value if not found in mappings.
+        return cls.ACC_VALUE_MAPPINGS.get(value, value)
 
 
 class GKELabelFormatter(GPULabelFormatter):
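The mapping canonicalizes CoreWeave's label values to SkyPilot accelerator names while letting unmapped values pass through untouched. A short sketch of that round-trip; the `A100_PCIE_40GB` value below is an illustrative placeholder, not taken from the catalog:

```python
# Sketch of the CoreWeave value mapping above; unknown label values fall
# through unchanged.
ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}


def get_accelerator_from_label_value(value: str) -> str:
    return ACC_VALUE_MAPPINGS.get(value, value)


assert get_accelerator_from_label_value('H100_NVLINK_80GB') == 'H100'
# Hypothetical unmapped value: returned as-is.
assert get_accelerator_from_label_value('A100_PCIE_40GB') == 'A100_PCIE_40GB'
```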
@@ -689,6 +738,7 @@ def detect_gpu_label_formatter(
         for label, value in node.metadata.labels.items():
             node_labels[node.metadata.name].append((label, value))
 
+    invalid_label_values: List[Tuple[str, str, str, str]] = []
     # Check if the node labels contain any of the GPU label prefixes
     for lf in LABEL_FORMATTER_REGISTRY:
         skip = False
@@ -702,11 +752,8 @@ def detect_gpu_label_formatter(
                     if valid:
                         return lf(), node_labels
                     else:
-                        logger.warning(f'GPU label {label} matched for label '
-                                       f'formatter {lf.__class__.__name__}, '
-                                       f'but has invalid value {value}. '
-                                       f'Reason: {reason}. '
-                                       'Skipping...')
+                        invalid_label_values.append(
+                            (label, lf.__name__, value, reason))
                         skip = True
                         break
             if skip:
@@ -714,6 +761,13 @@ def detect_gpu_label_formatter(
         if skip:
             continue
 
+    for label, lf_name, value, reason in invalid_label_values:
+        logger.warning(f'GPU label {label} matched for label '
+                       f'formatter {lf_name}, '
+                       f'but has invalid value {value}. '
+                       f'Reason: {reason}. '
+                       'Skipping...')
+
     return None, node_labels
 
 
@@ -1012,15 +1066,16 @@ class GKEAutoscaler(Autoscaler):
         to fit the instance type.
         """
         for accelerator in node_pool_accelerators:
+            raw_value = accelerator['acceleratorType']
             node_accelerator_type = (
-                GKELabelFormatter.get_accelerator_from_label_value(
-                    accelerator['acceleratorType']))
+                GKELabelFormatter.get_accelerator_from_label_value(raw_value))
             # handle heterogeneous nodes.
             if not node_accelerator_type:
                 continue
             node_accelerator_count = accelerator['acceleratorCount']
-            if node_accelerator_type == requested_gpu_type and int(
-                    node_accelerator_count) >= requested_gpu_count:
+            viable_names = [node_accelerator_type.lower(), raw_value.lower()]
+            if (requested_gpu_type.lower() in viable_names and
+                    int(node_accelerator_count) >= requested_gpu_count):
                 return True
         return False
 
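The hunk above relaxes GKE node-pool matching: a request may name either the canonical accelerator or the raw `acceleratorType` value, compared case-insensitively. A minimal sketch of just that predicate, assuming `nvidia-h100-80gb` as a representative raw value:

```python
# Sketch of the relaxed matching; 'nvidia-h100-80gb' is an assumed example
# of a raw GKE acceleratorType string.
def fits(requested_gpu_type: str, canonical: str, raw_value: str,
         node_count: int, requested_count: int) -> bool:
    viable_names = [canonical.lower(), raw_value.lower()]
    return (requested_gpu_type.lower() in viable_names and
            node_count >= requested_count)


assert fits('H100', 'H100', 'nvidia-h100-80gb', 8, 8)
# The raw label value now matches too, in any casing.
assert fits('NVIDIA-H100-80GB', 'H100', 'nvidia-h100-80gb', 8, 1)
assert not fits('A100', 'H100', 'nvidia-h100-80gb', 8, 1)
```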
@@ -1137,9 +1192,51 @@ def detect_accelerator_resource(
     return has_accelerator, cluster_resources
 
 
+@dataclasses.dataclass
+class V1ObjectMeta:
+    name: str
+    labels: Dict[str, str]
+    namespace: str = ''  # Used for pods, not nodes
+
+
+@dataclasses.dataclass
+class V1NodeAddress:
+    type: str
+    address: str
+
+
+@dataclasses.dataclass
+class V1NodeStatus:
+    allocatable: Dict[str, str]
+    capacity: Dict[str, str]
+    addresses: List[V1NodeAddress]
+
+
+@dataclasses.dataclass
+class V1Node:
+    metadata: V1ObjectMeta
+    status: V1NodeStatus
+
+    @classmethod
+    def from_dict(cls, data: dict) -> 'V1Node':
+        """Create V1Node from a dictionary."""
+        return cls(metadata=V1ObjectMeta(
+            name=data['metadata']['name'],
+            labels=data['metadata'].get('labels', {}),
+        ),
+                   status=V1NodeStatus(
+                       allocatable=data['status']['allocatable'],
+                       capacity=data['status']['capacity'],
+                       addresses=[
+                           V1NodeAddress(type=addr['type'],
+                                         address=addr['address'])
+                           for addr in data['status'].get('addresses', [])
+                       ]))
+
+
 @annotations.lru_cache(scope='request', maxsize=10)
 @_retry_on_error(resource_type='node')
-def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[Any]:
+def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[V1Node]:
     """Gets the kubernetes nodes in the context.
 
     If context is None, gets the nodes in the current context.
@@ -1147,25 +1244,113 @@ def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[Any]:
1147
1244
  if context is None:
1148
1245
  context = get_current_kube_config_context_name()
1149
1246
 
1150
- nodes = kubernetes.core_api(context).list_node(
1151
- _request_timeout=kubernetes.API_TIMEOUT).items
1247
+ # Return raw urllib3.HTTPResponse object so that we can parse the json
1248
+ # more efficiently.
1249
+ response = kubernetes.core_api(context).list_node(
1250
+ _request_timeout=kubernetes.API_TIMEOUT, _preload_content=False)
1251
+ try:
1252
+ nodes = [
1253
+ V1Node.from_dict(item_dict) for item_dict in ijson.items(
1254
+ response, 'items.item', buf_size=IJSON_BUFFER_SIZE)
1255
+ ]
1256
+ finally:
1257
+ response.release_conn()
1258
+
1152
1259
  return nodes
1153
1260
 
1154
1261
 
1155
- @_retry_on_error(resource_type='pod')
1156
- def get_all_pods_in_kubernetes_cluster(*,
1157
- context: Optional[str] = None
1158
- ) -> List[Any]:
1159
- """Gets pods in all namespaces in kubernetes cluster indicated by context.
1262
+ @dataclasses.dataclass
1263
+ class V1PodStatus:
1264
+ phase: str
1265
+
1266
+
1267
+ @dataclasses.dataclass
1268
+ class V1ResourceRequirements:
1269
+ requests: Optional[Dict[str, str]]
1270
+
1271
+
1272
+ @dataclasses.dataclass
1273
+ class V1Container:
1274
+ resources: V1ResourceRequirements
1160
1275
 
1161
- Used for computing cluster resource usage.
1276
+
1277
+ @dataclasses.dataclass
1278
+ class V1PodSpec:
1279
+ containers: List[V1Container]
1280
+ node_name: Optional[str]
1281
+
1282
+
1283
+ @dataclasses.dataclass
1284
+ class V1Pod:
1285
+ metadata: V1ObjectMeta
1286
+ status: V1PodStatus
1287
+ spec: V1PodSpec
1288
+
1289
+ @classmethod
1290
+ def from_dict(cls, data: dict) -> 'V1Pod':
1291
+ """Create V1Pod from a dictionary."""
1292
+ return cls(metadata=V1ObjectMeta(
1293
+ name=data['metadata']['name'],
1294
+ labels=data['metadata'].get('labels', {}),
1295
+ namespace=data['metadata'].get('namespace'),
1296
+ ),
1297
+ status=V1PodStatus(phase=data['status'].get('phase'),),
1298
+ spec=V1PodSpec(
1299
+ node_name=data['spec'].get('nodeName'),
1300
+ containers=[
1301
+ V1Container(resources=V1ResourceRequirements(
1302
+ requests=container.get('resources', {}).get(
1303
+ 'requests') or None))
1304
+ for container in data['spec'].get('containers', [])
1305
+ ]))
1306
+
1307
+
1308
+ @_retry_on_error(resource_type='pod')
1309
+ def get_allocated_gpu_qty_by_node(
1310
+ *,
1311
+ context: Optional[str] = None,
1312
+ ) -> Dict[str, int]:
1313
+ """Gets allocated GPU quantity by each node by fetching pods in
1314
+ all namespaces in kubernetes cluster indicated by context.
1162
1315
  """
1163
1316
  if context is None:
1164
1317
  context = get_current_kube_config_context_name()
1318
+ non_included_pod_statuses = POD_STATUSES.copy()
1319
+ status_filters = ['Running', 'Pending']
1320
+ if status_filters is not None:
1321
+ non_included_pod_statuses -= set(status_filters)
1322
+ field_selector = ','.join(
1323
+ [f'status.phase!={status}' for status in non_included_pod_statuses])
1165
1324
 
1166
- pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
1167
- _request_timeout=kubernetes.API_TIMEOUT).items
1168
- return pods
1325
+ # Return raw urllib3.HTTPResponse object so that we can parse the json
1326
+ # more efficiently.
1327
+ response = kubernetes.core_api(context).list_pod_for_all_namespaces(
1328
+ _request_timeout=kubernetes.API_TIMEOUT,
1329
+ _preload_content=False,
1330
+ field_selector=field_selector)
1331
+ try:
1332
+ allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
1333
+ for item_dict in ijson.items(response,
1334
+ 'items.item',
1335
+ buf_size=IJSON_BUFFER_SIZE):
1336
+ pod = V1Pod.from_dict(item_dict)
1337
+ if should_exclude_pod_from_gpu_allocation(pod):
1338
+ logger.debug(
1339
+ f'Excluding pod {pod.metadata.name} from GPU count '
1340
+ f'calculations on node {pod.spec.node_name}')
1341
+ continue
1342
+ # Iterate over all the containers in the pod and sum the
1343
+ # GPU requests
1344
+ pod_allocated_qty = 0
1345
+ for container in pod.spec.containers:
1346
+ if container.resources.requests:
1347
+ pod_allocated_qty += get_node_accelerator_count(
1348
+ context, container.resources.requests)
1349
+ if pod_allocated_qty > 0 and pod.spec.node_name:
1350
+ allocated_qty_by_node[pod.spec.node_name] += pod_allocated_qty
1351
+ return allocated_qty_by_node
1352
+ finally:
1353
+ response.release_conn()
1169
1354
 
1170
1355
 
1171
1356
  def check_instance_fits(context: Optional[str],
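Both rewritten listers above use the same streaming pattern: ask the Kubernetes client for the raw response (`_preload_content=False`) and let `ijson` yield one element of the top-level `items` array at a time, so large clusters never require materializing the full JSON body. A minimal sketch of that pattern against an in-memory payload (the node data is hypothetical):

```python
# Sketch of the ijson streaming parse; io.BytesIO stands in for the live
# urllib3.HTTPResponse the real code reads from.
import io
import json

import ijson

payload = json.dumps({
    'items': [{
        'metadata': {'name': 'node-1', 'labels': {}},
        'status': {'allocatable': {'cpu': '8'}, 'capacity': {'cpu': '8'},
                   'addresses': []},
    }]
}).encode()

# 'items.item' selects each element under the top-level 'items' array; items
# are decoded and yielded one at a time.
for item in ijson.items(io.BytesIO(payload), 'items.item',
                        buf_size=64 * 1024):
    print(item['metadata']['name'])  # -> node-1
```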
@@ -1448,9 +1633,13 @@ def get_accelerator_label_key_values(
             if is_multi_host_tpu(node_metadata_labels):
                 continue
             for label, value in label_list:
-                if (label_formatter.match_label_key(label) and
-                        label_formatter.get_accelerator_from_label_value(
-                            value).lower() == acc_type.lower()):
+                if label_formatter.match_label_key(label):
+                    # match either canonicalized name or raw name
+                    accelerator = (label_formatter.
+                                   get_accelerator_from_label_value(value))
+                    viable = [value.lower(), accelerator.lower()]
+                    if acc_type.lower() not in viable:
+                        continue
                     if is_tpu_on_gke(acc_type):
                         assert isinstance(label_formatter,
                                           GKELabelFormatter)
@@ -1550,23 +1739,6 @@ def get_port(svc_name: str, namespace: str, context: Optional[str]) -> int:
     return head_service.spec.ports[0].node_port
 
 
-def get_external_ip(network_mode: Optional[
-        kubernetes_enums.KubernetesNetworkingMode], context: Optional[str]) -> str:
-    if network_mode == kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD:
-        return '127.0.0.1'
-    # Return the IP address of the first node with an external IP
-    nodes = kubernetes.core_api(context).list_node().items
-    for node in nodes:
-        if node.status.addresses:
-            for address in node.status.addresses:
-                if address.type == 'ExternalIP':
-                    return address.address
-    # If no external IP is found, use the API server IP
-    api_host = kubernetes.core_api(context).api_client.configuration.host
-    parsed_url = urlparse(api_host)
-    return parsed_url.hostname
-
-
 def check_credentials(context: Optional[str],
                       timeout: int = kubernetes.API_TIMEOUT,
                       run_optional_checks: bool = False) -> \
@@ -1585,7 +1757,10 @@ def check_credentials(context: Optional[str],
1585
1757
  try:
1586
1758
  namespace = get_kube_config_context_namespace(context)
1587
1759
  kubernetes.core_api(context).list_namespaced_pod(
1588
- namespace, _request_timeout=timeout)
1760
+ namespace, limit=1, _request_timeout=timeout)
1761
+ # This call is "free" because this function is a cached call,
1762
+ # and it will not be called again in this function.
1763
+ get_kubernetes_nodes(context=context)
1589
1764
  except ImportError:
1590
1765
  # TODO(romilb): Update these error strs to also include link to docs
1591
1766
  # when docs are ready.
@@ -1710,11 +1885,17 @@ class PodValidator:
 
         if isinstance(klass, str):
             if klass.startswith('list['):
-                sub_kls = re.match(r'list\[(.*)\]', klass).group(1)
+                match = re.match(r'list\[(.*)\]', klass)
+                if match is None:
+                    raise ValueError(f'Invalid list type format: {klass}')
+                sub_kls = match.group(1)
                 return [cls.__validate(sub_data, sub_kls) for sub_data in data]
 
             if klass.startswith('dict('):
-                sub_kls = re.match(r'dict\(([^,]*), (.*)\)', klass).group(2)
+                match = re.match(r'dict\(([^,]*), (.*)\)', klass)
+                if match is None:
+                    raise ValueError(f'Invalid dict type format: {klass}')
+                sub_kls = match.group(2)
                 return {k: cls.__validate(v, sub_kls) for k, v in data.items()}
 
         # convert str to class
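The guard added above matters because `re.match` returns `None` on a malformed type string, which previously surfaced as an opaque `AttributeError` on `.group()`. A small sketch of the same pattern in isolation:

```python
# Sketch of the guarded type-string parse; a missing closing bracket now
# produces a clear ValueError instead of AttributeError.
import re


def list_element_type(klass: str) -> str:
    match = re.match(r'list\[(.*)\]', klass)
    if match is None:
        raise ValueError(f'Invalid list type format: {klass}')
    return match.group(1)


assert list_element_type('list[V1Container]') == 'V1Container'
try:
    list_element_type('list[V1Container')  # malformed: no closing bracket
except ValueError as e:
    print(e)  # Invalid list type format: list[V1Container
```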
@@ -2073,6 +2254,15 @@ def get_kube_config_context_namespace(
     return DEFAULT_NAMESPACE
 
 
+def parse_cpu_or_gpu_resource_to_float(resource_str: str) -> float:
+    if not resource_str:
+        return 0.0
+    if resource_str[-1] == 'm':
+        return float(resource_str[:-1]) / 1000
+    else:
+        return float(resource_str)
+
+
 def parse_cpu_or_gpu_resource(resource_qty_str: str) -> Union[int, float]:
     resource_str = str(resource_qty_str)
     if resource_str[-1] == 'm':
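The new helper applies the standard Kubernetes quantity convention: a trailing `m` denotes millis, so `1000m` equals one CPU or GPU, and an empty string maps to `0.0` rather than raising. A quick usage sketch:

```python
# Sketch of the millicore parsing convention implemented above.
def parse_cpu_or_gpu_resource_to_float(resource_str: str) -> float:
    if not resource_str:
        return 0.0
    if resource_str[-1] == 'm':
        # '500m' -> 500 millis -> 0.5 of a CPU/GPU.
        return float(resource_str[:-1]) / 1000
    return float(resource_str)


assert parse_cpu_or_gpu_resource_to_float('500m') == 0.5
assert parse_cpu_or_gpu_resource_to_float('2') == 2.0
assert parse_cpu_or_gpu_resource_to_float('') == 0.0
```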
@@ -2150,16 +2340,9 @@ class KubernetesInstanceType:
     @staticmethod
     def is_valid_instance_type(name: str) -> bool:
         """Returns whether the given name is a valid instance type."""
-        # Before https://github.com/skypilot-org/skypilot/pull/4756,
-        # the accelerators are appended with format "--{a}{type}",
-        # e.g. "4CPU--16GB--1V100".
-        # Check both patterns to keep backward compatibility.
-        # TODO(romilb): Backward compatibility, remove after 0.11.0.
-        prev_pattern = re.compile(
-            r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--\d+\S+)?$')
         pattern = re.compile(
             r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--[\w\d-]+:\d+)?$')
-        return bool(pattern.match(name)) or bool(prev_pattern.match(name))
+        return bool(pattern.match(name))
 
     @classmethod
     def _parse_instance_type(
@@ -2176,11 +2359,6 @@ class KubernetesInstanceType:
             r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_type>[\w\d-]+):(?P<accelerator_count>\d+))?$'  # pylint: disable=line-too-long
         )
         match = pattern.match(name)
-        # TODO(romilb): Backward compatibility, remove after 0.11.0.
-        prev_pattern = re.compile(
-            r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_count>\d+)(?P<accelerator_type>\S+))?$'  # pylint: disable=line-too-long
-        )
-        prev_match = prev_pattern.match(name)
         if match:
             cpus = float(match.group('cpus'))
             memory = float(match.group('memory'))
@@ -2193,19 +2371,6 @@ class KubernetesInstanceType:
                 accelerator_count = None
                 accelerator_type = None
             return cpus, memory, accelerator_count, accelerator_type
-        # TODO(romilb): Backward compatibility, remove after 0.11.0.
-        elif prev_match:
-            cpus = float(prev_match.group('cpus'))
-            memory = float(prev_match.group('memory'))
-            accelerator_count = prev_match.group('accelerator_count')
-            accelerator_type = prev_match.group('accelerator_type')
-            if accelerator_count:
-                accelerator_count = int(accelerator_count)
-                accelerator_type = str(accelerator_type)
-            else:
-                accelerator_count = None
-                accelerator_type = None
-            return cpus, memory, accelerator_count, accelerator_type
         else:
             raise ValueError(f'Invalid instance name: {name}')
 
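With the backward-compatibility branch gone, only one instance-type grammar remains: `<cpus>CPU--<mem>GB[--<acc_type>:<acc_count>]`. A sketch showing what parses and what no longer does:

```python
# Sketch of the single remaining instance-type pattern; the pre-#4756 form
# with a leading accelerator count ('...--1V100') is rejected.
import re

PATTERN = re.compile(
    r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB'
    r'(?:--(?P<accelerator_type>[\w\d-]+):(?P<accelerator_count>\d+))?$')

match = PATTERN.match('4CPU--16GB--H100:1')
assert match is not None
assert match.group('cpus') == '4'
assert match.group('accelerator_type') == 'H100'
# The old '4CPU--16GB--1V100' form no longer matches.
assert PATTERN.match('4CPU--16GB--1V100') is None
```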
@@ -2278,16 +2443,14 @@ def construct_ssh_jump_command(
 
 
 def get_ssh_proxy_command(
-    k8s_ssh_target: str,
-    network_mode: kubernetes_enums.KubernetesNetworkingMode,
+    pod_name: str,
     private_key_path: str,
     context: Optional[str],
     namespace: str,
 ) -> str:
     """Generates the SSH proxy command to connect to the pod.
 
-    Uses a jump pod if the network mode is NODEPORT, and direct port-forwarding
-    if the network mode is PORTFORWARD.
+    Uses direct port-forwarding.
 
     By default, establishing an SSH connection creates a communication
     channel to a remote node by setting up a TCP connection. When a
@@ -2298,17 +2461,8 @@ def get_ssh_proxy_command(
     Pods within a Kubernetes cluster have internal IP addresses that are
     typically not accessible from outside the cluster. Since the default TCP
     connection of SSH won't allow access to these pods, we employ a
-    ProxyCommand to establish the required communication channel. We offer this
-    in two different networking options: NodePort/port-forward.
-
-    With the NodePort networking mode, a NodePort service is launched. This
-    service opens an external port on the node which redirects to the desired
-    port to a SSH jump pod. When establishing an SSH session in this mode, the
-    ProxyCommand makes use of this external port to create a communication
-    channel directly to port 22, which is the default port ssh server listens
-    on, of the jump pod.
+    ProxyCommand to establish the required communication channel.
 
-    With Port-forward mode, instead of directly exposing an external port,
     'kubectl port-forward' sets up a tunnel between a local port
     (127.0.0.1:23100) and port 22 of the provisioned pod. Then we establish TCP
     connection to the local end of this tunnel, 127.0.0.1:23100, using 'socat'.
@@ -2319,38 +2473,26 @@ def get_ssh_proxy_command(
     the local machine.
 
     Args:
-        k8s_ssh_target: str; The Kubernetes object that will be used as the
-            target for SSH. If network_mode is NODEPORT, this is the name of the
-            service. If network_mode is PORTFORWARD, this is the pod name.
-        network_mode: KubernetesNetworkingMode; networking mode for ssh
-            session. It is either 'NODEPORT' or 'PORTFORWARD'
+        pod_name: str; The Kubernetes pod name that will be used as the
+            target for SSH.
         private_key_path: str; Path to the private key to use for SSH.
             This key must be authorized to access the SSH jump pod.
-            Required for NODEPORT networking mode.
         namespace: Kubernetes namespace to use.
-            Required for NODEPORT networking mode.
     """
-    # Fetch IP to connect to for the jump svc
-    ssh_jump_ip = get_external_ip(network_mode, context)
+    ssh_jump_ip = '127.0.0.1'  # Local end of the port-forward tunnel
     assert private_key_path is not None, 'Private key path must be provided'
-    if network_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
-        assert namespace is not None, 'Namespace must be provided for NodePort'
-        ssh_jump_port = get_port(k8s_ssh_target, namespace, context)
-        ssh_jump_proxy_command = construct_ssh_jump_command(
-            private_key_path, ssh_jump_ip, ssh_jump_port=ssh_jump_port)
-    else:
-        ssh_jump_proxy_command_path = create_proxy_command_script()
-        ssh_jump_proxy_command = construct_ssh_jump_command(
-            private_key_path,
-            ssh_jump_ip,
-            ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
-            proxy_cmd_path=ssh_jump_proxy_command_path,
-            proxy_cmd_target_pod=k8s_ssh_target,
-            # We embed both the current context and namespace to the SSH proxy
-            # command to make sure SSH still works when the current
-            # context/namespace is changed by the user.
-            current_kube_context=context,
-            current_kube_namespace=namespace)
+    ssh_jump_proxy_command_path = create_proxy_command_script()
+    ssh_jump_proxy_command = construct_ssh_jump_command(
+        private_key_path,
+        ssh_jump_ip,
+        ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
+        proxy_cmd_path=ssh_jump_proxy_command_path,
+        proxy_cmd_target_pod=pod_name,
+        # We embed both the current context and namespace to the SSH proxy
+        # command to make sure SSH still works when the current
+        # context/namespace is changed by the user.
+        current_kube_context=context,
+        current_kube_namespace=namespace)
     return ssh_jump_proxy_command
 
 
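For orientation, a hypothetical sketch of the command shape this path produces: ssh dials 127.0.0.1 through a ProxyCommand script that runs `kubectl port-forward` to the pod and bridges stdio with `socat`. The script path, user, and argument order below are illustrative placeholders; the real values come from `create_proxy_command_script()` and `construct_ssh_jump_command()`:

```python
# Illustrative only: shows the general 'ssh -o ProxyCommand=...' shape, not
# the exact command SkyPilot emits.
def build_proxy_ssh_command(private_key_path: str, pod_name: str,
                            kube_context: str, kube_namespace: str) -> str:
    # Hypothetical helper script that port-forwards to the pod and pipes
    # the tunnel's local end (e.g. 127.0.0.1:23100) over stdio via socat.
    proxy_cmd = (f'/path/to/port-forward-proxy.sh {pod_name} '
                 f'{kube_context} {kube_namespace}')
    return (f'ssh -i {private_key_path} '
            f"-o ProxyCommand='{proxy_cmd}' sky@127.0.0.1")


print(build_proxy_ssh_command('~/.ssh/sky-key', 'sky-head-pod',
                              'my-context', 'default'))
```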
@@ -2382,240 +2524,6 @@ def create_proxy_command_script() -> str:
     return PORT_FORWARD_PROXY_CMD_PATH
 
 
-def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
-                       context: Optional[str],
-                       service_type: kubernetes_enums.KubernetesServiceType):
-    """Sets up Kubernetes service resource to access for SSH jump pod.
-
-    This method acts as a necessary complement to be run along with
-    setup_ssh_jump_pod(...) method. This service ensures the pod is accessible.
-
-    Args:
-        ssh_jump_name: Name to use for the SSH jump service
-        namespace: Namespace to create the SSH jump service in
-        service_type: Networking configuration on either to use NodePort
-            or ClusterIP service to ssh in
-    """
-    # Fill in template - ssh_key_secret and ssh_jump_image are not required for
-    # the service spec, so we pass in empty strs.
-    content = fill_ssh_jump_template('', '', ssh_jump_name, service_type.value)
-
-    # Add custom metadata from config
-    merge_custom_metadata(content['service_spec']['metadata'], context)
-
-    # Create service
-    try:
-        kubernetes.core_api(context).create_namespaced_service(
-            namespace, content['service_spec'])
-    except kubernetes.api_exception() as e:
-        # SSH Jump Pod service already exists.
-        if e.status == 409:
-            ssh_jump_service = kubernetes.core_api(
-                context).read_namespaced_service(name=ssh_jump_name,
-                                                 namespace=namespace)
-            curr_svc_type = ssh_jump_service.spec.type
-            if service_type.value == curr_svc_type:
-                # If the currently existing SSH Jump service's type is identical
-                # to user's configuration for networking mode
-                logger.debug(
-                    f'SSH Jump Service {ssh_jump_name} already exists in the '
-                    'cluster, using it.')
-            else:
-                # If a different type of service type for SSH Jump pod compared
-                # to user's configuration for networking mode exists, we remove
-                # existing service to create a new one following user's config
-                kubernetes.core_api(context).delete_namespaced_service(
-                    name=ssh_jump_name, namespace=namespace)
-                kubernetes.core_api(context).create_namespaced_service(
-                    namespace, content['service_spec'])
-                port_forward_mode = (
-                    kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value)
-                nodeport_mode = (
-                    kubernetes_enums.KubernetesNetworkingMode.NODEPORT.value)
-                clusterip_svc = (
-                    kubernetes_enums.KubernetesServiceType.CLUSTERIP.value)
-                nodeport_svc = (
-                    kubernetes_enums.KubernetesServiceType.NODEPORT.value)
-                curr_network_mode = port_forward_mode \
-                    if curr_svc_type == clusterip_svc else nodeport_mode
-                new_network_mode = nodeport_mode \
-                    if curr_svc_type == clusterip_svc else port_forward_mode
-                new_svc_type = nodeport_svc \
-                    if curr_svc_type == clusterip_svc else clusterip_svc
-                logger.info(
-                    f'Switching the networking mode from '
-                    f'\'{curr_network_mode}\' to \'{new_network_mode}\' '
-                    f'following networking configuration. Deleting existing '
-                    f'\'{curr_svc_type}\' service and recreating as '
-                    f'\'{new_svc_type}\' service.')
-        else:
-            raise
-    else:
-        logger.info(f'Created SSH Jump Service {ssh_jump_name}.')
-
-
-def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
-                       ssh_key_secret: str, namespace: str,
-                       context: Optional[str]):
-    """Sets up Kubernetes RBAC and pod for SSH jump host.
-
-    Our Kubernetes implementation uses a SSH jump pod to reach SkyPilot clusters
-    running inside a cluster. This function sets up the resources needed for
-    the SSH jump pod. This includes a service account which grants the jump pod
-    permission to watch for other SkyPilot pods and terminate itself if there
-    are no SkyPilot pods running.
-
-    setup_ssh_jump_service must also be run to ensure that the SSH jump pod is
-    reachable.
-
-    Args:
-        ssh_jump_image: Container image to use for the SSH jump pod
-        ssh_jump_name: Name to use for the SSH jump pod
-        ssh_key_secret: Secret name for the SSH key stored in the cluster
-        namespace: Namespace to create the SSH jump pod in
-    """
-    # Fill in template - service is created separately so service_type is not
-    # required, so we pass in empty str.
-    content = fill_ssh_jump_template(ssh_key_secret, ssh_jump_image,
-                                     ssh_jump_name, '')
-
-    # Add custom metadata to all objects
-    for object_type in content.keys():
-        merge_custom_metadata(content[object_type]['metadata'], context)
-
-    # ServiceAccount
-    try:
-        kubernetes.core_api(context).create_namespaced_service_account(
-            namespace, content['service_account'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                'SSH Jump ServiceAccount already exists in the cluster, using '
-                'it.')
-        else:
-            raise
-    else:
-        logger.info('Created SSH Jump ServiceAccount.')
-    # Role
-    try:
-        kubernetes.auth_api(context).create_namespaced_role(
-            namespace, content['role'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                'SSH Jump Role already exists in the cluster, using it.')
-        else:
-            raise
-    else:
-        logger.info('Created SSH Jump Role.')
-    # RoleBinding
-    try:
-        kubernetes.auth_api(context).create_namespaced_role_binding(
-            namespace, content['role_binding'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                'SSH Jump RoleBinding already exists in the cluster, using '
-                'it.')
-        else:
-            raise
-    else:
-        logger.info('Created SSH Jump RoleBinding.')
-    # Pod
-    try:
-        kubernetes.core_api(context).create_namespaced_pod(
-            namespace, content['pod_spec'])
-    except kubernetes.api_exception() as e:
-        if e.status == 409:
-            logger.info(
-                f'SSH Jump Host {ssh_jump_name} already exists in the cluster, '
-                'using it.')
-        else:
-            raise
-    else:
-        logger.info(f'Created SSH Jump Host {ssh_jump_name}.')
-
-
-def clean_zombie_ssh_jump_pod(namespace: str, context: Optional[str],
-                              node_id: str):
-    """Analyzes SSH jump pod and removes if it is in a bad state
-
-    Prevents the existence of a dangling SSH jump pod. This could happen
-    in case the pod main container did not start properly (or failed). In that
-    case, jump pod lifecycle manager will not function properly to
-    remove the pod and service automatically, and must be done manually.
-
-    Args:
-        namespace: Namespace to remove the SSH jump pod and service from
-        node_id: Name of head pod
-    """
-
-    def find(l, predicate):
-        """Utility function to find element in given list"""
-        results = [x for x in l if predicate(x)]
-        return results[0] if results else None
-
-    # Get the SSH jump pod name from the head pod
-    try:
-        pod = kubernetes.core_api(context).read_namespaced_pod(
-            node_id, namespace)
-    except kubernetes.api_exception() as e:
-        if e.status == 404:
-            logger.warning(f'Failed to get pod {node_id},'
-                           ' but the pod was not found (404).')
-        raise
-    else:
-        ssh_jump_name = pod.metadata.labels.get('skypilot-ssh-jump')
-    try:
-        ssh_jump_pod = kubernetes.core_api(context).read_namespaced_pod(
-            ssh_jump_name, namespace)
-        cont_ready_cond = find(ssh_jump_pod.status.conditions,
-                               lambda c: c.type == 'ContainersReady')
-        if (cont_ready_cond and cont_ready_cond.status
-                == 'False') or ssh_jump_pod.status.phase == 'Pending':
-            # Either the main container is not ready or the pod failed
-            # to schedule. To be on the safe side and prevent a dangling
-            # ssh jump pod, lets remove it and the service. Otherwise, main
-            # container is ready and its lifecycle management script takes
-            # care of the cleaning.
-            kubernetes.core_api(context).delete_namespaced_pod(
-                ssh_jump_name, namespace)
-            kubernetes.core_api(context).delete_namespaced_service(
-                ssh_jump_name, namespace)
-    except kubernetes.api_exception() as e:
-        # We keep the warning in debug to avoid polluting the `sky launch`
-        # output.
-        logger.debug(f'Tried to check ssh jump pod {ssh_jump_name},'
-                     f' but got error {e}\n. Consider running `kubectl '
-                     f'delete pod {ssh_jump_name} -n {namespace}` to manually '
-                     'remove the pod if it has crashed.')
-        # We encountered an issue while checking ssh jump pod. To be on
-        # the safe side, lets remove its service so the port is freed
-        try:
-            kubernetes.core_api(context).delete_namespaced_service(
-                ssh_jump_name, namespace)
-        except kubernetes.api_exception():
-            pass
-
-
-def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
-                           ssh_jump_name: str, service_type: str) -> Dict:
-    template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
-                                 'kubernetes-ssh-jump.yml.j2')
-    if not os.path.exists(template_path):
-        raise FileNotFoundError(
-            'Template "kubernetes-ssh-jump.j2" does not exist.')
-    with open(template_path, 'r', encoding='utf-8') as fin:
-        template = fin.read()
-    j2_template = jinja2.Template(template)
-    cont = j2_template.render(name=ssh_jump_name,
-                              image=ssh_jump_image,
-                              secret=ssh_key_secret,
-                              service_type=service_type)
-    content = yaml_utils.safe_load(cont)
-    return content
-
-
 def check_port_forward_mode_dependencies(
         raise_error: bool = True) -> Optional[List[str]]:
     """Checks if 'socat' and 'nc' are installed
@@ -2762,26 +2670,22 @@ def combine_pod_config_fields(
     merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
     # We don't use override_configs in `get_effective_region_config`, as merging
     # the pod config requires special handling.
-    if isinstance(cloud, clouds.SSH):
-        kubernetes_config = skypilot_config.get_effective_region_config(
-            cloud='ssh', region=None, keys=('pod_config',), default_value={})
-        override_pod_config = config_utils.get_cloud_config_value_from_dict(
-            dict_config=cluster_config_overrides,
-            cloud='ssh',
-            keys=('pod_config',),
-            default_value={})
-    else:
-        kubernetes_config = skypilot_config.get_effective_region_config(
-            cloud='kubernetes',
-            region=context,
-            keys=('pod_config',),
-            default_value={})
-        override_pod_config = config_utils.get_cloud_config_value_from_dict(
-            dict_config=cluster_config_overrides,
-            cloud='kubernetes',
-            region=context,
-            keys=('pod_config',),
-            default_value={})
+    cloud_str = 'ssh' if isinstance(cloud, clouds.SSH) else 'kubernetes'
+    context_str = context
+    if isinstance(cloud, clouds.SSH) and context is not None:
+        assert context.startswith('ssh-'), 'SSH context must start with "ssh-"'
+        context_str = context[len('ssh-'):]
+    kubernetes_config = skypilot_config.get_effective_region_config(
+        cloud=cloud_str,
+        region=context_str,
+        keys=('pod_config',),
+        default_value={})
+    override_pod_config = config_utils.get_cloud_config_value_from_dict(
+        dict_config=cluster_config_overrides,
+        cloud=cloud_str,
+        region=context_str,
+        keys=('pod_config',),
+        default_value={})
     config_utils.merge_k8s_configs(kubernetes_config, override_pod_config)
 
     # Merge the kubernetes config into the YAML for both head and worker nodes.
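This refactor collapses the separate SSH and Kubernetes branches into a single
lookup keyed by cloud_str, with the 'ssh-' prefix stripped from the context
before it is used as the config region. A tiny sketch of that normalization,
assuming SSH node pool contexts are always named 'ssh-<pool>':

    from typing import Optional

    def normalize_region(cloud_str: str,
                         context: Optional[str]) -> Optional[str]:
        # SSH node pools reuse Kubernetes contexts named 'ssh-<pool>'; config
        # lookups key the region on the bare pool name.
        if cloud_str == 'ssh' and context is not None:
            assert context.startswith('ssh-'), context
            return context[len('ssh-'):]
        return context

    assert normalize_region('ssh', 'ssh-my-pool') == 'my-pool'
    assert normalize_region('kubernetes', 'gke_proj_us-west1_c1') == (
        'gke_proj_us-west1_c1')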
@@ -2800,9 +2704,11 @@ def combine_metadata_fields(cluster_yaml_obj: Dict[str, Any],
     Obeys the same add or update semantics as combine_pod_config_fields().
     """
     merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
+    context, cloud_str = get_cleaned_context_and_cloud_str(context)
+
     # Get custom_metadata from global config
     custom_metadata = skypilot_config.get_effective_region_config(
-        cloud='kubernetes',
+        cloud=cloud_str,
         region=context,
         keys=('custom_metadata',),
         default_value={})
@@ -2810,7 +2716,7 @@ def combine_metadata_fields(cluster_yaml_obj: Dict[str, Any],
     # Get custom_metadata from task-level config overrides
     override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
         dict_config=cluster_config_overrides,
-        cloud='kubernetes',
+        cloud=cloud_str,
         region=context,
         keys=('custom_metadata',),
         default_value={})
@@ -2867,9 +2773,11 @@ def merge_custom_metadata(
 
     Merge is done in-place, so return is not required
     """
+    context, cloud_str = get_cleaned_context_and_cloud_str(context)
+
     # Get custom_metadata from global config
     custom_metadata = skypilot_config.get_effective_region_config(
-        cloud='kubernetes',
+        cloud=cloud_str,
         region=context,
         keys=('custom_metadata',),
         default_value={})
@@ -2878,7 +2786,7 @@ def merge_custom_metadata(
     if cluster_config_overrides is not None:
         override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
             dict_config=cluster_config_overrides,
-            cloud='kubernetes',
+            cloud=cloud_str,
             region=context,
             keys=('custom_metadata',),
             default_value={})
@@ -2889,7 +2797,8 @@ def merge_custom_metadata(
     config_utils.merge_k8s_configs(original_metadata, custom_metadata)
 
 
-def check_nvidia_runtime_class(context: Optional[str] = None) -> bool:
+@_retry_on_error(resource_type='runtimeclass')
+def check_nvidia_runtime_class(*, context: Optional[str] = None) -> bool:
     """Checks if the 'nvidia' RuntimeClass exists in the cluster"""
     # Fetch the list of available RuntimeClasses
     runtime_classes = kubernetes.node_api(context).list_runtime_class()
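One behavioral note on the new signature: the bare * makes context
keyword-only, so any positional call sites must be updated. Plain-Python
illustration:

    def check(*, context=None):
        return context

    check(context='gke-ctx')  # OK
    # check('gke-ctx')        # TypeError: takes 0 positional arguments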
@@ -3108,14 +3017,6 @@ def get_kubernetes_node_info(
     information.
     """
     nodes = get_kubernetes_nodes(context=context)
-    # Get the pods to get the real-time resource usage
-    try:
-        pods = get_all_pods_in_kubernetes_cluster(context=context)
-    except kubernetes.api_exception() as e:
-        if e.status == 403:
-            pods = None
-        else:
-            raise
 
     lf, _ = detect_gpu_label_formatter(context)
     if not lf:
@@ -3123,6 +3024,29 @@ def get_kubernetes_node_info(
     else:
         label_keys = lf.get_label_keys()
 
+    # Check if all nodes have no accelerators to avoid fetching pods
+    has_accelerator_nodes = False
+    for node in nodes:
+        accelerator_count = get_node_accelerator_count(context,
+                                                       node.status.allocatable)
+        if accelerator_count > 0:
+            has_accelerator_nodes = True
+            break
+
+    # Get the allocated GPU quantity by each node
+    allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+    error_on_get_allocated_gpu_qty_by_node = False
+    if has_accelerator_nodes:
+        try:
+            allocated_qty_by_node = get_allocated_gpu_qty_by_node(
+                context=context)
+        except kubernetes.api_exception() as e:
+            if e.status == 403:
+                error_on_get_allocated_gpu_qty_by_node = True
+                pass
+            else:
+                raise
+
     node_info_dict: Dict[str, models.KubernetesNodeInfo] = {}
     has_multi_host_tpu = False
 
@@ -3152,32 +3076,21 @@ def get_kubernetes_node_info(
                 node_ip = address.address
                 break
 
-        allocated_qty = 0
         accelerator_count = get_node_accelerator_count(context,
                                                        node.status.allocatable)
+        if accelerator_count == 0:
+            node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
+                name=node.metadata.name,
+                accelerator_type=accelerator_name,
+                total={'accelerator_count': 0},
+                free={'accelerators_available': 0},
+                ip_address=node_ip)
+            continue
 
-        if pods is None:
+        if not has_accelerator_nodes or error_on_get_allocated_gpu_qty_by_node:
             accelerators_available = -1
-
         else:
-            for pod in pods:
-                # Get all the pods running on the node
-                if (pod.spec.node_name == node.metadata.name and
-                        pod.status.phase in ['Running', 'Pending']):
-                    # Skip pods that should not count against GPU count
-                    if should_exclude_pod_from_gpu_allocation(pod):
-                        logger.debug(
-                            f'Excluding low priority pod '
-                            f'{pod.metadata.name} from GPU allocation '
-                            f'calculations on node {node.metadata.name}')
-                        continue
-                    # Iterate over all the containers in the pod and sum the
-                    # GPU requests
-                    for container in pod.spec.containers:
-                        if container.resources.requests:
-                            allocated_qty += get_node_accelerator_count(
-                                context, container.resources.requests)
-
+            allocated_qty = allocated_qty_by_node[node.metadata.name]
             accelerators_available = accelerator_count - allocated_qty
 
         # Exclude multi-host TPUs from being processed.
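The deleted per-pod loop is replaced by one call to
get_allocated_gpu_qty_by_node, computed once per cluster instead of once per
node. Judging from the removed code, that helper presumably sums container GPU
requests of Running/Pending pods per node; a rough sketch under that
assumption (the real code resolves the GPU resource key via
get_node_accelerator_count rather than hardcoding 'nvidia.com/gpu'):

    import collections
    from typing import Any, Dict, Iterable

    def sum_allocated_gpus(pods: Iterable[Any]) -> Dict[str, int]:
        allocated: Dict[str, int] = collections.defaultdict(int)
        for pod in pods:
            # Only scheduled or soon-to-be-scheduled pods hold GPUs.
            if pod.status.phase not in ('Running', 'Pending'):
                continue
            # Low-priority pods are excluded, as in the deleted loop.
            if should_exclude_pod_from_gpu_allocation(pod):
                continue
            for container in pod.spec.containers:
                requests = container.resources.requests or {}
                allocated[pod.spec.node_name] += int(
                    requests.get('nvidia.com/gpu', 0))
        return allocated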
@@ -3224,7 +3137,11 @@ def filter_pods(namespace: str,
                 context: Optional[str],
                 tag_filters: Dict[str, str],
                 status_filters: Optional[List[str]] = None) -> Dict[str, Any]:
-    """Filters pods by tags and status."""
+    """Filters pods by tags and status.
+
+    Returned dict is sorted by name, with workers sorted by their numeric suffix.
+    This ensures consistent ordering for SSH configuration and other operations.
+    """
     non_included_pod_statuses = POD_STATUSES.copy()
 
     field_selector = ''
@@ -3242,7 +3159,32 @@ def filter_pods(namespace: str,
     pods = [
         pod for pod in pod_list.items if pod.metadata.deletion_timestamp is None
     ]
-    return {pod.metadata.name: pod for pod in pods}
+
+    # Sort pods by name, with workers ordered by their numeric suffix.
+    # This ensures consistent ordering (e.g., cluster-head, cluster-worker1,
+    # cluster-worker2, cluster-worker3, ...) even when the Kubernetes API
+    # returns them in arbitrary order. Pod names that are somehow neither
+    # head nor worker still sort deterministically, at the end of the
+    # list.
+    def get_pod_sort_key(
+        pod: V1Pod
+    ) -> Union[Tuple[Literal[0], str], Tuple[Literal[1], int], Tuple[Literal[2],
+                                                                     str]]:
+        name = pod.metadata.name
+        name_suffix = name.split('-')[-1]
+        if name_suffix == 'head':
+            return (0, name)
+        elif name_suffix.startswith('worker'):
+            try:
+                return (1, int(name_suffix.split('worker')[-1]))
+            except (ValueError, IndexError):
+                return (2, name)
+        else:
+            return (2, name)
+
+    sorted_pods = sorted(pods, key=get_pod_sort_key)
+
+    return {pod.metadata.name: pod for pod in sorted_pods}
 
 
 def _remove_pod_annotation(pod: Any,
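The numeric-suffix sort matters because plain lexicographic ordering would
place 'worker10' before 'worker2'. A quick check of the intended ordering with
hypothetical pod names:

    def sort_key(name: str):
        # Same tiering as get_pod_sort_key above, on bare names.
        suffix = name.split('-')[-1]
        if suffix == 'head':
            return (0, name)
        if suffix.startswith('worker'):
            try:
                return (1, int(suffix[len('worker'):]))
            except ValueError:
                pass
        return (2, name)

    names = ['c-worker10', 'c-worker2', 'c-head', 'c-oddball']
    assert sorted(names, key=sort_key) == [
        'c-head', 'c-worker2', 'c-worker10', 'c-oddball']
    # Plain string sort gets the workers backwards:
    assert sorted(names) == [
        'c-head', 'c-oddball', 'c-worker10', 'c-worker2']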
@@ -3371,13 +3313,13 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
 
     try:
         pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
-            label_selector='skypilot-cluster',
+            label_selector=provision_constants.TAG_SKYPILOT_CLUSTER_NAME,
             _request_timeout=kubernetes.API_TIMEOUT).items
     except kubernetes.max_retry_error():
         raise exceptions.ResourcesUnavailableError(
             'Timed out trying to get SkyPilot pods from Kubernetes cluster. '
             'Please check if the cluster is healthy and retry. To debug, run: '
-            'kubectl get pods --selector=skypilot-cluster --all-namespaces'
+            'kubectl get pods --selector=skypilot-cluster-name --all-namespaces'
         ) from None
     return pods
 
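Both the old and new selectors are bare label keys. In Kubernetes
label-selector syntax a bare key is an existence match: it selects every pod
carrying that label, whatever its value. Assuming TAG_SKYPILOT_CLUSTER_NAME is
'skypilot-cluster-name', the new call is equivalent to:

    # Existence selector: any pod that has the label key at all.
    kubernetes.core_api(context).list_pod_for_all_namespaces(
        label_selector='skypilot-cluster-name')
    # An equality selector would instead pin one cluster:
    #     label_selector='skypilot-cluster-name=mycluster-2ea4'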
@@ -3514,7 +3456,8 @@ def process_skypilot_pods(
     serve_controllers: List[KubernetesSkyPilotClusterInfo] = []
 
     for pod in pods:
-        cluster_name_on_cloud = pod.metadata.labels.get('skypilot-cluster')
+        cluster_name_on_cloud = pod.metadata.labels.get(
+            provision_constants.TAG_SKYPILOT_CLUSTER_NAME)
         cluster_name = cluster_name_on_cloud.rsplit(
             '-', 1
         )[0]  # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
@@ -3541,9 +3484,20 @@ def process_skypilot_pods(
                     f'requesting GPUs: {pod.metadata.name}')
             gpu_label = label_formatter.get_label_key()
             # Get GPU name from pod node selector
-            if pod.spec.node_selector is not None:
-                gpu_name = label_formatter.get_accelerator_from_label_value(
-                    pod.spec.node_selector.get(gpu_label))
+            node_selector_terms = (
+                pod.spec.affinity.node_affinity.
+                required_during_scheduling_ignored_during_execution.
+                node_selector_terms)
+            if node_selector_terms is not None:
+                expressions = []
+                for term in node_selector_terms:
+                    if term.match_expressions:
+                        expressions.extend(term.match_expressions)
+                for expression in expressions:
+                    if expression.key == gpu_label and expression.operator == 'In':
+                        gpu_name = label_formatter.get_accelerator_from_label_value(
+                            expression.values[0])
+                        break
 
             resources = resources_lib.Resources(
                 cloud=clouds.Kubernetes(),
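The GPU type is now read from the pod's required node affinity instead of
spec.node_selector. For orientation, this is the shape of object the new code
walks, built with the official kubernetes Python client (label key and value
are illustrative, not necessarily the ones SkyPilot emits):

    from kubernetes import client

    affinity = client.V1Affinity(node_affinity=client.V1NodeAffinity(
        required_during_scheduling_ignored_during_execution=client.
        V1NodeSelector(node_selector_terms=[
            client.V1NodeSelectorTerm(match_expressions=[
                # The new code scans for operator 'In' on the GPU label key
                # and takes the first value as the accelerator name.
                client.V1NodeSelectorRequirement(
                    key='skypilot.co/accelerator',
                    operator='In',
                    values=['h100']),
            ])
        ])))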
@@ -3790,3 +3744,13 @@ def should_exclude_pod_from_gpu_allocation(pod) -> bool:
         return True
 
     return False
+
+
+def get_cleaned_context_and_cloud_str(
+        context: Optional[str]) -> Tuple[Optional[str], str]:
+    """Return the cleaned context and relevant cloud string from a context."""
+    cloud_str = 'kubernetes'
+    if context is not None and context.startswith('ssh-'):
+        cloud_str = 'ssh'
+        context = context[len('ssh-'):]
+    return context, cloud_str
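Expected behavior of the new helper, shown doctest-style (context names are
illustrative):

    >>> get_cleaned_context_and_cloud_str('ssh-my-pool')
    ('my-pool', 'ssh')
    >>> get_cleaned_context_and_cloud_str('gke_proj_us-west1_c1')
    ('gke_proj_us-west1_c1', 'kubernetes')
    >>> get_cleaned_context_and_cloud_str(None)
    (None, 'kubernetes')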