skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251210__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (429)
  1. sky/__init__.py +12 -2
  2. sky/adaptors/aws.py +27 -22
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/adaptors/slurm.py +478 -0
  14. sky/admin_policy.py +20 -0
  15. sky/authentication.py +157 -263
  16. sky/backends/__init__.py +3 -2
  17. sky/backends/backend.py +11 -3
  18. sky/backends/backend_utils.py +630 -185
  19. sky/backends/cloud_vm_ray_backend.py +1111 -928
  20. sky/backends/local_docker_backend.py +9 -5
  21. sky/backends/task_codegen.py +971 -0
  22. sky/backends/wheel_utils.py +18 -0
  23. sky/catalog/__init__.py +8 -3
  24. sky/catalog/aws_catalog.py +4 -0
  25. sky/catalog/common.py +19 -1
  26. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  27. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  28. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  29. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  30. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  31. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  32. sky/catalog/kubernetes_catalog.py +36 -32
  33. sky/catalog/primeintellect_catalog.py +95 -0
  34. sky/catalog/runpod_catalog.py +5 -1
  35. sky/catalog/seeweb_catalog.py +184 -0
  36. sky/catalog/shadeform_catalog.py +165 -0
  37. sky/catalog/slurm_catalog.py +243 -0
  38. sky/check.py +87 -46
  39. sky/client/cli/command.py +1004 -434
  40. sky/client/cli/flags.py +4 -2
  41. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  42. sky/client/cli/utils.py +79 -0
  43. sky/client/common.py +12 -2
  44. sky/client/sdk.py +188 -65
  45. sky/client/sdk_async.py +34 -33
  46. sky/cloud_stores.py +82 -3
  47. sky/clouds/__init__.py +8 -0
  48. sky/clouds/aws.py +337 -129
  49. sky/clouds/azure.py +24 -18
  50. sky/clouds/cloud.py +47 -13
  51. sky/clouds/cudo.py +16 -13
  52. sky/clouds/do.py +9 -7
  53. sky/clouds/fluidstack.py +12 -5
  54. sky/clouds/gcp.py +14 -7
  55. sky/clouds/hyperbolic.py +12 -5
  56. sky/clouds/ibm.py +12 -5
  57. sky/clouds/kubernetes.py +80 -45
  58. sky/clouds/lambda_cloud.py +12 -5
  59. sky/clouds/nebius.py +23 -9
  60. sky/clouds/oci.py +19 -12
  61. sky/clouds/paperspace.py +4 -1
  62. sky/clouds/primeintellect.py +317 -0
  63. sky/clouds/runpod.py +85 -24
  64. sky/clouds/scp.py +12 -8
  65. sky/clouds/seeweb.py +477 -0
  66. sky/clouds/shadeform.py +400 -0
  67. sky/clouds/slurm.py +578 -0
  68. sky/clouds/ssh.py +6 -3
  69. sky/clouds/utils/scp_utils.py +61 -50
  70. sky/clouds/vast.py +43 -27
  71. sky/clouds/vsphere.py +14 -16
  72. sky/core.py +296 -195
  73. sky/dashboard/out/404.html +1 -1
  74. sky/dashboard/out/_next/static/KYAhEFa3FTfq4JyKVgo-s/_buildManifest.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/1141-9c810f01ff4f398a.js +11 -0
  76. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  77. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  79. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  80. sky/dashboard/out/_next/static/chunks/3294.ddda8c6c6f9f24dc.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  82. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-b589397dc09c5b4e.js} +1 -1
  83. sky/dashboard/out/_next/static/chunks/3850-fd5696f3bbbaddae.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  86. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  87. sky/dashboard/out/_next/static/chunks/6856-da20c5fd999f319c.js +1 -0
  88. sky/dashboard/out/_next/static/chunks/6990-09cbf02d3cd518c3.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  90. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  92. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  93. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  94. sky/dashboard/out/_next/static/chunks/9353-8369df1cf105221c.js +1 -0
  95. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  96. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  97. sky/dashboard/out/_next/static/chunks/pages/_app-68b647e26f9d2793.js +34 -0
  98. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-33f525539665fdfd.js +16 -0
  99. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a7565f586ef86467.js +1 -0
  100. sky/dashboard/out/_next/static/chunks/pages/clusters-9e5d47818b9bdadd.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/pages/{config-dfb9bf07b13045f4.js → config-718cdc365de82689.js} +1 -1
  102. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-12c559ec4d81fdbd.js} +1 -1
  103. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-d187cd0413d72475.js} +1 -1
  104. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-895847b6cf200b04.js +16 -0
  105. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-8d0f4655400b4eb9.js +21 -0
  106. sky/dashboard/out/_next/static/chunks/pages/jobs-e5a98f17f8513a96.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/pages/plugins/[...slug]-4f46050ca065d8f8.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/pages/users-2f7646eb77785a2c.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/pages/volumes-ef19d49c6d0e8500.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-96e0f298308da7e2.js} +1 -1
  111. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-cb4da3abe08ebf19.js} +1 -1
  112. sky/dashboard/out/_next/static/chunks/webpack-fba3de387ff6bb08.js +1 -0
  113. sky/dashboard/out/_next/static/css/c5a4cfd2600fc715.css +3 -0
  114. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  115. sky/dashboard/out/clusters/[cluster].html +1 -1
  116. sky/dashboard/out/clusters.html +1 -1
  117. sky/dashboard/out/config.html +1 -1
  118. sky/dashboard/out/index.html +1 -1
  119. sky/dashboard/out/infra/[context].html +1 -1
  120. sky/dashboard/out/infra.html +1 -1
  121. sky/dashboard/out/jobs/[job].html +1 -1
  122. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  123. sky/dashboard/out/jobs.html +1 -1
  124. sky/dashboard/out/plugins/[...slug].html +1 -0
  125. sky/dashboard/out/users.html +1 -1
  126. sky/dashboard/out/volumes.html +1 -1
  127. sky/dashboard/out/workspace/new.html +1 -1
  128. sky/dashboard/out/workspaces/[name].html +1 -1
  129. sky/dashboard/out/workspaces.html +1 -1
  130. sky/data/data_utils.py +92 -1
  131. sky/data/mounting_utils.py +177 -30
  132. sky/data/storage.py +200 -19
  133. sky/data/storage_utils.py +10 -45
  134. sky/exceptions.py +18 -7
  135. sky/execution.py +74 -31
  136. sky/global_user_state.py +605 -191
  137. sky/jobs/__init__.py +2 -0
  138. sky/jobs/client/sdk.py +101 -4
  139. sky/jobs/client/sdk_async.py +31 -5
  140. sky/jobs/constants.py +15 -8
  141. sky/jobs/controller.py +726 -284
  142. sky/jobs/file_content_utils.py +128 -0
  143. sky/jobs/log_gc.py +193 -0
  144. sky/jobs/recovery_strategy.py +250 -100
  145. sky/jobs/scheduler.py +271 -173
  146. sky/jobs/server/core.py +367 -114
  147. sky/jobs/server/server.py +81 -35
  148. sky/jobs/server/utils.py +89 -35
  149. sky/jobs/state.py +1498 -620
  150. sky/jobs/utils.py +771 -306
  151. sky/logs/agent.py +40 -5
  152. sky/logs/aws.py +9 -19
  153. sky/metrics/utils.py +282 -39
  154. sky/models.py +2 -0
  155. sky/optimizer.py +7 -6
  156. sky/provision/__init__.py +38 -1
  157. sky/provision/aws/config.py +34 -13
  158. sky/provision/aws/instance.py +5 -2
  159. sky/provision/azure/instance.py +5 -3
  160. sky/provision/common.py +22 -0
  161. sky/provision/cudo/instance.py +4 -3
  162. sky/provision/do/instance.py +4 -3
  163. sky/provision/docker_utils.py +112 -28
  164. sky/provision/fluidstack/instance.py +6 -5
  165. sky/provision/gcp/config.py +6 -1
  166. sky/provision/gcp/instance.py +4 -2
  167. sky/provision/hyperbolic/instance.py +4 -2
  168. sky/provision/instance_setup.py +66 -20
  169. sky/provision/kubernetes/__init__.py +2 -0
  170. sky/provision/kubernetes/config.py +7 -44
  171. sky/provision/kubernetes/constants.py +0 -1
  172. sky/provision/kubernetes/instance.py +609 -213
  173. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  174. sky/provision/kubernetes/network.py +12 -8
  175. sky/provision/kubernetes/network_utils.py +8 -25
  176. sky/provision/kubernetes/utils.py +422 -422
  177. sky/provision/kubernetes/volume.py +150 -18
  178. sky/provision/lambda_cloud/instance.py +16 -13
  179. sky/provision/nebius/instance.py +6 -2
  180. sky/provision/nebius/utils.py +103 -86
  181. sky/provision/oci/instance.py +4 -2
  182. sky/provision/paperspace/instance.py +4 -3
  183. sky/provision/primeintellect/__init__.py +10 -0
  184. sky/provision/primeintellect/config.py +11 -0
  185. sky/provision/primeintellect/instance.py +454 -0
  186. sky/provision/primeintellect/utils.py +398 -0
  187. sky/provision/provisioner.py +45 -15
  188. sky/provision/runpod/__init__.py +2 -0
  189. sky/provision/runpod/instance.py +4 -3
  190. sky/provision/runpod/volume.py +69 -13
  191. sky/provision/scp/instance.py +307 -130
  192. sky/provision/seeweb/__init__.py +11 -0
  193. sky/provision/seeweb/config.py +13 -0
  194. sky/provision/seeweb/instance.py +812 -0
  195. sky/provision/shadeform/__init__.py +11 -0
  196. sky/provision/shadeform/config.py +12 -0
  197. sky/provision/shadeform/instance.py +351 -0
  198. sky/provision/shadeform/shadeform_utils.py +83 -0
  199. sky/provision/slurm/__init__.py +12 -0
  200. sky/provision/slurm/config.py +13 -0
  201. sky/provision/slurm/instance.py +572 -0
  202. sky/provision/slurm/utils.py +583 -0
  203. sky/provision/vast/instance.py +9 -4
  204. sky/provision/vast/utils.py +10 -6
  205. sky/provision/volume.py +164 -0
  206. sky/provision/vsphere/common/ssl_helper.py +1 -1
  207. sky/provision/vsphere/common/vapiconnect.py +2 -1
  208. sky/provision/vsphere/common/vim_utils.py +3 -2
  209. sky/provision/vsphere/instance.py +8 -6
  210. sky/provision/vsphere/vsphere_utils.py +8 -1
  211. sky/resources.py +11 -3
  212. sky/schemas/api/responses.py +107 -6
  213. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  214. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  215. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  216. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  217. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  218. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  219. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  220. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  221. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  222. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  223. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  224. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  225. sky/schemas/generated/jobsv1_pb2.py +86 -0
  226. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  227. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  228. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  229. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  230. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  231. sky/schemas/generated/servev1_pb2.py +58 -0
  232. sky/schemas/generated/servev1_pb2.pyi +115 -0
  233. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  234. sky/serve/autoscalers.py +2 -0
  235. sky/serve/client/impl.py +55 -21
  236. sky/serve/constants.py +4 -3
  237. sky/serve/controller.py +17 -11
  238. sky/serve/load_balancing_policies.py +1 -1
  239. sky/serve/replica_managers.py +219 -142
  240. sky/serve/serve_rpc_utils.py +179 -0
  241. sky/serve/serve_state.py +63 -54
  242. sky/serve/serve_utils.py +145 -109
  243. sky/serve/server/core.py +46 -25
  244. sky/serve/server/impl.py +311 -162
  245. sky/serve/server/server.py +21 -19
  246. sky/serve/service.py +84 -68
  247. sky/serve/service_spec.py +45 -7
  248. sky/server/auth/loopback.py +38 -0
  249. sky/server/auth/oauth2_proxy.py +12 -7
  250. sky/server/common.py +47 -24
  251. sky/server/config.py +62 -28
  252. sky/server/constants.py +9 -1
  253. sky/server/daemons.py +109 -38
  254. sky/server/metrics.py +76 -96
  255. sky/server/middleware_utils.py +166 -0
  256. sky/server/plugins.py +222 -0
  257. sky/server/requests/executor.py +384 -145
  258. sky/server/requests/payloads.py +83 -19
  259. sky/server/requests/preconditions.py +15 -13
  260. sky/server/requests/request_names.py +123 -0
  261. sky/server/requests/requests.py +511 -157
  262. sky/server/requests/serializers/decoders.py +48 -17
  263. sky/server/requests/serializers/encoders.py +102 -20
  264. sky/server/requests/serializers/return_value_serializers.py +60 -0
  265. sky/server/requests/threads.py +117 -0
  266. sky/server/rest.py +116 -24
  267. sky/server/server.py +497 -179
  268. sky/server/server_utils.py +30 -0
  269. sky/server/stream_utils.py +219 -45
  270. sky/server/uvicorn.py +30 -19
  271. sky/setup_files/MANIFEST.in +6 -1
  272. sky/setup_files/alembic.ini +8 -0
  273. sky/setup_files/dependencies.py +64 -19
  274. sky/setup_files/setup.py +44 -44
  275. sky/sky_logging.py +13 -5
  276. sky/skylet/attempt_skylet.py +116 -24
  277. sky/skylet/configs.py +3 -1
  278. sky/skylet/constants.py +139 -29
  279. sky/skylet/events.py +74 -14
  280. sky/skylet/executor/__init__.py +1 -0
  281. sky/skylet/executor/slurm.py +189 -0
  282. sky/skylet/job_lib.py +143 -105
  283. sky/skylet/log_lib.py +252 -8
  284. sky/skylet/log_lib.pyi +47 -7
  285. sky/skylet/providers/ibm/node_provider.py +12 -8
  286. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  287. sky/skylet/runtime_utils.py +21 -0
  288. sky/skylet/services.py +524 -0
  289. sky/skylet/skylet.py +27 -2
  290. sky/skylet/subprocess_daemon.py +104 -28
  291. sky/skypilot_config.py +99 -79
  292. sky/ssh_node_pools/constants.py +12 -0
  293. sky/ssh_node_pools/core.py +40 -3
  294. sky/ssh_node_pools/deploy/__init__.py +4 -0
  295. sky/ssh_node_pools/deploy/deploy.py +952 -0
  296. sky/ssh_node_pools/deploy/tunnel_utils.py +199 -0
  297. sky/ssh_node_pools/deploy/utils.py +173 -0
  298. sky/ssh_node_pools/server.py +20 -21
  299. sky/{utils/kubernetes/ssh_utils.py → ssh_node_pools/utils.py} +9 -6
  300. sky/task.py +221 -104
  301. sky/templates/aws-ray.yml.j2 +1 -0
  302. sky/templates/azure-ray.yml.j2 +1 -0
  303. sky/templates/cudo-ray.yml.j2 +1 -0
  304. sky/templates/do-ray.yml.j2 +1 -0
  305. sky/templates/fluidstack-ray.yml.j2 +1 -0
  306. sky/templates/gcp-ray.yml.j2 +1 -0
  307. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  308. sky/templates/ibm-ray.yml.j2 +2 -1
  309. sky/templates/jobs-controller.yaml.j2 +3 -0
  310. sky/templates/kubernetes-ray.yml.j2 +204 -55
  311. sky/templates/lambda-ray.yml.j2 +1 -0
  312. sky/templates/nebius-ray.yml.j2 +3 -0
  313. sky/templates/oci-ray.yml.j2 +1 -0
  314. sky/templates/paperspace-ray.yml.j2 +1 -0
  315. sky/templates/primeintellect-ray.yml.j2 +72 -0
  316. sky/templates/runpod-ray.yml.j2 +1 -0
  317. sky/templates/scp-ray.yml.j2 +1 -0
  318. sky/templates/seeweb-ray.yml.j2 +171 -0
  319. sky/templates/shadeform-ray.yml.j2 +73 -0
  320. sky/templates/slurm-ray.yml.j2 +85 -0
  321. sky/templates/vast-ray.yml.j2 +2 -0
  322. sky/templates/vsphere-ray.yml.j2 +1 -0
  323. sky/templates/websocket_proxy.py +188 -43
  324. sky/usage/usage_lib.py +16 -4
  325. sky/users/model.conf +1 -1
  326. sky/users/permission.py +84 -44
  327. sky/users/rbac.py +31 -3
  328. sky/utils/accelerator_registry.py +6 -3
  329. sky/utils/admin_policy_utils.py +18 -5
  330. sky/utils/annotations.py +128 -6
  331. sky/utils/asyncio_utils.py +78 -0
  332. sky/utils/atomic.py +1 -1
  333. sky/utils/auth_utils.py +153 -0
  334. sky/utils/cli_utils/status_utils.py +12 -7
  335. sky/utils/cluster_utils.py +28 -6
  336. sky/utils/command_runner.py +283 -30
  337. sky/utils/command_runner.pyi +63 -7
  338. sky/utils/common.py +3 -1
  339. sky/utils/common_utils.py +55 -7
  340. sky/utils/config_utils.py +1 -14
  341. sky/utils/context.py +127 -40
  342. sky/utils/context_utils.py +73 -18
  343. sky/utils/controller_utils.py +229 -70
  344. sky/utils/db/db_utils.py +95 -18
  345. sky/utils/db/kv_cache.py +149 -0
  346. sky/utils/db/migration_utils.py +24 -7
  347. sky/utils/env_options.py +4 -0
  348. sky/utils/git.py +559 -1
  349. sky/utils/kubernetes/create_cluster.sh +15 -30
  350. sky/utils/kubernetes/delete_cluster.sh +10 -7
  351. sky/utils/kubernetes/generate_kind_config.py +6 -66
  352. sky/utils/kubernetes/gpu_labeler.py +13 -3
  353. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  354. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  355. sky/utils/kubernetes/kubernetes_deploy_utils.py +187 -260
  356. sky/utils/kubernetes/rsync_helper.sh +11 -3
  357. sky/utils/kubernetes/ssh-tunnel.sh +7 -376
  358. sky/utils/kubernetes_enums.py +7 -15
  359. sky/utils/lock_events.py +4 -4
  360. sky/utils/locks.py +128 -31
  361. sky/utils/log_utils.py +0 -319
  362. sky/utils/resource_checker.py +13 -10
  363. sky/utils/resources_utils.py +53 -29
  364. sky/utils/rich_utils.py +8 -4
  365. sky/utils/schemas.py +138 -52
  366. sky/utils/subprocess_utils.py +17 -4
  367. sky/utils/thread_utils.py +91 -0
  368. sky/utils/timeline.py +2 -1
  369. sky/utils/ux_utils.py +35 -1
  370. sky/utils/volume.py +88 -4
  371. sky/utils/yaml_utils.py +9 -0
  372. sky/volumes/client/sdk.py +48 -10
  373. sky/volumes/server/core.py +59 -22
  374. sky/volumes/server/server.py +46 -17
  375. sky/volumes/volume.py +54 -42
  376. sky/workspaces/core.py +57 -21
  377. sky/workspaces/server.py +13 -12
  378. sky_templates/README.md +3 -0
  379. sky_templates/__init__.py +3 -0
  380. sky_templates/ray/__init__.py +0 -0
  381. sky_templates/ray/start_cluster +183 -0
  382. sky_templates/ray/stop_cluster +75 -0
  383. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/METADATA +343 -65
  384. skypilot_nightly-1.0.0.dev20251210.dist-info/RECORD +629 -0
  385. skypilot_nightly-1.0.0.dev20251210.dist-info/top_level.txt +2 -0
  386. sky/client/cli/git.py +0 -549
  387. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  388. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  389. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  390. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  391. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  392. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  393. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +0 -1
  394. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  395. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  396. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  397. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  398. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  399. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  400. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  401. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  402. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  403. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  404. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  405. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  406. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  407. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  408. sky/dashboard/out/_next/static/chunks/pages/_app-ce361c6959bc2001.js +0 -34
  409. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  410. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  411. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  412. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  413. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  414. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  415. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  416. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  417. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  418. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  419. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  420. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  421. sky/utils/kubernetes/cleanup-tunnel.sh +0 -62
  422. sky/utils/kubernetes/deploy_remote_cluster.py +0 -1299
  423. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  424. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  425. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  426. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → KYAhEFa3FTfq4JyKVgo-s}/_ssgManifest.js +0 -0
  427. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/WHEEL +0 -0
  428. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/entry_points.txt +0 -0
  429. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251210.dist-info}/licenses/LICENSE +0 -0
sky/provision/kubernetes/utils.py
@@ -1,4 +1,5 @@
 """Kubernetes utilities for SkyPilot."""
+import collections
 import copy
 import dataclasses
 import datetime
@@ -13,8 +14,10 @@ import shutil
 import subprocess
 import time
 import typing
-from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
-from urllib.parse import urlparse
+from typing import (Any, Callable, Dict, List, Literal, Optional, Set, Tuple,
+                    Union)
+
+import ijson
 
 from sky import clouds
 from sky import exceptions
@@ -32,7 +35,6 @@ from sky.skylet import constants
32
35
  from sky.utils import annotations
33
36
  from sky.utils import common_utils
34
37
  from sky.utils import config_utils
35
- from sky.utils import directory_utils
36
38
  from sky.utils import env_options
37
39
  from sky.utils import kubernetes_enums
38
40
  from sky.utils import schemas
@@ -61,6 +63,8 @@ HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_NAME = 'sky-data'
 # and store all data that needs to be persisted in future.
 HIGH_AVAILABILITY_DEPLOYMENT_VOLUME_MOUNT_PATH = '/home/sky'
 
+IJSON_BUFFER_SIZE = 64 * 1024  # 64KB, default from ijson
+
 
 class KubernetesHighPerformanceNetworkType(enum.Enum):
     """Enum for different Kubernetes cluster types with high performance
@@ -106,8 +110,9 @@ class KubernetesHighPerformanceNetworkType(enum.Enum):
             return {
                 'NCCL_SOCKET_IFNAME': 'eth0',
                 'NCCL_IB_HCA': 'ibp',
-                'UCX_NET_DEVICES': ('ibp0:1,ibp1:1,ibp2:1,ibp3:1,'
-                                    'ibp4:1,ibp5:1,ibp6:1,ibp7:1')
+                # Restrict UCX to TCP to avoid unnecessary errors. NCCL doesn't use UCX.
+                'UCX_TLS': 'tcp',
+                'UCX_NET_DEVICES': 'eth0',
             }
         else:
             # GCP clusters and generic clusters - environment variables are
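Note: the new mapping pins UCX to TCP over `eth0` instead of enumerating InfiniBand devices, while NCCL keeps using the IB HCAs directly. A hedged sketch of how such an env-var mapping is typically rendered into a Kubernetes container spec (field names per the core/v1 pod schema; not SkyPilot's actual template code):

```python
# Sketch only: render the env mapping returned above into a container's
# `env` list as the Kubernetes pod API expects it.
envs = {
    'NCCL_SOCKET_IFNAME': 'eth0',
    'NCCL_IB_HCA': 'ibp',
    'UCX_TLS': 'tcp',
    'UCX_NET_DEVICES': 'eth0',
}
container_env = [{'name': k, 'value': v} for k, v in envs.items()]
```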
@@ -235,6 +240,40 @@ def normalize_tpu_accelerator_name(accelerator: str) -> Tuple[str, int]:
     return accelerator, 1
 
 
+def _is_cloudflare_403_error(exception: Exception) -> bool:
+    """Check if an exception is a transient CloudFlare 403 error.
+
+    CloudFlare proxy 403 errors with CF-specific headers are transient and
+    should be retried, unlike real RBAC 403 errors.
+
+    Args:
+        exception: The exception to check
+
+    Returns:
+        True if this is a CloudFlare 403 error that should be retried
+    """
+    if not isinstance(exception, kubernetes.api_exception()):
+        return False
+
+    # Only check for 403 errors
+    if exception.status != 403:
+        return False
+
+    # Check for CloudFlare-specific headers
+    headers = exception.headers if hasattr(exception, 'headers') else {}
+    if not headers:
+        return False
+
+    # CloudFlare errors have a CF-RAY header and/or Server: cloudflare
+    for k, v in headers.items():
+        if 'cf-ray' in k.lower():
+            return True
+        if 'server' in k.lower() and 'cloudflare' in str(v).lower():
+            return True
+
+    return False
+
+
 def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
                     retry_interval=DEFAULT_RETRY_INTERVAL_SECONDS,
                     resource_type: Optional[str] = None):
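Note: the heuristic distinguishes a 403 emitted by a CloudFlare proxy in front of the API server (transient, worth retrying) from a genuine RBAC 403 (permanent). A minimal self-contained sketch of the same header check, decoupled from the kubernetes client so it can be exercised directly — the real function above operates on `kubernetes.client.ApiException`:

```python
from typing import Mapping


def looks_like_cloudflare_403(status: int, headers: Mapping[str, str]) -> bool:
    """Return True for 403s that carry CloudFlare-specific headers."""
    if status != 403 or not headers:
        return False
    for k, v in headers.items():
        if 'cf-ray' in k.lower():
            return True
        if 'server' in k.lower() and 'cloudflare' in v.lower():
            return True
    return False


assert looks_like_cloudflare_403(403, {'CF-RAY': '8a1b2c3d4e5f6789-SJC'})
assert not looks_like_cloudflare_403(403, {'Server': 'nginx'})  # real RBAC 403
assert not looks_like_cloudflare_403(401, {'CF-RAY': 'abc'})  # not a 403
```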
@@ -269,19 +308,25 @@ def _retry_on_error(max_retries=DEFAULT_MAX_RETRIES,
                         kubernetes.api_exception(),
                         kubernetes.config_exception()) as e:
                     last_exception = e
+
+                    # Check if this is a CloudFlare transient 403 error
+                    is_cloudflare_403 = _is_cloudflare_403_error(e)
+
                     # Don't retry on permanent errors like 401 (Unauthorized)
-                    # or 403 (Forbidden)
+                    # or 403 (Forbidden), unless it's a CloudFlare transient 403
                     if (isinstance(e, kubernetes.api_exception()) and
-                            e.status in (401, 403)):
+                            e.status in (401, 403) and not is_cloudflare_403):
                         # Raise KubeAPIUnreachableError exception so that the
                         # optimizer/provisioner can failover to other clouds.
                         raise exceptions.KubeAPIUnreachableError(
                             f'Kubernetes API error: {str(e)}') from e
                     if attempt < max_retries - 1:
                         sleep_time = backoff.current_backoff()
-                        logger.debug(f'Kubernetes API call {func.__name__} '
-                                     f'failed with {str(e)}. Retrying in '
-                                     f'{sleep_time:.1f}s...')
+                        error_type = 'CloudFlare 403' if is_cloudflare_403 else 'error'
+                        logger.debug(
+                            f'Kubernetes API call {func.__name__} '
+                            f'failed with {error_type} {str(e)}. Retrying in '
+                            f'{sleep_time:.1f}s...')
                         time.sleep(sleep_time)
                         continue
@@ -451,6 +496,9 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
 
     LABEL_KEY = 'gpu.nvidia.com/class'
 
+    # TODO (kyuds): fill in more label values for different accelerators.
+    ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}
+
     @classmethod
     def get_label_key(cls, accelerator: Optional[str] = None) -> str:
         return cls.LABEL_KEY
@@ -469,7 +517,8 @@ class CoreWeaveLabelFormatter(GPULabelFormatter):
 
     @classmethod
     def get_accelerator_from_label_value(cls, value: str) -> str:
-        return value
+        # return original label value if not found in mappings.
+        return cls.ACC_VALUE_MAPPINGS.get(value, value)
 
 
 class GKELabelFormatter(GPULabelFormatter):
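Note: unknown label values now pass through unchanged, so clusters with unmapped accelerator classes keep working. The mapping behavior in isolation (a standalone sketch of the classmethod above; the second label value is illustrative, not a confirmed CoreWeave class):

```python
ACC_VALUE_MAPPINGS = {'H100_NVLINK_80GB': 'H100'}


def get_accelerator_from_label_value(value: str) -> str:
    # Known CoreWeave label values are canonicalized; unknown ones
    # fall through unchanged.
    return ACC_VALUE_MAPPINGS.get(value, value)


assert get_accelerator_from_label_value('H100_NVLINK_80GB') == 'H100'
assert get_accelerator_from_label_value('GH200_96GB') == 'GH200_96GB'
```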
@@ -689,6 +738,7 @@ def detect_gpu_label_formatter(
         for label, value in node.metadata.labels.items():
             node_labels[node.metadata.name].append((label, value))
 
+    invalid_label_values: List[Tuple[str, str, str, str]] = []
     # Check if the node labels contain any of the GPU label prefixes
    for lf in LABEL_FORMATTER_REGISTRY:
        skip = False
@@ -702,11 +752,8 @@ def detect_gpu_label_formatter(
                     if valid:
                         return lf(), node_labels
                     else:
-                        logger.warning(f'GPU label {label} matched for label '
-                                       f'formatter {lf.__class__.__name__}, '
-                                       f'but has invalid value {value}. '
-                                       f'Reason: {reason}. '
-                                       'Skipping...')
+                        invalid_label_values.append(
+                            (label, lf.__name__, value, reason))
                         skip = True
                         break
             if skip:
@@ -714,6 +761,13 @@ def detect_gpu_label_formatter(
        if skip:
            continue
 
+    for label, lf_name, value, reason in invalid_label_values:
+        logger.warning(f'GPU label {label} matched for label '
+                       f'formatter {lf_name}, '
+                       f'but has invalid value {value}. '
+                       f'Reason: {reason}. '
+                       'Skipping...')
+
     return None, node_labels
 
 
@@ -1012,15 +1066,16 @@ class GKEAutoscaler(Autoscaler):
         to fit the instance type.
         """
         for accelerator in node_pool_accelerators:
+            raw_value = accelerator['acceleratorType']
             node_accelerator_type = (
-                GKELabelFormatter.get_accelerator_from_label_value(
-                    accelerator['acceleratorType']))
+                GKELabelFormatter.get_accelerator_from_label_value(raw_value))
             # handle heterogeneous nodes.
             if not node_accelerator_type:
                 continue
             node_accelerator_count = accelerator['acceleratorCount']
-            if node_accelerator_type == requested_gpu_type and int(
-                    node_accelerator_count) >= requested_gpu_count:
+            viable_names = [node_accelerator_type.lower(), raw_value.lower()]
+            if (requested_gpu_type.lower() in viable_names and
+                    int(node_accelerator_count) >= requested_gpu_count):
                 return True
         return False
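Note: matching is now case-insensitive against both names, so a request may specify either the canonical SkyPilot accelerator (e.g. `A100`) or the raw GKE `acceleratorType` (e.g. `nvidia-tesla-a100`). A self-contained sketch of the relaxed check (function name and values illustrative):

```python
def pool_fits(requested_gpu_type: str, raw_value: str, canonical: str,
              pool_count: int, requested_count: int) -> bool:
    # A node pool matches if the request names either the canonical
    # accelerator or the raw GKE acceleratorType, with enough GPUs.
    viable_names = [canonical.lower(), raw_value.lower()]
    return (requested_gpu_type.lower() in viable_names and
            pool_count >= requested_count)


assert pool_fits('A100', 'nvidia-tesla-a100', 'A100', 8, 4)
assert pool_fits('nvidia-tesla-a100', 'nvidia-tesla-a100', 'A100', 8, 4)
assert not pool_fits('H100', 'nvidia-tesla-a100', 'A100', 8, 4)
```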
@@ -1137,9 +1192,76 @@ def detect_accelerator_resource(
     return has_accelerator, cluster_resources
 
 
+@dataclasses.dataclass
+class V1ObjectMeta:
+    name: str
+    labels: Dict[str, str]
+    namespace: str = ''  # Used for pods, not nodes
+
+
+@dataclasses.dataclass
+class V1NodeAddress:
+    type: str
+    address: str
+
+
+@dataclasses.dataclass
+class V1NodeCondition:
+    """Represents a Kubernetes node condition."""
+    type: str
+    status: str
+
+
+@dataclasses.dataclass
+class V1NodeStatus:
+    allocatable: Dict[str, str]
+    capacity: Dict[str, str]
+    addresses: List[V1NodeAddress]
+    conditions: List[V1NodeCondition]
+
+
+@dataclasses.dataclass
+class V1Node:
+    """Represents a Kubernetes node."""
+    metadata: V1ObjectMeta
+    status: V1NodeStatus
+
+    @classmethod
+    def from_dict(cls, data: dict) -> 'V1Node':
+        """Create V1Node from a dictionary."""
+        return cls(
+            metadata=V1ObjectMeta(
+                name=data['metadata']['name'],
+                labels=data['metadata'].get('labels', {}),
+            ),
+            status=V1NodeStatus(
+                allocatable=data['status']['allocatable'],
+                capacity=data['status']['capacity'],
+                addresses=[
+                    V1NodeAddress(type=addr['type'], address=addr['address'])
+                    for addr in data['status'].get('addresses', [])
+                ],
+                conditions=[
+                    V1NodeCondition(type=cond['type'], status=cond['status'])
+                    for cond in data['status'].get('conditions', [])
+                ]))
+
+    def is_ready(self) -> bool:
+        """Check if the node is ready based on its conditions.
+
+        A node is considered ready if it has a 'Ready' condition with
+        status 'True'.
+        """
+        for condition in self.status.conditions:
+            if condition.type == 'Ready':
+                return condition.status == 'True'
+        return False
+
+
 @annotations.lru_cache(scope='request', maxsize=10)
 @_retry_on_error(resource_type='node')
-def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[Any]:
+def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[V1Node]:
     """Gets the kubernetes nodes in the context.
 
     If context is None, gets the nodes in the current context.
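Note: `from_dict` consumes the raw JSON dict the API server returns for a node, so streamed items can be materialized without the official client's deserializer. A usage sketch against the `V1Node` class added above (field values illustrative):

```python
node = V1Node.from_dict({
    'metadata': {
        'name': 'gpu-node-1',
        'labels': {'gpu.nvidia.com/class': 'H100'},
    },
    'status': {
        'allocatable': {'cpu': '32', 'nvidia.com/gpu': '8'},
        'capacity': {'cpu': '32', 'nvidia.com/gpu': '8'},
        'addresses': [{'type': 'InternalIP', 'address': '10.0.0.4'}],
        'conditions': [{'type': 'Ready', 'status': 'True'}],
    },
})
assert node.is_ready()
assert node.status.allocatable['nvidia.com/gpu'] == '8'
```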
@@ -1147,25 +1269,113 @@ def get_kubernetes_nodes(*, context: Optional[str] = None) -> List[Any]:
     if context is None:
         context = get_current_kube_config_context_name()
 
-    nodes = kubernetes.core_api(context).list_node(
-        _request_timeout=kubernetes.API_TIMEOUT).items
+    # Return raw urllib3.HTTPResponse object so that we can parse the json
+    # more efficiently.
+    response = kubernetes.core_api(context).list_node(
+        _request_timeout=kubernetes.API_TIMEOUT, _preload_content=False)
+    try:
+        nodes = [
+            V1Node.from_dict(item_dict) for item_dict in ijson.items(
+                response, 'items.item', buf_size=IJSON_BUFFER_SIZE)
+        ]
+    finally:
+        response.release_conn()
+
     return nodes
 
 
-@_retry_on_error(resource_type='pod')
-def get_all_pods_in_kubernetes_cluster(*,
-                                       context: Optional[str] = None
-                                      ) -> List[Any]:
-    """Gets pods in all namespaces in kubernetes cluster indicated by context.
+@dataclasses.dataclass
+class V1PodStatus:
+    phase: str
+
+
+@dataclasses.dataclass
+class V1ResourceRequirements:
+    requests: Optional[Dict[str, str]]
+
+
+@dataclasses.dataclass
+class V1Container:
+    resources: V1ResourceRequirements
+
+
+@dataclasses.dataclass
+class V1PodSpec:
+    containers: List[V1Container]
+    node_name: Optional[str]
+
+
+@dataclasses.dataclass
+class V1Pod:
+    metadata: V1ObjectMeta
+    status: V1PodStatus
+    spec: V1PodSpec
+
+    @classmethod
+    def from_dict(cls, data: dict) -> 'V1Pod':
+        """Create V1Pod from a dictionary."""
+        return cls(
+            metadata=V1ObjectMeta(
+                name=data['metadata']['name'],
+                labels=data['metadata'].get('labels', {}),
+                namespace=data['metadata'].get('namespace'),
+            ),
+            status=V1PodStatus(phase=data['status'].get('phase')),
+            spec=V1PodSpec(
+                node_name=data['spec'].get('nodeName'),
+                containers=[
+                    V1Container(resources=V1ResourceRequirements(
+                        requests=container.get('resources', {}).get(
+                            'requests') or None))
+                    for container in data['spec'].get('containers', [])
+                ]))
+
 
-    Used for computing cluster resource usage.
+@_retry_on_error(resource_type='pod')
+def get_allocated_gpu_qty_by_node(
+    *,
+    context: Optional[str] = None,
+) -> Dict[str, int]:
+    """Gets allocated GPU quantity by each node by fetching pods in
+    all namespaces in the kubernetes cluster indicated by context.
     """
     if context is None:
         context = get_current_kube_config_context_name()
+    non_included_pod_statuses = POD_STATUSES.copy()
+    status_filters = ['Running', 'Pending']
+    if status_filters is not None:
+        non_included_pod_statuses -= set(status_filters)
+    field_selector = ','.join(
+        [f'status.phase!={status}' for status in non_included_pod_statuses])
 
-    pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
-        _request_timeout=kubernetes.API_TIMEOUT).items
-    return pods
+    # Return raw urllib3.HTTPResponse object so that we can parse the json
+    # more efficiently.
+    response = kubernetes.core_api(context).list_pod_for_all_namespaces(
+        _request_timeout=kubernetes.API_TIMEOUT,
+        _preload_content=False,
+        field_selector=field_selector)
+    try:
+        allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+        for item_dict in ijson.items(response,
+                                     'items.item',
+                                     buf_size=IJSON_BUFFER_SIZE):
+            pod = V1Pod.from_dict(item_dict)
+            if should_exclude_pod_from_gpu_allocation(pod):
+                logger.debug(
+                    f'Excluding pod {pod.metadata.name} from GPU count '
+                    f'calculations on node {pod.spec.node_name}')
+                continue
+            # Iterate over all the containers in the pod and sum the
+            # GPU requests
+            pod_allocated_qty = 0
+            for container in pod.spec.containers:
+                if container.resources.requests:
+                    pod_allocated_qty += get_node_accelerator_count(
+                        context, container.resources.requests)
+            if pod_allocated_qty > 0 and pod.spec.node_name:
+                allocated_qty_by_node[pod.spec.node_name] += pod_allocated_qty
+        return allocated_qty_by_node
+    finally:
+        response.release_conn()
 
 
 def check_instance_fits(context: Optional[str],
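Note: `_preload_content=False` hands back the raw `urllib3` response, and `ijson.items(response, 'items.item')` then yields one node/pod dict at a time instead of materializing the entire list response in memory — the point of the change for large clusters. A minimal self-contained sketch of the same pattern, using an in-memory stream in place of the HTTP response:

```python
import io

import ijson

payload = b'{"items": [{"metadata": {"name": "a"}}, {"metadata": {"name": "b"}}]}'
names = [
    item['metadata']['name'] for item in ijson.items(
        io.BytesIO(payload), 'items.item', buf_size=64 * 1024)
]
assert names == ['a', 'b']
```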
@@ -1266,11 +1476,12 @@ def check_instance_fits(context: Optional[str],
             return False, str(e)
     # Get the set of nodes that have the GPU type
     gpu_nodes = [
-        node for node in nodes if gpu_label_key in node.metadata.labels and
+        node for node in nodes
+        if node.is_ready() and gpu_label_key in node.metadata.labels and
         node.metadata.labels[gpu_label_key] in gpu_label_values
     ]
     if not gpu_nodes:
-        return False, f'No GPU nodes found with {acc_type} on the cluster'
+        return False, f'No ready GPU nodes found with {acc_type} on the cluster'
     if is_tpu_on_gke(acc_type):
         # If requested accelerator is a TPU type, check if the cluster
         # has sufficient TPU resource to meet the requirement.
@@ -1294,7 +1505,9 @@ def check_instance_fits(context: Optional[str],
                 f'enough CPU (> {k8s_instance_type.cpus} CPUs) and/or '
                 f'memory (> {k8s_instance_type.memory} G). ')
     else:
-        candidate_nodes = nodes
+        candidate_nodes = [node for node in nodes if node.is_ready()]
+        if not candidate_nodes:
+            return False, 'No ready nodes found in the cluster.'
         not_fit_reason_prefix = (f'No nodes found with enough '
                                  f'CPU (> {k8s_instance_type.cpus} CPUs) '
                                  'and/or memory '
@@ -1448,9 +1661,13 @@ def get_accelerator_label_key_values(
             if is_multi_host_tpu(node_metadata_labels):
                 continue
             for label, value in label_list:
-                if (label_formatter.match_label_key(label) and
-                        label_formatter.get_accelerator_from_label_value(
-                            value).lower() == acc_type.lower()):
+                if label_formatter.match_label_key(label):
+                    # match either canonicalized name or raw name
+                    accelerator = (label_formatter.
+                                   get_accelerator_from_label_value(value))
+                    viable = [value.lower(), accelerator.lower()]
+                    if acc_type.lower() not in viable:
+                        continue
                     if is_tpu_on_gke(acc_type):
                         assert isinstance(label_formatter,
                                           GKELabelFormatter)
@@ -1550,23 +1767,6 @@ def get_port(svc_name: str, namespace: str, context: Optional[str]) -> int:
     return head_service.spec.ports[0].node_port
 
 
-def get_external_ip(network_mode: Optional[
-        kubernetes_enums.KubernetesNetworkingMode], context: Optional[str]) -> str:
-    if network_mode == kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD:
-        return '127.0.0.1'
-    # Return the IP address of the first node with an external IP
-    nodes = kubernetes.core_api(context).list_node().items
-    for node in nodes:
-        if node.status.addresses:
-            for address in node.status.addresses:
-                if address.type == 'ExternalIP':
-                    return address.address
-    # If no external IP is found, use the API server IP
-    api_host = kubernetes.core_api(context).api_client.configuration.host
-    parsed_url = urlparse(api_host)
-    return parsed_url.hostname
-
-
 def check_credentials(context: Optional[str],
                       timeout: int = kubernetes.API_TIMEOUT,
                       run_optional_checks: bool = False) -> \
@@ -1585,7 +1785,10 @@ def check_credentials(context: Optional[str],
     try:
         namespace = get_kube_config_context_namespace(context)
         kubernetes.core_api(context).list_namespaced_pod(
-            namespace, _request_timeout=timeout)
+            namespace, limit=1, _request_timeout=timeout)
+        # This call is effectively free: get_kubernetes_nodes is cached, so
+        # the result is reused rather than fetched from the API server again.
+        get_kubernetes_nodes(context=context)
     except ImportError:
         # TODO(romilb): Update these error strs to also include link to docs
         # when docs are ready.
@@ -1710,11 +1913,17 @@ class PodValidator:
 
         if isinstance(klass, str):
             if klass.startswith('list['):
-                sub_kls = re.match(r'list\[(.*)\]', klass).group(1)
+                match = re.match(r'list\[(.*)\]', klass)
+                if match is None:
+                    raise ValueError(f'Invalid list type format: {klass}')
+                sub_kls = match.group(1)
                 return [cls.__validate(sub_data, sub_kls) for sub_data in data]
 
             if klass.startswith('dict('):
-                sub_kls = re.match(r'dict\(([^,]*), (.*)\)', klass).group(2)
+                match = re.match(r'dict\(([^,]*), (.*)\)', klass)
+                if match is None:
+                    raise ValueError(f'Invalid dict type format: {klass}')
+                sub_kls = match.group(2)
                 return {k: cls.__validate(v, sub_kls) for k, v in data.items()}
 
         # convert str to class
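Note: the guard matters because `re.match` returns `None` on non-matching input, and calling `.group()` on `None` raises `AttributeError` rather than a useful error. A minimal repro of the failure mode being fixed:

```python
import re

match = re.match(r'list\[(.*)\]', 'list[V1Container]')
assert match is not None and match.group(1) == 'V1Container'
# A malformed type string previously crashed with AttributeError;
# now it raises a descriptive ValueError instead:
assert re.match(r'list\[(.*)\]', 'dict(str, str)') is None
```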
@@ -2073,6 +2282,15 @@ def get_kube_config_context_namespace(
     return DEFAULT_NAMESPACE
 
 
+def parse_cpu_or_gpu_resource_to_float(resource_str: str) -> float:
+    if not resource_str:
+        return 0.0
+    if resource_str[-1] == 'm':
+        return float(resource_str[:-1]) / 1000
+    else:
+        return float(resource_str)
+
+
 def parse_cpu_or_gpu_resource(resource_qty_str: str) -> Union[int, float]:
     resource_str = str(resource_qty_str)
     if resource_str[-1] == 'm':
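Note: Kubernetes expresses fractional CPU/GPU quantities in millis, so `'500m'` means 0.5 of a unit. Expected behavior of the helper added above:

```python
assert parse_cpu_or_gpu_resource_to_float('500m') == 0.5
assert parse_cpu_or_gpu_resource_to_float('2') == 2.0
assert parse_cpu_or_gpu_resource_to_float('') == 0.0  # empty -> no resource
```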
@@ -2150,16 +2368,9 @@ class KubernetesInstanceType:
     @staticmethod
     def is_valid_instance_type(name: str) -> bool:
         """Returns whether the given name is a valid instance type."""
-        # Before https://github.com/skypilot-org/skypilot/pull/4756,
-        # the accelerators are appended with format "--{a}{type}",
-        # e.g. "4CPU--16GB--1V100".
-        # Check both patterns to keep backward compatibility.
-        # TODO(romilb): Backward compatibility, remove after 0.11.0.
-        prev_pattern = re.compile(
-            r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--\d+\S+)?$')
         pattern = re.compile(
             r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--[\w\d-]+:\d+)?$')
-        return bool(pattern.match(name)) or bool(prev_pattern.match(name))
+        return bool(pattern.match(name))
 
     @classmethod
     def _parse_instance_type(
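Note: only the post-#4756 name format `<cpus>CPU--<mem>GB[--<acc_type>:<count>]` is accepted now; the legacy `--<count><type>` suffix no longer matches. The retained pattern, exercised directly:

```python
import re

pattern = re.compile(r'^(\d+(\.\d+)?CPU--\d+(\.\d+)?GB)(--[\w\d-]+:\d+)?$')
assert pattern.match('4CPU--16GB')
assert pattern.match('4CPU--16GB--H100:1')
assert pattern.match('4CPU--16GB--1V100') is None  # legacy format, rejected
```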
@@ -2176,11 +2387,6 @@ class KubernetesInstanceType:
             r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_type>[\w\d-]+):(?P<accelerator_count>\d+))?$'  # pylint: disable=line-too-long
         )
         match = pattern.match(name)
-        # TODO(romilb): Backward compatibility, remove after 0.11.0.
-        prev_pattern = re.compile(
-            r'^(?P<cpus>\d+(\.\d+)?)CPU--(?P<memory>\d+(\.\d+)?)GB(?:--(?P<accelerator_count>\d+)(?P<accelerator_type>\S+))?$'  # pylint: disable=line-too-long
-        )
-        prev_match = prev_pattern.match(name)
         if match:
             cpus = float(match.group('cpus'))
             memory = float(match.group('memory'))
2193
2399
  accelerator_count = None
2194
2400
  accelerator_type = None
2195
2401
  return cpus, memory, accelerator_count, accelerator_type
2196
- # TODO(romilb): Backward compatibility, remove after 0.11.0.
2197
- elif prev_match:
2198
- cpus = float(prev_match.group('cpus'))
2199
- memory = float(prev_match.group('memory'))
2200
- accelerator_count = prev_match.group('accelerator_count')
2201
- accelerator_type = prev_match.group('accelerator_type')
2202
- if accelerator_count:
2203
- accelerator_count = int(accelerator_count)
2204
- accelerator_type = str(accelerator_type)
2205
- else:
2206
- accelerator_count = None
2207
- accelerator_type = None
2208
- return cpus, memory, accelerator_count, accelerator_type
2209
2402
  else:
2210
2403
  raise ValueError(f'Invalid instance name: {name}')
2211
2404
 
@@ -2278,16 +2471,14 @@ def construct_ssh_jump_command(
2278
2471
 
2279
2472
 
2280
2473
  def get_ssh_proxy_command(
2281
- k8s_ssh_target: str,
2282
- network_mode: kubernetes_enums.KubernetesNetworkingMode,
2474
+ pod_name: str,
2283
2475
  private_key_path: str,
2284
2476
  context: Optional[str],
2285
2477
  namespace: str,
2286
2478
  ) -> str:
2287
2479
  """Generates the SSH proxy command to connect to the pod.
2288
2480
 
2289
- Uses a jump pod if the network mode is NODEPORT, and direct port-forwarding
2290
- if the network mode is PORTFORWARD.
2481
+ Uses a direct port-forwarding.
2291
2482
 
2292
2483
  By default, establishing an SSH connection creates a communication
2293
2484
  channel to a remote node by setting up a TCP connection. When a
@@ -2298,17 +2489,8 @@ def get_ssh_proxy_command(
2298
2489
  Pods within a Kubernetes cluster have internal IP addresses that are
2299
2490
  typically not accessible from outside the cluster. Since the default TCP
2300
2491
  connection of SSH won't allow access to these pods, we employ a
2301
- ProxyCommand to establish the required communication channel. We offer this
2302
- in two different networking options: NodePort/port-forward.
2303
-
2304
- With the NodePort networking mode, a NodePort service is launched. This
2305
- service opens an external port on the node which redirects to the desired
2306
- port to a SSH jump pod. When establishing an SSH session in this mode, the
2307
- ProxyCommand makes use of this external port to create a communication
2308
- channel directly to port 22, which is the default port ssh server listens
2309
- on, of the jump pod.
2492
+ ProxyCommand to establish the required communication channel.
2310
2493
 
2311
- With Port-forward mode, instead of directly exposing an external port,
2312
2494
  'kubectl port-forward' sets up a tunnel between a local port
2313
2495
  (127.0.0.1:23100) and port 22 of the provisioned pod. Then we establish TCP
2314
2496
  connection to the local end of this tunnel, 127.0.0.1:23100, using 'socat'.
@@ -2319,38 +2501,26 @@ def get_ssh_proxy_command(
2319
2501
  the local machine.
2320
2502
 
2321
2503
  Args:
2322
- k8s_ssh_target: str; The Kubernetes object that will be used as the
2323
- target for SSH. If network_mode is NODEPORT, this is the name of the
2324
- service. If network_mode is PORTFORWARD, this is the pod name.
2325
- network_mode: KubernetesNetworkingMode; networking mode for ssh
2326
- session. It is either 'NODEPORT' or 'PORTFORWARD'
2504
+ pod_name: str; The Kubernetes pod name that will be used as the
2505
+ target for SSH.
2327
2506
  private_key_path: str; Path to the private key to use for SSH.
2328
2507
  This key must be authorized to access the SSH jump pod.
2329
- Required for NODEPORT networking mode.
2330
2508
  namespace: Kubernetes namespace to use.
2331
- Required for NODEPORT networking mode.
2332
2509
  """
2333
- # Fetch IP to connect to for the jump svc
2334
- ssh_jump_ip = get_external_ip(network_mode, context)
2510
+ ssh_jump_ip = '127.0.0.1' # Local end of the port-forward tunnel
2335
2511
  assert private_key_path is not None, 'Private key path must be provided'
2336
- if network_mode == kubernetes_enums.KubernetesNetworkingMode.NODEPORT:
2337
- assert namespace is not None, 'Namespace must be provided for NodePort'
2338
- ssh_jump_port = get_port(k8s_ssh_target, namespace, context)
2339
- ssh_jump_proxy_command = construct_ssh_jump_command(
2340
- private_key_path, ssh_jump_ip, ssh_jump_port=ssh_jump_port)
2341
- else:
2342
- ssh_jump_proxy_command_path = create_proxy_command_script()
2343
- ssh_jump_proxy_command = construct_ssh_jump_command(
2344
- private_key_path,
2345
- ssh_jump_ip,
2346
- ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
2347
- proxy_cmd_path=ssh_jump_proxy_command_path,
2348
- proxy_cmd_target_pod=k8s_ssh_target,
2349
- # We embed both the current context and namespace to the SSH proxy
2350
- # command to make sure SSH still works when the current
2351
- # context/namespace is changed by the user.
2352
- current_kube_context=context,
2353
- current_kube_namespace=namespace)
2512
+ ssh_jump_proxy_command_path = create_proxy_command_script()
2513
+ ssh_jump_proxy_command = construct_ssh_jump_command(
2514
+ private_key_path,
2515
+ ssh_jump_ip,
2516
+ ssh_jump_user=constants.SKY_SSH_USER_PLACEHOLDER,
2517
+ proxy_cmd_path=ssh_jump_proxy_command_path,
2518
+ proxy_cmd_target_pod=pod_name,
2519
+ # We embed both the current context and namespace to the SSH proxy
2520
+ # command to make sure SSH still works when the current
2521
+ # context/namespace is changed by the user.
2522
+ current_kube_context=context,
2523
+ current_kube_namespace=namespace)
2354
2524
  return ssh_jump_proxy_command
2355
2525
 
2356
2526
 
@@ -2382,240 +2552,6 @@ def create_proxy_command_script() -> str:
2382
2552
  return PORT_FORWARD_PROXY_CMD_PATH
2383
2553
 
2384
2554
 
2385
- def setup_ssh_jump_svc(ssh_jump_name: str, namespace: str,
2386
- context: Optional[str],
2387
- service_type: kubernetes_enums.KubernetesServiceType):
2388
- """Sets up Kubernetes service resource to access for SSH jump pod.
2389
-
2390
- This method acts as a necessary complement to be run along with
2391
- setup_ssh_jump_pod(...) method. This service ensures the pod is accessible.
2392
-
2393
- Args:
2394
- ssh_jump_name: Name to use for the SSH jump service
2395
- namespace: Namespace to create the SSH jump service in
2396
- service_type: Networking configuration on either to use NodePort
2397
- or ClusterIP service to ssh in
2398
- """
2399
- # Fill in template - ssh_key_secret and ssh_jump_image are not required for
2400
- # the service spec, so we pass in empty strs.
2401
- content = fill_ssh_jump_template('', '', ssh_jump_name, service_type.value)
2402
-
2403
- # Add custom metadata from config
2404
- merge_custom_metadata(content['service_spec']['metadata'], context)
2405
-
2406
- # Create service
2407
- try:
2408
- kubernetes.core_api(context).create_namespaced_service(
2409
- namespace, content['service_spec'])
2410
- except kubernetes.api_exception() as e:
2411
- # SSH Jump Pod service already exists.
2412
- if e.status == 409:
2413
- ssh_jump_service = kubernetes.core_api(
2414
- context).read_namespaced_service(name=ssh_jump_name,
2415
- namespace=namespace)
2416
- curr_svc_type = ssh_jump_service.spec.type
2417
- if service_type.value == curr_svc_type:
2418
- # If the currently existing SSH Jump service's type is identical
2419
- # to user's configuration for networking mode
2420
- logger.debug(
2421
- f'SSH Jump Service {ssh_jump_name} already exists in the '
2422
- 'cluster, using it.')
2423
- else:
2424
- # If a different type of service type for SSH Jump pod compared
2425
- # to user's configuration for networking mode exists, we remove
2426
- # existing servie to create a new one following user's config
2427
- kubernetes.core_api(context).delete_namespaced_service(
2428
- name=ssh_jump_name, namespace=namespace)
2429
- kubernetes.core_api(context).create_namespaced_service(
2430
- namespace, content['service_spec'])
2431
- port_forward_mode = (
2432
- kubernetes_enums.KubernetesNetworkingMode.PORTFORWARD.value)
2433
- nodeport_mode = (
2434
- kubernetes_enums.KubernetesNetworkingMode.NODEPORT.value)
2435
- clusterip_svc = (
2436
- kubernetes_enums.KubernetesServiceType.CLUSTERIP.value)
2437
- nodeport_svc = (
2438
- kubernetes_enums.KubernetesServiceType.NODEPORT.value)
2439
- curr_network_mode = port_forward_mode \
2440
- if curr_svc_type == clusterip_svc else nodeport_mode
2441
- new_network_mode = nodeport_mode \
2442
- if curr_svc_type == clusterip_svc else port_forward_mode
2443
- new_svc_type = nodeport_svc \
2444
- if curr_svc_type == clusterip_svc else clusterip_svc
2445
- logger.info(
2446
- f'Switching the networking mode from '
2447
- f'\'{curr_network_mode}\' to \'{new_network_mode}\' '
2448
- f'following networking configuration. Deleting existing '
2449
- f'\'{curr_svc_type}\' service and recreating as '
2450
- f'\'{new_svc_type}\' service.')
2451
- else:
2452
- raise
2453
- else:
2454
- logger.info(f'Created SSH Jump Service {ssh_jump_name}.')
2455
-
2456
-
2457
- def setup_ssh_jump_pod(ssh_jump_name: str, ssh_jump_image: str,
2458
- ssh_key_secret: str, namespace: str,
2459
- context: Optional[str]):
2460
- """Sets up Kubernetes RBAC and pod for SSH jump host.
2461
-
2462
- Our Kubernetes implementation uses a SSH jump pod to reach SkyPilot clusters
2463
- running inside a cluster. This function sets up the resources needed for
2464
- the SSH jump pod. This includes a service account which grants the jump pod
2465
- permission to watch for other SkyPilot pods and terminate itself if there
2466
- are no SkyPilot pods running.
2467
-
2468
- setup_ssh_jump_service must also be run to ensure that the SSH jump pod is
2469
- reachable.
2470
-
2471
- Args:
2472
- ssh_jump_image: Container image to use for the SSH jump pod
2473
- ssh_jump_name: Name to use for the SSH jump pod
2474
- ssh_key_secret: Secret name for the SSH key stored in the cluster
2475
- namespace: Namespace to create the SSH jump pod in
2476
- """
2477
- # Fill in template - service is created separately so service_type is not
2478
- # required, so we pass in empty str.
2479
- content = fill_ssh_jump_template(ssh_key_secret, ssh_jump_image,
2480
- ssh_jump_name, '')
2481
-
2482
- # Add custom metadata to all objects
2483
- for object_type in content.keys():
2484
- merge_custom_metadata(content[object_type]['metadata'], context)
2485
-
2486
- # ServiceAccount
2487
- try:
2488
- kubernetes.core_api(context).create_namespaced_service_account(
2489
- namespace, content['service_account'])
2490
- except kubernetes.api_exception() as e:
2491
- if e.status == 409:
2492
- logger.info(
2493
- 'SSH Jump ServiceAccount already exists in the cluster, using '
2494
- 'it.')
2495
- else:
2496
- raise
2497
- else:
2498
- logger.info('Created SSH Jump ServiceAccount.')
2499
- # Role
2500
- try:
2501
- kubernetes.auth_api(context).create_namespaced_role(
2502
- namespace, content['role'])
2503
- except kubernetes.api_exception() as e:
2504
- if e.status == 409:
2505
- logger.info(
2506
- 'SSH Jump Role already exists in the cluster, using it.')
2507
- else:
2508
- raise
2509
- else:
2510
- logger.info('Created SSH Jump Role.')
2511
- # RoleBinding
2512
- try:
2513
- kubernetes.auth_api(context).create_namespaced_role_binding(
2514
- namespace, content['role_binding'])
2515
- except kubernetes.api_exception() as e:
2516
- if e.status == 409:
2517
- logger.info(
2518
- 'SSH Jump RoleBinding already exists in the cluster, using '
2519
- 'it.')
2520
- else:
2521
- raise
2522
- else:
2523
- logger.info('Created SSH Jump RoleBinding.')
2524
- # Pod
2525
- try:
2526
- kubernetes.core_api(context).create_namespaced_pod(
2527
- namespace, content['pod_spec'])
2528
- except kubernetes.api_exception() as e:
2529
- if e.status == 409:
2530
- logger.info(
2531
- f'SSH Jump Host {ssh_jump_name} already exists in the cluster, '
2532
- 'using it.')
2533
- else:
2534
- raise
2535
- else:
2536
- logger.info(f'Created SSH Jump Host {ssh_jump_name}.')
2537
-
2538
-
2539
-def clean_zombie_ssh_jump_pod(namespace: str, context: Optional[str],
-                              node_id: str):
-    """Analyzes the SSH jump pod and removes it if it is in a bad state.
-
-    Prevents the existence of a dangling SSH jump pod. This could happen
-    if the pod's main container did not start properly (or failed). In that
-    case, the jump pod lifecycle manager cannot remove the pod and service
-    automatically, so it must be done manually.
-
-    Args:
-        namespace: Namespace to remove the SSH jump pod and service from
-        node_id: Name of the head pod
-    """
-
-    def find(l, predicate):
-        """Utility function to find an element in a given list."""
-        results = [x for x in l if predicate(x)]
-        return results[0] if results else None
-
-    # Get the SSH jump pod name from the head pod
-    try:
-        pod = kubernetes.core_api(context).read_namespaced_pod(
-            node_id, namespace)
-    except kubernetes.api_exception() as e:
-        if e.status == 404:
-            logger.warning(f'Failed to get pod {node_id}:'
-                           ' the pod was not found (404).')
-        raise
-    else:
-        ssh_jump_name = pod.metadata.labels.get('skypilot-ssh-jump')
-    try:
-        ssh_jump_pod = kubernetes.core_api(context).read_namespaced_pod(
-            ssh_jump_name, namespace)
-        cont_ready_cond = find(ssh_jump_pod.status.conditions,
-                               lambda c: c.type == 'ContainersReady')
-        if (cont_ready_cond and cont_ready_cond.status
-                == 'False') or ssh_jump_pod.status.phase == 'Pending':
-            # Either the main container is not ready or the pod failed
-            # to schedule. To be on the safe side and prevent a dangling
-            # ssh jump pod, let's remove it and the service. Otherwise, the
-            # main container is ready and its lifecycle management script
-            # takes care of the cleaning.
-            kubernetes.core_api(context).delete_namespaced_pod(
-                ssh_jump_name, namespace)
-            kubernetes.core_api(context).delete_namespaced_service(
-                ssh_jump_name, namespace)
-    except kubernetes.api_exception() as e:
-        # We keep the warning at debug level to avoid polluting the
-        # `sky launch` output.
-        logger.debug(f'Tried to check ssh jump pod {ssh_jump_name},'
-                     f' but got error {e}\n. Consider running `kubectl '
-                     f'delete pod {ssh_jump_name} -n {namespace}` to manually '
-                     'remove the pod if it has crashed.')
-        # We encountered an issue while checking the ssh jump pod. To be on
-        # the safe side, let's remove its service so the port is freed.
-        try:
-            kubernetes.core_api(context).delete_namespaced_service(
-                ssh_jump_name, namespace)
-        except kubernetes.api_exception():
-            pass
-
-
-def fill_ssh_jump_template(ssh_key_secret: str, ssh_jump_image: str,
-                           ssh_jump_name: str, service_type: str) -> Dict:
-    template_path = os.path.join(directory_utils.get_sky_dir(), 'templates',
-                                 'kubernetes-ssh-jump.yml.j2')
-    if not os.path.exists(template_path):
-        raise FileNotFoundError(
-            'Template "kubernetes-ssh-jump.yml.j2" does not exist.')
-    with open(template_path, 'r', encoding='utf-8') as fin:
-        template = fin.read()
-    j2_template = jinja2.Template(template)
-    cont = j2_template.render(name=ssh_jump_name,
-                              image=ssh_jump_image,
-                              secret=ssh_key_secret,
-                              service_type=service_type)
-    content = yaml_utils.safe_load(cont)
-    return content
-
-
 def check_port_forward_mode_dependencies(
         raise_error: bool = True) -> Optional[List[str]]:
     """Checks if 'socat' and 'nc' are installed
@@ -2762,26 +2698,22 @@ def combine_pod_config_fields(
     merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
     # We don't use override_configs in `get_effective_region_config`, as merging
     # the pod config requires special handling.
-    if isinstance(cloud, clouds.SSH):
-        kubernetes_config = skypilot_config.get_effective_region_config(
-            cloud='ssh', region=None, keys=('pod_config',), default_value={})
-        override_pod_config = config_utils.get_cloud_config_value_from_dict(
-            dict_config=cluster_config_overrides,
-            cloud='ssh',
-            keys=('pod_config',),
-            default_value={})
-    else:
-        kubernetes_config = skypilot_config.get_effective_region_config(
-            cloud='kubernetes',
-            region=context,
-            keys=('pod_config',),
-            default_value={})
-        override_pod_config = config_utils.get_cloud_config_value_from_dict(
-            dict_config=cluster_config_overrides,
-            cloud='kubernetes',
-            region=context,
-            keys=('pod_config',),
-            default_value={})
+    cloud_str = 'ssh' if isinstance(cloud, clouds.SSH) else 'kubernetes'
+    context_str = context
+    if isinstance(cloud, clouds.SSH) and context is not None:
+        assert context.startswith('ssh-'), 'SSH context must start with "ssh-"'
+        context_str = context[len('ssh-'):]
+    kubernetes_config = skypilot_config.get_effective_region_config(
+        cloud=cloud_str,
+        region=context_str,
+        keys=('pod_config',),
+        default_value={})
+    override_pod_config = config_utils.get_cloud_config_value_from_dict(
+        dict_config=cluster_config_overrides,
+        cloud=cloud_str,
+        region=context_str,
+        keys=('pod_config',),
+        default_value={})
     config_utils.merge_k8s_configs(kubernetes_config, override_pod_config)
 
     # Merge the kubernetes config into the YAML for both head and worker nodes.
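The refactored code above collapses the SSH and Kubernetes branches into one parameterized lookup, then merges the task-level `override_pod_config` on top of the global `pod_config` via `config_utils.merge_k8s_configs`. The add-or-update semantics can be approximated by a recursive dict merge; this sketch is only an approximation (the real helper also has Kubernetes-specific handling, e.g. for container lists):

    from typing import Any, Dict

    def merge_dicts(base: Dict[str, Any], override: Dict[str, Any]) -> None:
        """Recursively merge `override` into `base` in place; override wins."""
        for key, value in override.items():
            if isinstance(value, dict) and isinstance(base.get(key), dict):
                merge_dicts(base[key], value)
            else:
                base[key] = value

    pod_config = {'metadata': {'labels': {'env': 'prod'}}}
    override = {'metadata': {'labels': {'env': 'dev'},
                             'annotations': {'owner': 'alice'}}}
    merge_dicts(pod_config, override)
    # pod_config == {'metadata': {'labels': {'env': 'dev'},
    #                             'annotations': {'owner': 'alice'}}}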
@@ -2800,9 +2732,11 @@ def combine_metadata_fields(cluster_yaml_obj: Dict[str, Any],
     Obeys the same add or update semantics as combine_pod_config_fields().
     """
     merged_cluster_yaml_obj = copy.deepcopy(cluster_yaml_obj)
+    context, cloud_str = get_cleaned_context_and_cloud_str(context)
+
     # Get custom_metadata from global config
     custom_metadata = skypilot_config.get_effective_region_config(
-        cloud='kubernetes',
+        cloud=cloud_str,
         region=context,
         keys=('custom_metadata',),
         default_value={})
@@ -2810,7 +2744,7 @@
     # Get custom_metadata from task-level config overrides
     override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
         dict_config=cluster_config_overrides,
-        cloud='kubernetes',
+        cloud=cloud_str,
         region=context,
         keys=('custom_metadata',),
         default_value={})
@@ -2867,9 +2801,11 @@ def merge_custom_metadata(
 
     Merge is done in-place, so no return value is required.
     """
+    context, cloud_str = get_cleaned_context_and_cloud_str(context)
+
     # Get custom_metadata from global config
     custom_metadata = skypilot_config.get_effective_region_config(
-        cloud='kubernetes',
+        cloud=cloud_str,
         region=context,
         keys=('custom_metadata',),
         default_value={})
@@ -2878,7 +2814,7 @@
     if cluster_config_overrides is not None:
         override_custom_metadata = config_utils.get_cloud_config_value_from_dict(
             dict_config=cluster_config_overrides,
-            cloud='kubernetes',
+            cloud=cloud_str,
             region=context,
             keys=('custom_metadata',),
             default_value={})
@@ -2889,7 +2825,8 @@
     config_utils.merge_k8s_configs(original_metadata, custom_metadata)
 
 
-def check_nvidia_runtime_class(context: Optional[str] = None) -> bool:
+@_retry_on_error(resource_type='runtimeclass')
+def check_nvidia_runtime_class(*, context: Optional[str] = None) -> bool:
     """Checks if the 'nvidia' RuntimeClass exists in the cluster"""
     # Fetch the list of available RuntimeClasses
     runtime_classes = kubernetes.node_api(context).list_runtime_class()
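Making `context` keyword-only pairs with the new `_retry_on_error` decorator, whose definition is outside this hunk. A hypothetical sketch of such a decorator's general shape, purely to illustrate the pattern of retrying transient API errors per resource type (names and parameters here are assumptions, not the module's actual implementation):

    import functools
    import time

    def retry_on_error(resource_type: str, max_retries: int = 3,
                       backoff_secs: float = 1.0):
        """Retry a function on transient errors, with exponential backoff."""
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                for attempt in range(max_retries):
                    try:
                        return func(*args, **kwargs)
                    except Exception:  # the real decorator filters API errors
                        if attempt == max_retries - 1:
                            raise
                        time.sleep(backoff_secs * 2**attempt)
            return wrapper
        return decorator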
@@ -3108,14 +3045,6 @@ def get_kubernetes_node_info(
         information.
     """
     nodes = get_kubernetes_nodes(context=context)
-    # Get the pods to get the real-time resource usage
-    try:
-        pods = get_all_pods_in_kubernetes_cluster(context=context)
-    except kubernetes.api_exception() as e:
-        if e.status == 403:
-            pods = None
-        else:
-            raise
 
     lf, _ = detect_gpu_label_formatter(context)
     if not lf:
@@ -3123,6 +3052,29 @@
     else:
         label_keys = lf.get_label_keys()
 
+    # Check whether any node has accelerators, so we can skip fetching pod
+    # allocations when none do.
+    has_accelerator_nodes = False
+    for node in nodes:
+        accelerator_count = get_node_accelerator_count(context,
+                                                       node.status.allocatable)
+        if accelerator_count > 0:
+            has_accelerator_nodes = True
+            break
+
+    # Get the allocated GPU quantity on each node
+    allocated_qty_by_node: Dict[str, int] = collections.defaultdict(int)
+    error_on_get_allocated_gpu_qty_by_node = False
+    if has_accelerator_nodes:
+        try:
+            allocated_qty_by_node = get_allocated_gpu_qty_by_node(
+                context=context)
+        except kubernetes.api_exception() as e:
+            if e.status == 403:
+                error_on_get_allocated_gpu_qty_by_node = True
+            else:
+                raise
+
     node_info_dict: Dict[str, models.KubernetesNodeInfo] = {}
     has_multi_host_tpu = False
@@ -3152,32 +3104,28 @@
             node_ip = address.address
             break
 
-        allocated_qty = 0
         accelerator_count = get_node_accelerator_count(context,
                                                        node.status.allocatable)
+        # Check if node is ready
+        node_is_ready = node.is_ready()
+
+        if accelerator_count == 0:
+            node_info_dict[node.metadata.name] = models.KubernetesNodeInfo(
+                name=node.metadata.name,
+                accelerator_type=accelerator_name,
+                total={'accelerator_count': 0},
+                free={'accelerators_available': 0},
+                ip_address=node_ip,
+                is_ready=node_is_ready)
+            continue
 
-        if pods is None:
+        if not node_is_ready:
+            # If node is not ready, report 0 available GPUs
+            accelerators_available = 0
+        elif not has_accelerator_nodes or error_on_get_allocated_gpu_qty_by_node:
             accelerators_available = -1
-
         else:
-            for pod in pods:
-                # Get all the pods running on the node
-                if (pod.spec.node_name == node.metadata.name and
-                        pod.status.phase in ['Running', 'Pending']):
-                    # Skip pods that should not count against GPU count
-                    if should_exclude_pod_from_gpu_allocation(pod):
-                        logger.debug(
-                            f'Excluding low priority pod '
-                            f'{pod.metadata.name} from GPU allocation '
-                            f'calculations on node {node.metadata.name}')
-                        continue
-                    # Iterate over all the containers in the pod and sum the
-                    # GPU requests
-                    for container in pod.spec.containers:
-                        if container.resources.requests:
-                            allocated_qty += get_node_accelerator_count(
-                                context, container.resources.requests)
-
+            allocated_qty = allocated_qty_by_node[node.metadata.name]
             accelerators_available = accelerator_count - allocated_qty
 
         # Exclude multi-host TPUs from being processed.
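The removed per-node pod scan was effectively O(nodes x pods); the new code calls `get_allocated_gpu_qty_by_node` once. Its implementation is outside this hunk, but judging from the removed loop it plausibly aggregates the GPU requests of Running/Pending pods per node in a single pass. A sketch under that assumption, reusing helpers from this module:

    import collections
    from typing import Any, Dict, List, Optional

    def allocated_gpu_qty_by_node_sketch(
            pods: List[Any], context: Optional[str]) -> Dict[str, int]:
        """Sum per-node GPU requests from Running/Pending pods."""
        allocated: Dict[str, int] = collections.defaultdict(int)
        for pod in pods:
            if pod.status.phase not in ('Running', 'Pending'):
                continue
            if should_exclude_pod_from_gpu_allocation(pod):
                continue  # e.g. low-priority pods that would be preempted
            for container in pod.spec.containers:
                if container.resources.requests:
                    allocated[pod.spec.node_name] += get_node_accelerator_count(
                        context, container.resources.requests)
        return allocated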
@@ -3192,7 +3140,8 @@
             accelerator_type=accelerator_name,
             total={'accelerator_count': int(accelerator_count)},
             free={'accelerators_available': int(accelerators_available)},
-            ip_address=node_ip)
+            ip_address=node_ip,
+            is_ready=node_is_ready)
     hint = ''
     if has_multi_host_tpu:
         hint = ('(Note: Multi-host TPUs are detected and excluded from the '
@@ -3224,7 +3173,11 @@ def filter_pods(namespace: str,
                 context: Optional[str],
                 tag_filters: Dict[str, str],
                 status_filters: Optional[List[str]] = None) -> Dict[str, Any]:
-    """Filters pods by tags and status."""
+    """Filters pods by tags and status.
+
+    The returned dict is sorted by name, with workers ordered by their numeric
+    suffix. This ensures consistent ordering for SSH configuration and other
+    operations.
+    """
     non_included_pod_statuses = POD_STATUSES.copy()
 
     field_selector = ''
@@ -3242,7 +3195,32 @@
     pods = [
        pod for pod in pod_list.items if pod.metadata.deletion_timestamp is None
     ]
-    return {pod.metadata.name: pod for pod in pods}
+
+    # Sort pods by name, with workers sorted by their numeric suffix.
+    # This ensures consistent ordering (e.g., cluster-head, cluster-worker1,
+    # cluster-worker2, cluster-worker3, ...) even when the Kubernetes API
+    # returns them in arbitrary order. This works even if there were
+    # somehow pod names other than head/worker ones; those end up at
+    # the end of the list.
+    def get_pod_sort_key(
+        pod: V1Pod
+    ) -> Union[Tuple[Literal[0], str], Tuple[Literal[1], int],
+               Tuple[Literal[2], str]]:
+        name = pod.metadata.name
+        name_suffix = name.split('-')[-1]
+        if name_suffix == 'head':
+            return (0, name)
+        elif name_suffix.startswith('worker'):
+            try:
+                return (1, int(name_suffix.split('worker')[-1]))
+            except (ValueError, IndexError):
+                return (2, name)
+        else:
+            return (2, name)
+
+    sorted_pods = sorted(pods, key=get_pod_sort_key)
+
+    return {pod.metadata.name: pod for pod in sorted_pods}
 
 
 def _remove_pod_annotation(pod: Any,
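To see the ordering `get_pod_sort_key` produces, apply the same logic to bare names (the cluster name `demo` is illustrative): the head pod sorts first, and workers sort numerically rather than lexicographically, so `worker10` follows `worker2`:

    names = ['demo-worker10', 'demo-worker2', 'demo-head', 'demo-extra']

    def sort_key(name: str):
        suffix = name.split('-')[-1]
        if suffix == 'head':
            return (0, name)
        if suffix.startswith('worker'):
            try:
                return (1, int(suffix[len('worker'):]))
            except ValueError:
                return (2, name)
        return (2, name)

    print(sorted(names, key=sort_key))
    # ['demo-head', 'demo-worker2', 'demo-worker10', 'demo-extra']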
@@ -3371,13 +3349,13 @@ def get_skypilot_pods(context: Optional[str] = None) -> List[Any]:
 
     try:
         pods = kubernetes.core_api(context).list_pod_for_all_namespaces(
-            label_selector='skypilot-cluster',
+            label_selector=provision_constants.TAG_SKYPILOT_CLUSTER_NAME,
             _request_timeout=kubernetes.API_TIMEOUT).items
     except kubernetes.max_retry_error():
         raise exceptions.ResourcesUnavailableError(
             'Timed out trying to get SkyPilot pods from Kubernetes cluster. '
             'Please check if the cluster is healthy and retry. To debug, run: '
-            'kubectl get pods --selector=skypilot-cluster --all-namespaces'
+            'kubectl get pods --selector=skypilot-cluster-name --all-namespaces'
         ) from None
     return pods
@@ -3514,7 +3492,8 @@ def process_skypilot_pods(
     serve_controllers: List[KubernetesSkyPilotClusterInfo] = []
 
     for pod in pods:
-        cluster_name_on_cloud = pod.metadata.labels.get('skypilot-cluster')
+        cluster_name_on_cloud = pod.metadata.labels.get(
+            provision_constants.TAG_SKYPILOT_CLUSTER_NAME)
         cluster_name = cluster_name_on_cloud.rsplit(
             '-', 1
         )[0]  # Remove the user hash to get cluster name (e.g., mycluster-2ea4)
@@ -3541,9 +3520,20 @@
                 f'requesting GPUs: {pod.metadata.name}')
         gpu_label = label_formatter.get_label_key()
         # Get GPU name from pod node selector
-        if pod.spec.node_selector is not None:
-            gpu_name = label_formatter.get_accelerator_from_label_value(
-                pod.spec.node_selector.get(gpu_label))
+        node_selector_terms = (
+            pod.spec.affinity.node_affinity.
+            required_during_scheduling_ignored_during_execution.
+            node_selector_terms)
+        if node_selector_terms is not None:
+            expressions = []
+            for term in node_selector_terms:
+                if term.match_expressions:
+                    expressions.extend(term.match_expressions)
+            for expression in expressions:
+                if expression.key == gpu_label and expression.operator == 'In':
+                    gpu_name = label_formatter.get_accelerator_from_label_value(
+                        expression.values[0])
+                    break
 
         resources = resources_lib.Resources(
             cloud=clouds.Kubernetes(),
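The new lookup walks the pod's `nodeAffinity` instead of the flat `nodeSelector`, since pods may pin accelerators through required node-affinity terms. The structure being traversed corresponds to a pod spec fragment like the following, rendered as a Python dict; the label key and value are illustrative:

    pod_affinity_fragment = {
        'affinity': {
            'nodeAffinity': {
                'requiredDuringSchedulingIgnoredDuringExecution': {
                    'nodeSelectorTerms': [{
                        'matchExpressions': [{
                            'key': 'skypilot.co/accelerator',  # gpu_label (example)
                            'operator': 'In',
                            'values': ['a100'],
                        }],
                    }],
                },
            },
        },
    }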
@@ -3790,3 +3780,13 @@ def should_exclude_pod_from_gpu_allocation(pod) -> bool:
             return True
 
     return False
+
+
+def get_cleaned_context_and_cloud_str(
+        context: Optional[str]) -> Tuple[Optional[str], str]:
+    """Return the cleaned context and relevant cloud string from a context."""
+    cloud_str = 'kubernetes'
+    if context is not None and context.startswith('ssh-'):
+        cloud_str = 'ssh'
+        context = context[len('ssh-'):]
+    return context, cloud_str
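Usage of the new helper on both context flavors (the context values are illustrative):

    assert get_cleaned_context_and_cloud_str('ssh-my-pool') == ('my-pool', 'ssh')
    assert get_cleaned_context_and_cloud_str('gke-prod') == ('gke-prod', 'kubernetes')
    assert get_cleaned_context_and_cloud_str(None) == (None, 'kubernetes')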