skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/clouds/kubernetes.py CHANGED
@@ -1,35 +1,40 @@
1
1
  """Kubernetes."""
2
+ import concurrent.futures
2
3
  import os
3
4
  import re
4
- import typing
5
+ import subprocess
6
+ import tempfile
5
7
  from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
6
8
 
9
+ import colorama
10
+
11
+ from sky import catalog
7
12
  from sky import clouds
8
13
  from sky import exceptions
14
+ from sky import resources as resources_lib
9
15
  from sky import sky_logging
10
16
  from sky import skypilot_config
11
17
  from sky.adaptors import kubernetes
12
- from sky.clouds import service_catalog
18
+ from sky.clouds.utils import gcp_utils
13
19
  from sky.provision import instance_setup
20
+ from sky.provision.gcp import constants as gcp_constants
14
21
  from sky.provision.kubernetes import network_utils
15
22
  from sky.provision.kubernetes import utils as kubernetes_utils
23
+ from sky.provision.kubernetes.utils import is_tpu_on_gke
24
+ from sky.provision.kubernetes.utils import KubernetesHighPerformanceNetworkType
25
+ from sky.provision.kubernetes.utils import normalize_tpu_accelerator_name
16
26
  from sky.skylet import constants
17
27
  from sky.utils import annotations
18
28
  from sky.utils import common_utils
29
+ from sky.utils import env_options
30
+ from sky.utils import kubernetes_enums
19
31
  from sky.utils import registry
20
32
  from sky.utils import resources_utils
21
33
  from sky.utils import schemas
22
-
23
- if typing.TYPE_CHECKING:
24
- # Renaming to avoid shadowing variables.
25
- from sky import resources as resources_lib
34
+ from sky.utils import volume as volume_lib
26
35
 
27
36
  logger = sky_logging.init_logger(__name__)
28
37
 
29
- # Check if KUBECONFIG is set, and use it if it is.
30
- DEFAULT_KUBECONFIG_PATH = '~/.kube/config'
31
- CREDENTIAL_PATH = os.environ.get('KUBECONFIG', DEFAULT_KUBECONFIG_PATH)
32
-
33
38
  # Namespace for SkyPilot resources shared across multiple tenants on the
34
39
  # same cluster (even if they might be running in different namespaces).
35
40
  # E.g., FUSE device manager daemonset is run in this namespace.
@@ -44,9 +49,6 @@ _FUSERMOUNT_SHARED_DIR = '/var/run/fusermount'
44
49
  class Kubernetes(clouds.Cloud):
45
50
  """Kubernetes."""
46
51
 
47
- SKY_SSH_KEY_SECRET_NAME = 'sky-ssh-keys'
48
- SKY_SSH_JUMP_NAME = 'sky-ssh-jump-pod'
49
-
50
52
  # Limit the length of the cluster name to avoid exceeding the limit of 63
51
53
  # characters for Kubernetes resources. We limit to 42 characters (63-21) to
52
54
  # allow additional characters for creating ingress services to expose ports.
@@ -54,9 +56,12 @@ class Kubernetes(clouds.Cloud):
54
56
  # where the suffix is 21 characters long.
55
57
  _MAX_CLUSTER_NAME_LEN_LIMIT = 42
56
58
 
59
+ _MAX_VOLUME_NAME_LEN_LIMIT = 253
60
+
57
61
  _SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = True
58
62
 
59
63
  _DEFAULT_NUM_VCPUS = 2
64
+ _DEFAULT_NUM_VCPUS_WITH_GPU = 4
60
65
  _DEFAULT_MEMORY_CPU_RATIO = 1
61
66
  _DEFAULT_MEMORY_CPU_RATIO_WITH_GPU = 4 # Allocate more memory for GPU tasks
62
67
  _REPR = 'Kubernetes'
@@ -73,6 +78,12 @@ class Kubernetes(clouds.Cloud):
73
78
  'tiers are not '
74
79
  'supported in '
75
80
  'Kubernetes.',
81
+ clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK:
82
+ ('Customized multiple network interfaces are not supported in '
83
+ 'Kubernetes.'),
84
+ clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
85
+ ('Custom network tier is not supported in this Kubernetes '
86
+ 'cluster.'),
76
87
  }
77
88
 
78
89
  IMAGE_CPU = 'skypilot:custom-cpu-ubuntu-2004'
@@ -86,47 +97,52 @@ class Kubernetes(clouds.Cloud):
86
97
  # Set of contexts that has logged as temporarily unreachable
87
98
  logged_unreachable_contexts: Set[str] = set()
88
99
 
89
- @property
90
- def ssh_key_secret_field_name(self):
91
- # Use a fresh user hash to avoid conflicts in the secret object naming.
92
- # This can happen when the controller is reusing the same user hash
93
- # through USER_ID_ENV_VAR but has a different SSH key.
94
- fresh_user_hash = common_utils.generate_user_hash()
95
- return f'ssh-publickey-{fresh_user_hash}'
96
-
97
100
  @classmethod
98
101
  def _unsupported_features_for_resources(
99
- cls, resources: 'resources_lib.Resources'
102
+ cls,
103
+ resources: 'resources_lib.Resources',
104
+ region: Optional[str] = None,
100
105
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
101
106
  # TODO(aylei): features need to be regional (per context) to make
102
107
  # multi-kubernetes selection/failover work.
103
108
  unsupported_features = cls._CLOUD_UNSUPPORTED_FEATURES.copy()
104
- context = resources.region
109
+ context = region if region is not None else resources.region
105
110
  if context is None:
106
- context = kubernetes_utils.get_current_kube_config_context_name()
107
- # Features to be disabled for exec auth
108
- is_exec_auth, message = kubernetes_utils.is_kubeconfig_exec_auth(
109
- context)
110
- if is_exec_auth:
111
- assert isinstance(message, str), message
112
- # Controllers cannot spin up new pods with exec auth.
113
- unsupported_features[
114
- clouds.CloudImplementationFeatures.HOST_CONTROLLERS] = message
115
- # Pod does not have permissions to down itself with exec auth.
116
- unsupported_features[
117
- clouds.CloudImplementationFeatures.AUTODOWN] = message
111
+ contexts = cls.existing_allowed_contexts()
112
+ else:
113
+ contexts = [context]
118
114
  unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
119
115
  'Stopping clusters is not supported on Kubernetes.')
120
116
  unsupported_features[clouds.CloudImplementationFeatures.AUTOSTOP] = (
121
117
  'Auto-stop is not supported on Kubernetes.')
122
- # Allow spot instances if supported by the cluster
123
- try:
124
- spot_label_key, _ = kubernetes_utils.get_spot_label(context)
125
- if spot_label_key is not None:
126
- unsupported_features.pop(
127
- clouds.CloudImplementationFeatures.SPOT_INSTANCE, None)
128
- except exceptions.KubeAPIUnreachableError as e:
129
- cls._log_unreachable_context(context, str(e))
118
+ for context in contexts:
119
+ # Allow spot instances if supported by the cluster
120
+ try:
121
+ # Run spot label check and network type detection concurrently
122
+ # as they are independent operations
123
+ with concurrent.futures.ThreadPoolExecutor(
124
+ max_workers=2) as executor:
125
+ spot_future = executor.submit(
126
+ kubernetes_utils.get_spot_label, context)
127
+ network_future = executor.submit(cls._detect_network_type,
128
+ context,
129
+ resources.network_tier)
130
+
131
+ spot_label_key, _ = spot_future.result()
132
+ if spot_label_key is not None:
133
+ unsupported_features.pop(
134
+ clouds.CloudImplementationFeatures.SPOT_INSTANCE,
135
+ None)
136
+
137
+ # Allow custom network tier if supported by the cluster
138
+ # (e.g., Nebius clusters with high performance networking)
139
+ network_type, _ = network_future.result()
140
+ if network_type.supports_high_performance_networking():
141
+ unsupported_features.pop(
142
+ clouds.CloudImplementationFeatures.
143
+ CUSTOM_NETWORK_TIER, None)
144
+ except exceptions.KubeAPIUnreachableError as e:
145
+ cls._log_unreachable_context(context, str(e))
130
146
  return unsupported_features
131
147
 
132
148
  @classmethod
@@ -149,7 +165,7 @@ class Kubernetes(clouds.Cloud):
149
165
  'Ignoring these contexts.')
150
166
 
151
167
  @classmethod
152
- def existing_allowed_contexts(cls) -> List[str]:
168
+ def existing_allowed_contexts(cls, silent: bool = False) -> List[str]:
153
169
  """Get existing allowed contexts.
154
170
 
155
171
  If None is returned in the list, it means that we are running in a pod
@@ -162,15 +178,35 @@ class Kubernetes(clouds.Cloud):
162
178
 
163
179
  all_contexts = set(all_contexts)
164
180
 
165
- allowed_contexts = skypilot_config.get_nested(
166
- ('kubernetes', 'allowed_contexts'), None)
181
+ # Allowed_contexts specified for workspace should take precedence over
182
+ # the global allowed_contexts.
183
+ allowed_contexts = skypilot_config.get_workspace_cloud(
184
+ 'kubernetes').get('allowed_contexts', None)
185
+ if allowed_contexts is None:
186
+ allowed_contexts = skypilot_config.get_effective_region_config(
187
+ cloud='kubernetes',
188
+ region=None,
189
+ keys=('allowed_contexts',),
190
+ default_value=None)
191
+
192
+ # Exclude contexts starting with `ssh-`
193
+ # TODO(romilb): Remove when SSH Node Pools use a separate kubeconfig.
194
+ all_contexts = [
195
+ ctx for ctx in all_contexts if not ctx.startswith('ssh-')
196
+ ]
197
+
198
+ allow_all_contexts = allowed_contexts == 'all' or (
199
+ allowed_contexts is None and
200
+ env_options.Options.ALLOW_ALL_KUBERNETES_CONTEXTS.get())
201
+ if allow_all_contexts:
202
+ allowed_contexts = all_contexts
167
203
 
168
204
  if allowed_contexts is None:
169
205
  # Try kubeconfig if present
170
206
  current_context = (
171
207
  kubernetes_utils.get_current_kube_config_context_name())
172
- if (current_context is None and
173
- kubernetes_utils.is_incluster_config_available()):
208
+ if ((current_context is None or current_context.startswith('ssh-'))
209
+ and kubernetes_utils.is_incluster_config_available()):
174
210
  # If no kubeconfig contexts found, use in-cluster if available
175
211
  current_context = kubernetes.in_cluster_context_name()
176
212
  allowed_contexts = []
@@ -183,8 +219,12 @@ class Kubernetes(clouds.Cloud):
183
219
  if context in all_contexts:
184
220
  existing_contexts.append(context)
185
221
  else:
222
+ # Skip SSH Node Pool contexts
223
+ if context.startswith('ssh-'):
224
+ continue
186
225
  skipped_contexts.append(context)
187
- cls._log_skipped_contexts_once(tuple(skipped_contexts))
226
+ if not silent:
227
+ cls._log_skipped_contexts_once(tuple(skipped_contexts))
188
228
  return existing_contexts
189
229
 
190
230
  @classmethod
@@ -218,10 +258,15 @@ class Kubernetes(clouds.Cloud):
218
258
  'refresh Kubernetes availability if permanent.')
219
259
 
220
260
  @classmethod
221
- def regions_with_offering(cls, instance_type: Optional[str],
222
- accelerators: Optional[Dict[str, int]],
223
- use_spot: bool, region: Optional[str],
224
- zone: Optional[str]) -> List[clouds.Region]:
261
+ def regions_with_offering(
262
+ cls,
263
+ instance_type: Optional[str],
264
+ accelerators: Optional[Dict[str, int]],
265
+ use_spot: bool,
266
+ region: Optional[str],
267
+ zone: Optional[str],
268
+ resources: Optional['resources_lib.Resources'] = None,
269
+ ) -> List[clouds.Region]:
225
270
  del accelerators, zone, use_spot # unused
226
271
  existing_contexts = cls.existing_allowed_contexts()
227
272
 
@@ -231,6 +276,19 @@ class Kubernetes(clouds.Cloud):
231
276
 
232
277
  if region is not None:
233
278
  regions = [r for r in regions if r.name == region]
279
+ if resources is not None:
280
+ filtered_regions = []
281
+ resources_required_features = resources.get_required_cloud_features(
282
+ )
283
+ for r in regions:
284
+ try:
285
+ cls.check_features_are_supported(
286
+ resources, resources_required_features, r.name)
287
+ filtered_regions.append(r)
288
+ except exceptions.NotSupportedError as e:
289
+ logger.info(f'Filter out context: {r.name}, reason: {e}')
290
+ continue
291
+ regions = filtered_regions
234
292
 
235
293
  # Check if requested instance type will fit in the cluster.
236
294
  # TODO(zhwu,romilb): autoscaler type needs to be regional (per
@@ -238,22 +296,6 @@ class Kubernetes(clouds.Cloud):
238
296
  if instance_type is None:
239
297
  return regions
240
298
 
241
- autoscaler_type = kubernetes_utils.get_autoscaler_type()
242
- if (autoscaler_type is not None and not kubernetes_utils.get_autoscaler(
243
- autoscaler_type).can_query_backend):
244
- # Unsupported autoscaler type. Rely on the autoscaler to
245
- # provision the right instance type without running checks.
246
- # Worst case, if autoscaling fails, the pod will be stuck in
247
- # pending state until provision_timeout, after which failover
248
- # will be triggered.
249
- #
250
- # Removing this if statement produces the same behavior,
251
- # because can_create_new_instance_of_type() always returns True
252
- # for unsupported autoscaler types.
253
- # This check is here as a performance optimization to avoid
254
- # further code executions that is known to return this result.
255
- return regions
256
-
257
299
  regions_to_return = []
258
300
  for r in regions:
259
301
  context = r.name
@@ -270,9 +312,34 @@ class Kubernetes(clouds.Cloud):
270
312
  'not fit in the existing Kubernetes cluster '
271
313
  'with context: '
272
314
  f'{context}. Reason: {reason}')
315
+
316
+ autoscaler_type = skypilot_config.get_effective_region_config(
317
+ cloud='kubernetes',
318
+ region=context,
319
+ keys=('autoscaler',),
320
+ default_value=None)
321
+ if (autoscaler_type is not None and
322
+ not kubernetes_utils.get_autoscaler(
323
+ kubernetes_enums.KubernetesAutoscalerType(
324
+ autoscaler_type)).can_query_backend):
325
+ # Unsupported autoscaler type. Rely on the autoscaler to
326
+ # provision the right instance type without running checks.
327
+ # Worst case, if autoscaling fails, the pod will be stuck in
328
+ # pending state until provision_timeout, after which failover
329
+ # will be triggered.
330
+ #
331
+ # Removing this if statement produces the same behavior,
332
+ # because can_create_new_instance_of_type() always returns True
333
+ # for unsupported autoscaler types.
334
+ # This check is here as a performance optimization to avoid
335
+ # further code executions that is known to return this result.
336
+ regions_to_return.append(r)
337
+ continue
338
+
273
339
  if autoscaler_type is None:
274
340
  continue
275
- autoscaler = kubernetes_utils.get_autoscaler(autoscaler_type)
341
+ autoscaler = kubernetes_utils.get_autoscaler(
342
+ kubernetes_enums.KubernetesAutoscalerType(autoscaler_type))
276
343
  logger.debug(f'{context} has autoscaler of type: {autoscaler_type}')
277
344
  if autoscaler.can_create_new_instance_of_type(
278
345
  context, instance_type):
@@ -312,10 +379,12 @@ class Kubernetes(clouds.Cloud):
312
379
  cls,
313
380
  cpus: Optional[str] = None,
314
381
  memory: Optional[str] = None,
315
- disk_tier: Optional['resources_utils.DiskTier'] = None) -> str:
382
+ disk_tier: Optional['resources_utils.DiskTier'] = None,
383
+ region: Optional[str] = None,
384
+ zone: Optional[str] = None) -> str:
316
385
  # TODO(romilb): In the future, we may want to move the instance type
317
386
  # selection + availability checking to a kubernetes_catalog module.
318
- del disk_tier # Unused.
387
+ del disk_tier, region, zone # Unused.
319
388
  # We strip '+' from resource requests since Kubernetes can provision
320
389
  # exactly the requested resources.
321
390
  instance_cpus = float(
@@ -379,7 +448,11 @@ class Kubernetes(clouds.Cloud):
379
448
  return 0
380
449
 
381
450
  @staticmethod
382
- def _calculate_provision_timeout(num_nodes: int) -> int:
451
+ def _calculate_provision_timeout(
452
+ num_nodes: int,
453
+ volume_mounts: Optional[List['volume_lib.VolumeMount']],
454
+ enable_flex_start: bool,
455
+ ) -> int:
383
456
  """Calculate provision timeout based on number of nodes.
384
457
 
385
458
  The timeout scales linearly with the number of nodes to account for
@@ -387,6 +460,8 @@ class Kubernetes(clouds.Cloud):
387
460
 
388
461
  Args:
389
462
  num_nodes: Number of nodes being provisioned
463
+ volume_mounts: Volume mounts for the pod
464
+ enable_flex_start: Whether flex start is enabled
390
465
 
391
466
  Returns:
392
467
  Timeout in seconds
@@ -394,19 +469,38 @@ class Kubernetes(clouds.Cloud):
394
469
  base_timeout = 10 # Base timeout for single node
395
470
  per_node_timeout = 0.2 # Additional seconds per node
396
471
  max_timeout = 60 # Cap at 1 minute
472
+ if enable_flex_start:
473
+ # Flex start takes longer to provision.
474
+ base_timeout = 1200
475
+ per_node_timeout = 10
476
+ max_timeout = 2400
477
+ elif volume_mounts is not None:
478
+ for volume_mount in volume_mounts:
479
+ if (volume_mount.volume_config.type ==
480
+ volume_lib.VolumeType.PVC.value):
481
+ if (volume_mount.volume_config.config.get(
482
+ 'access_mode', '') ==
483
+ volume_lib.VolumeAccessMode.READ_WRITE_MANY.value):
484
+ # GKE may take several minutes to provision a PV
485
+ # supporting READ_WRITE_MANY with filestore.
486
+ base_timeout = 180
487
+ max_timeout = 240
488
+ break
397
489
 
398
490
  return int(
399
491
  min(base_timeout + (per_node_timeout * (num_nodes - 1)),
400
492
  max_timeout))
401
493
 
402
494
  def make_deploy_resources_variables(
403
- self,
404
- resources: 'resources_lib.Resources',
405
- cluster_name: 'resources_utils.ClusterName',
406
- region: Optional['clouds.Region'],
407
- zones: Optional[List['clouds.Zone']],
408
- num_nodes: int,
409
- dryrun: bool = False) -> Dict[str, Optional[str]]:
495
+ self,
496
+ resources: 'resources_lib.Resources',
497
+ cluster_name: 'resources_utils.ClusterName',
498
+ region: Optional['clouds.Region'],
499
+ zones: Optional[List['clouds.Zone']],
500
+ num_nodes: int,
501
+ dryrun: bool = False,
502
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
503
+ ) -> Dict[str, Optional[str]]:
410
504
  del cluster_name, zones, dryrun # Unused.
411
505
  if region is None:
412
506
  context = kubernetes_utils.get_current_kube_config_context_name()
@@ -414,8 +508,9 @@ class Kubernetes(clouds.Cloud):
414
508
  context = region.name
415
509
  assert context is not None, 'No context found in kubeconfig'
416
510
 
417
- r = resources
418
- acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
511
+ resources = resources.assert_launchable()
512
+ acc_dict = self.get_accelerators_from_instance_type(
513
+ resources.instance_type)
419
514
  custom_resources = resources_utils.make_ray_custom_resources_str(
420
515
  acc_dict)
421
516
 
@@ -426,8 +521,12 @@ class Kubernetes(clouds.Cloud):
426
521
  cpus = k.cpus
427
522
  mem = k.memory
428
523
  # Optionally populate accelerator information.
429
- acc_count = k.accelerator_count if k.accelerator_count else 0
430
- acc_type = k.accelerator_type if k.accelerator_type else None
524
+ acc_type = k.accelerator_type
525
+ acc_count = k.accelerator_count
526
+ if acc_type is not None and is_tpu_on_gke(acc_type):
527
+ acc_type, acc_count = normalize_tpu_accelerator_name(acc_type)
528
+ else:
529
+ acc_count = acc_count or 0
431
530
 
432
531
  def _get_image_id(resources: 'resources_lib.Resources') -> str:
433
532
  image_id_dict = resources.image_id
@@ -444,15 +543,18 @@ class Kubernetes(clouds.Cloud):
444
543
  # Select image based on whether we are using GPUs or not.
445
544
  image_id = self.IMAGE_GPU if acc_count > 0 else self.IMAGE_CPU
446
545
  # Get the container image ID from the service catalog.
447
- image_id = service_catalog.get_image_id_from_tag(
448
- image_id, clouds='kubernetes')
546
+ image_id = catalog.get_image_id_from_tag(image_id,
547
+ clouds='kubernetes')
449
548
  return image_id
450
549
 
451
550
  image_id = _get_image_id(resources)
452
- # TODO(romilb): Create a lightweight image for SSH jump host
453
- ssh_jump_image = service_catalog.get_image_id_from_tag(
454
- self.IMAGE_CPU, clouds='kubernetes')
455
551
 
552
+ # Set environment variables for the pod. Note that SkyPilot env vars
553
+ # are set separately when the task is run. These env vars are
554
+ # independent of the SkyPilot task to be run.
555
+ k8s_env_vars = {kubernetes.IN_CLUSTER_CONTEXT_NAME_ENV_VAR: context}
556
+
557
+ # Setup GPU/TPU labels and resource keys.
456
558
  k8s_acc_label_key = None
457
559
  k8s_acc_label_values = None
458
560
  k8s_topology_label_key = None
@@ -472,17 +574,32 @@ class Kubernetes(clouds.Cloud):
472
574
  tpu_requested = True
473
575
  k8s_resource_key = kubernetes_utils.TPU_RESOURCE_KEY
474
576
  else:
475
- k8s_resource_key = kubernetes_utils.get_gpu_resource_key()
577
+ k8s_resource_key = kubernetes_utils.get_gpu_resource_key(
578
+ context)
476
579
  else:
580
+ # If no GPUs are requested, we set NVIDIA_VISIBLE_DEVICES=none to
581
+ # maintain GPU isolation. This is to override the default behavior
582
+ # of Nvidia device plugin which would expose all GPUs to the pod
583
+ # when no GPUs are requested.
584
+ # Note that NVIDIA_VISIBLE_DEVICES is different from
585
+ # CUDA_VISIBLE_DEVICES - the latter is used to control which GPUs
586
+ # are visible to the application and is set inside the pod, while
587
+ # the former is used to control which GPUs are visible to the pod
588
+ # through the nvidia runtime.
589
+ # See: https://github.com/NVIDIA/k8s-device-plugin/issues/61
590
+ k8s_env_vars['NVIDIA_VISIBLE_DEVICES'] = 'none'
477
591
  avoid_label_keys = kubernetes_utils.get_accelerator_label_keys(
478
592
  context)
479
593
  if len(avoid_label_keys) == 0:
480
594
  avoid_label_keys = None
481
- port_mode = network_utils.get_port_mode(None)
595
+ port_mode = network_utils.get_port_mode(None, context)
482
596
 
483
- remote_identity = skypilot_config.get_nested(
484
- ('kubernetes', 'remote_identity'),
485
- schemas.get_default_remote_identity('kubernetes'))
597
+ remote_identity = skypilot_config.get_effective_region_config(
598
+ # TODO(kyuds): Support SSH node pools as well.
599
+ cloud='kubernetes',
600
+ region=context,
601
+ keys=('remote_identity',),
602
+ default_value=schemas.get_default_remote_identity('kubernetes'))
486
603
 
487
604
  if isinstance(remote_identity, dict):
488
605
  # If remote_identity is a dict, use the service account for the
@@ -496,20 +613,17 @@ class Kubernetes(clouds.Cloud):
496
613
  # If remote_identity is not a dict, use
497
614
  k8s_service_account_name = remote_identity
498
615
 
499
- if (k8s_service_account_name ==
500
- schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value):
501
- # SA name doesn't matter since automounting credentials is disabled
502
- k8s_service_account_name = 'default'
503
- k8s_automount_sa_token = 'false'
504
- elif (k8s_service_account_name ==
505
- schemas.RemoteIdentityOptions.SERVICE_ACCOUNT.value):
506
- # Use the default service account
616
+ lc = schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value
617
+ sa = schemas.RemoteIdentityOptions.SERVICE_ACCOUNT.value
618
+
619
+ if k8s_service_account_name == lc or k8s_service_account_name == sa:
620
+ # Use the default service account if remote identity is not set.
621
+ # For LOCAL_CREDENTIALS, this is for in-cluster authentication
622
+ # which needs a serviceaccount (specifically for SSH node pools
623
+ # which uses in-cluster authentication internally, and we would
624
+ # like to support exec-auth when the user is also using SSH infra)
507
625
  k8s_service_account_name = (
508
626
  kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME)
509
- k8s_automount_sa_token = 'true'
510
- else:
511
- # User specified a custom service account
512
- k8s_automount_sa_token = 'true'
513
627
 
514
628
  fuse_device_required = bool(resources.requires_fuse)
515
629
 
@@ -518,26 +632,22 @@ class Kubernetes(clouds.Cloud):
518
632
  if resources.use_spot:
519
633
  spot_label_key, spot_label_value = kubernetes_utils.get_spot_label()
520
634
 
521
- # Timeout for resource provisioning. This timeout determines how long to
522
- # wait for pod to be in pending status before giving up.
523
- # Larger timeout may be required for autoscaling clusters, since
524
- # autoscaler may take some time to provision new nodes.
525
- # Note that this timeout includes time taken by the Kubernetes scheduler
526
- # itself, which can be upto 2-3 seconds, and up to 10-15 seconds when
527
- # scheduling 100s of pods.
528
- # We use a linear scaling formula to determine the timeout based on the
529
- # number of nodes.
530
-
531
- timeout = self._calculate_provision_timeout(num_nodes)
532
- timeout = skypilot_config.get_nested(
533
- ('kubernetes', 'provision_timeout'),
534
- timeout,
535
- override_configs=resources.cluster_config_overrides)
536
-
537
- # Set environment variables for the pod. Note that SkyPilot env vars
538
- # are set separately when the task is run. These env vars are
539
- # independent of the SkyPilot task to be run.
540
- k8s_env_vars = {kubernetes.IN_CLUSTER_CONTEXT_NAME_ENV_VAR: context}
635
+ network_type, machine_type = self._detect_network_type(
636
+ context, resources.network_tier)
637
+
638
+ # Check if this cluster supports high performance networking and
639
+ # configure appropriate settings for different cluster types
640
+ if (resources.network_tier is not None and
641
+ resources.network_tier == resources_utils.NetworkTier.BEST):
642
+ # Only proceed if CUSTOM_NETWORK_TIER is supported by this cluster
643
+ unsupported_features = self._unsupported_features_for_resources(
644
+ resources)
645
+ if clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER \
646
+ not in unsupported_features:
647
+ # Add high-performance networking environment variables for
648
+ # clusters with high performance networking
649
+ network_env_vars = network_type.get_network_env_vars()
650
+ k8s_env_vars.update(network_env_vars)
541
651
 
542
652
  # We specify object-store-memory to be 500MB to avoid taking up too
543
653
  # much memory on the head node. 'num-cpus' should be set to limit
@@ -551,9 +661,63 @@ class Kubernetes(clouds.Cloud):
551
661
  }
552
662
 
553
663
  # Get the storage class name for high availability controller's PVC
554
- k8s_ha_storage_class_name = skypilot_config.get_nested(
555
- ('kubernetes', 'high_availability', 'storage_class_name'),
556
- None,
664
+ k8s_ha_storage_class_name = (
665
+ skypilot_config.get_effective_region_config(
666
+ cloud='kubernetes',
667
+ region=context,
668
+ keys=('high_availability', 'storage_class_name'),
669
+ default_value=None))
670
+
671
+ k8s_kueue_local_queue_name = (
672
+ skypilot_config.get_effective_region_config(
673
+ # TODO(kyuds): Support SSH node pools as well.
674
+ cloud='kubernetes',
675
+ region=context,
676
+ keys=('kueue', 'local_queue_name'),
677
+ default_value=None,
678
+ override_configs=resources.cluster_config_overrides))
679
+
680
+ # Check DWS configuration for GKE.
681
+ (enable_flex_start, enable_flex_start_queued_provisioning,
682
+ max_run_duration_seconds) = gcp_utils.get_dws_config(
683
+ context, k8s_kueue_local_queue_name,
684
+ resources.cluster_config_overrides)
685
+ if enable_flex_start_queued_provisioning or enable_flex_start:
686
+ # DWS is only supported in GKE, check the autoscaler type.
687
+ autoscaler_type = skypilot_config.get_effective_region_config(
688
+ # TODO(kyuds): Support SSH node pools as well.
689
+ cloud='kubernetes',
690
+ region=context,
691
+ keys=('autoscaler',),
692
+ default_value=None)
693
+ if (autoscaler_type !=
694
+ kubernetes_enums.KubernetesAutoscalerType.GKE.value):
695
+ raise ValueError(
696
+ f'DWS is only supported in GKE, but the autoscaler type '
697
+ f'for context {context} is {autoscaler_type}')
698
+
699
+ # Timeout for resource provisioning. This timeout determines how long to
700
+ # wait for pod to be in pending status before giving up.
701
+ # Larger timeout may be required for autoscaling clusters, since
702
+ # autoscaler may take some time to provision new nodes.
703
+ # Note that this timeout includes time taken by the Kubernetes scheduler
704
+ # itself, which can be upto 2-3 seconds, and up to 10-15 seconds when
705
+ # scheduling 100s of pods.
706
+ # We use a linear scaling formula to determine the timeout based on the
707
+ # number of nodes.
708
+
709
+ timeout = self._calculate_provision_timeout(
710
+ num_nodes, volume_mounts, enable_flex_start or
711
+ enable_flex_start_queued_provisioning)
712
+
713
+ # Use _REPR, instead of directly using 'kubernetes' as the config key,
714
+ # because it could be SSH node pool as well.
715
+ cloud_config_str = self._REPR.lower()
716
+ timeout = skypilot_config.get_effective_region_config(
717
+ cloud=cloud_config_str,
718
+ region=context,
719
+ keys=('provision_timeout',),
720
+ default_value=timeout,
557
721
  override_configs=resources.cluster_config_overrides)
558
722
 
559
723
  deploy_vars = {
@@ -564,15 +728,12 @@ class Kubernetes(clouds.Cloud):
564
728
  'accelerator_count': str(acc_count),
565
729
  'timeout': str(timeout),
566
730
  'k8s_port_mode': port_mode.value,
567
- 'k8s_networking_mode': network_utils.get_networking_mode().value,
568
- 'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME,
569
731
  'k8s_acc_label_key': k8s_acc_label_key,
570
732
  'k8s_acc_label_values': k8s_acc_label_values,
571
- 'k8s_ssh_jump_name': self.SKY_SSH_JUMP_NAME,
572
- 'k8s_ssh_jump_image': ssh_jump_image,
573
733
  'k8s_service_account_name': k8s_service_account_name,
574
- 'k8s_automount_sa_token': k8s_automount_sa_token,
734
+ 'k8s_automount_sa_token': 'true',
575
735
  'k8s_fuse_device_required': fuse_device_required,
736
+ 'k8s_kueue_local_queue_name': k8s_kueue_local_queue_name,
576
737
  # Namespace to run the fusermount-server daemonset in
577
738
  'k8s_skypilot_system_namespace': _SKYPILOT_SYSTEM_NAMESPACE,
578
739
  'k8s_fusermount_shared_dir': _FUSERMOUNT_SHARED_DIR,
@@ -600,9 +761,17 @@ class Kubernetes(clouds.Cloud):
600
761
  (constants.PERSISTENT_SETUP_SCRIPT_PATH),
601
762
  'k8s_high_availability_deployment_run_script_dir':
602
763
  (constants.PERSISTENT_RUN_SCRIPT_DIR),
764
+ 'k8s_high_availability_restarting_signal_file':
765
+ (constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE),
766
+ 'ha_recovery_log_path':
767
+ constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format(''),
768
+ 'sky_python_cmd': constants.SKY_PYTHON_CMD,
603
769
  'k8s_high_availability_storage_class_name':
604
770
  (k8s_ha_storage_class_name),
605
771
  'avoid_label_keys': avoid_label_keys,
772
+ 'k8s_enable_flex_start': enable_flex_start,
773
+ 'k8s_max_run_duration_seconds': max_run_duration_seconds,
774
+ 'k8s_network_type': network_type.value,
606
775
  }
607
776
 
608
777
  # Add kubecontext if it is set. It may be None if SkyPilot is running
@@ -613,13 +782,43 @@ class Kubernetes(clouds.Cloud):
613
782
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
614
783
  deploy_vars['k8s_namespace'] = namespace
615
784
 
785
+ # Add backward compatibility template variables for GPUDirect variants
786
+ deploy_vars['k8s_enable_gpudirect_tcpx'] = (
787
+ network_type == KubernetesHighPerformanceNetworkType.GCP_TCPX)
788
+ deploy_vars['k8s_enable_gpudirect_tcpxo'] = (
789
+ network_type == KubernetesHighPerformanceNetworkType.GCP_TCPXO)
790
+ rdma_enabled = (network_type ==
791
+ KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA)
792
+ deploy_vars['k8s_enable_gpudirect_rdma'] = rdma_enabled
793
+ if rdma_enabled and machine_type.startswith('a4'):
794
+ deploy_vars['k8s_enable_gpudirect_rdma_a4'] = True
795
+ else:
796
+ deploy_vars['k8s_enable_gpudirect_rdma_a4'] = False
797
+
798
+ deploy_vars['k8s_ipc_lock_capability'] = (
799
+ network_type.requires_ipc_lock_capability())
800
+
616
801
  return deploy_vars
617
802
 
803
+ @staticmethod
804
+ def _warn_on_disk_size(resources: 'resources_lib.Resources'):
805
+ if resources.disk_size != resources_lib.DEFAULT_DISK_SIZE_GB:
806
+ logger.info(f'{colorama.Style.DIM}Disk size {resources.disk_size} '
807
+ 'is not supported by Kubernetes. '
808
+ 'To add additional disk, use volumes.'
809
+ f'{colorama.Style.RESET_ALL}')
810
+ if resources.disk_tier is not None:
811
+ logger.info(f'{colorama.Style.DIM}Disk tier {resources.disk_tier} '
812
+ 'is not supported by Kubernetes. '
813
+ 'To add additional disk, use volumes.'
814
+ f'{colorama.Style.RESET_ALL}')
815
+
618
816
  def _get_feasible_launchable_resources(
619
817
  self, resources: 'resources_lib.Resources'
620
818
  ) -> 'resources_utils.FeasibleResources':
621
819
  # TODO(zhwu): This needs to be updated to return the correct region
622
820
  # (context) that has enough resources.
821
+ self._warn_on_disk_size(resources)
623
822
  fuzzy_candidate_list: List[str] = []
624
823
  if resources.instance_type is not None:
625
824
  assert resources.is_launchable(), resources
@@ -628,7 +827,8 @@ class Kubernetes(clouds.Cloud):
628
827
  accelerators=resources.accelerators,
629
828
  use_spot=resources.use_spot,
630
829
  region=resources.region,
631
- zone=resources.zone)
830
+ zone=resources.zone,
831
+ resources=resources)
632
832
  if not regions:
633
833
  return resources_utils.FeasibleResources([], [], None)
634
834
  resources = resources.copy(accelerators=None)
@@ -639,7 +839,7 @@ class Kubernetes(clouds.Cloud):
639
839
  resource_list = []
640
840
  for instance_type in instance_list:
641
841
  r = resources.copy(
642
- cloud=Kubernetes(),
842
+ cloud=self.__class__(),
643
843
  instance_type=instance_type,
644
844
  accelerators=None,
645
845
  )
@@ -652,7 +852,9 @@ class Kubernetes(clouds.Cloud):
652
852
  default_instance_type = Kubernetes.get_default_instance_type(
653
853
  cpus=resources.cpus,
654
854
  memory=resources.memory,
655
- disk_tier=resources.disk_tier)
855
+ disk_tier=resources.disk_tier,
856
+ region=resources.region,
857
+ zone=resources.zone)
656
858
 
657
859
  if accelerators is None:
658
860
  # For CPU only clusters, need no special handling
@@ -661,12 +863,18 @@ class Kubernetes(clouds.Cloud):
661
863
  assert len(accelerators) == 1, resources
662
864
  # GPUs requested - build instance type.
663
865
  acc_type, acc_count = list(accelerators.items())[0]
866
+ # If acc_type contains spaces, return empty list since Kubernetes
867
+ # does not support spaces in label values
868
+ if ' ' in acc_type:
869
+ return resources_utils.FeasibleResources([], [], None)
664
870
 
665
871
  # Parse into KubernetesInstanceType
666
872
  k8s_instance_type = (kubernetes_utils.KubernetesInstanceType.
667
873
  from_instance_type(default_instance_type))
668
874
 
669
875
  gpu_task_cpus = k8s_instance_type.cpus
876
+ if resources.cpus is None:
877
+ gpu_task_cpus = self._DEFAULT_NUM_VCPUS_WITH_GPU * acc_count
670
878
  # Special handling to bump up memory multiplier for GPU instances
671
879
  gpu_task_memory = (float(resources.memory.strip('+')) if
672
880
  resources.memory is not None else gpu_task_cpus *
@@ -680,7 +888,8 @@ class Kubernetes(clouds.Cloud):
680
888
  accelerators=None,
681
889
  use_spot=resources.use_spot,
682
890
  region=resources.region,
683
- zone=resources.zone)
891
+ zone=resources.zone,
892
+ resources=resources)
684
893
  if not available_regions:
685
894
  return resources_utils.FeasibleResources([], [], None)
686
895
  # No fuzzy lists for Kubernetes
@@ -691,10 +900,47 @@ class Kubernetes(clouds.Cloud):
691
900
  [], None)
692
901
 
693
902
  @classmethod
694
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
903
+ def _check_single_context(cls, context: str) -> Tuple[bool, str]:
904
+ """Check if the user has access credentials to a single SSH context."""
905
+
906
+ def _red_color(str_to_format: str) -> str:
907
+ return (f'{colorama.Fore.LIGHTRED_EX}'
908
+ f'{str_to_format}'
909
+ f'{colorama.Style.RESET_ALL}')
910
+
911
+ def _dim_color(str_to_format: str) -> str:
912
+ return (f'{colorama.Style.DIM}'
913
+ f'{str_to_format}'
914
+ f'{colorama.Style.RESET_ALL}')
915
+
916
+ def _bright_green_color(str_to_format: str) -> str:
917
+ return (f'{colorama.Fore.GREEN}'
918
+ f'{str_to_format}'
919
+ f'{colorama.Style.RESET_ALL}')
920
+
921
+ try:
922
+ check_result = kubernetes_utils.check_credentials(
923
+ context, run_optional_checks=True)
924
+ if check_result[0]:
925
+ if check_result[1] is not None:
926
+ return True, (_bright_green_color('enabled.') +
927
+ _dim_color(f' Note: {check_result[1]}'))
928
+ else:
929
+ return True, _bright_green_color('enabled.')
930
+ else:
931
+ assert check_result[1] is not None
932
+ return False, (_red_color('disabled.') +
933
+ _dim_color(f' Reason: {check_result[1]}'))
934
+ except Exception as e: # pylint: disable=broad-except
935
+ return False, _red_color(str(e))
936
+
937
+ @classmethod
938
+ def _check_compute_credentials(
939
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
695
940
  """Checks if the user has access credentials to
696
941
  Kubernetes."""
697
942
  # Check for port forward dependencies
943
+ logger.debug(f'Checking compute credentials for {cls.canonical_name()}')
698
944
  reasons = kubernetes_utils.check_port_forward_mode_dependencies(False)
699
945
  if reasons is not None:
700
946
  formatted = '\n'.join(
@@ -718,26 +964,15 @@ class Kubernetes(clouds.Cloud):
718
964
  return (False, 'No available context found in kubeconfig. '
719
965
  'Check if you have a valid kubeconfig file' +
720
966
  check_skypilot_config_msg)
721
- reasons = []
722
- hints = []
967
+
968
+ ctx2text = {}
723
969
  success = False
724
970
  for context in existing_allowed_contexts:
725
- try:
726
- check_result = kubernetes_utils.check_credentials(
727
- context, run_optional_checks=True)
728
- if check_result[0]:
729
- success = True
730
- if check_result[1] is not None:
731
- hints.append(f'Context {context}: {check_result[1]}')
732
- else:
733
- reasons.append(f'Context {context}: {check_result[1]}')
734
- except Exception as e: # pylint: disable=broad-except
735
- return (False, f'Credential check failed for {context}: '
736
- f'{common_utils.format_exception(e)}')
737
- if success:
738
- return (True, cls._format_credential_check_results(hints, reasons))
739
- return (False, 'Failed to find available context with working '
740
- 'credentials. Details:\n' + '\n'.join(reasons))
971
+ suc, text = cls._check_single_context(context)
972
+ success = success or suc
973
+ ctx2text[context] = text
974
+
975
+ return success, ctx2text
741
976
 
742
977
  @classmethod
743
978
  def _format_credential_check_results(cls, hints: List[str],
@@ -768,10 +1003,28 @@ class Kubernetes(clouds.Cloud):
768
1003
  return ''.join(message_parts)
769
1004
 
770
1005
  def get_credential_file_mounts(self) -> Dict[str, str]:
771
- if os.path.exists(os.path.expanduser(CREDENTIAL_PATH)):
1006
+ credential_paths = kubernetes_utils.get_kubeconfig_paths()
1007
+ if credential_paths:
1008
+ # For single kubeconfig path, keep the original path.
1009
+ kubeconfig_file = credential_paths[0]
1010
+ if len(credential_paths) > 1:
1011
+ # For multiple kubeconfig paths, merge them into a single file.
1012
+ # TODO(aylei): GC merged kubeconfig files.
1013
+ kubeconfig_file = tempfile.NamedTemporaryFile(
1014
+ prefix='merged-kubeconfig-', suffix='.yaml',
1015
+ delete=False).name
1016
+ subprocess.run(
1017
+ 'kubectl config view --flatten '
1018
+ f'> {kubeconfig_file}',
1019
+ shell=True,
1020
+ check=True)
1021
+ if os.path.exists(kubeconfig_file):
1022
+ # convert auth plugin paths (e.g.: gke-gcloud-auth-plugin)
1023
+ kubeconfig_file = kubernetes_utils.format_kubeconfig_exec_auth_with_cache(kubeconfig_file) # pylint: disable=line-too-long
1024
+
772
1025
  # Upload kubeconfig to the default path to avoid having to set
773
1026
  # KUBECONFIG in the environment.
774
- return {DEFAULT_KUBECONFIG_PATH: CREDENTIAL_PATH}
1027
+ return {kubernetes.DEFAULT_KUBECONFIG_PATH: kubeconfig_file}
775
1028
  else:
776
1029
  return {}
777
1030
 
@@ -787,7 +1040,7 @@ class Kubernetes(clouds.Cloud):
787
1040
 
788
1041
  all_contexts = kubernetes_utils.get_all_kube_context_names()
789
1042
 
790
- if region not in all_contexts:
1043
+ if region and region not in all_contexts:
791
1044
  raise ValueError(
792
1045
  f'Context {region} not found in kubeconfig. Kubernetes only '
793
1046
  'supports context names as regions. Available '
@@ -810,11 +1063,11 @@ class Kubernetes(clouds.Cloud):
810
1063
 
811
1064
  @classmethod
812
1065
  def get_user_identities(cls) -> Optional[List[List[str]]]:
813
- k8s = kubernetes.kubernetes
814
1066
  identities = []
1067
+ k8s = kubernetes.kubernetes
815
1068
  try:
816
1069
  all_contexts, current_context = (
817
- k8s.config.list_kube_config_contexts())
1070
+ kubernetes.list_kube_config_contexts())
818
1071
  except k8s.config.config_exception.ConfigException:
819
1072
  return None
820
1073
  # Add current context at the head of the list
@@ -825,6 +1078,31 @@ class Kubernetes(clouds.Cloud):
825
1078
  identities.append(identity)
826
1079
  return identities
827
1080
 
1081
+ @classmethod
1082
+ def is_volume_name_valid(cls,
1083
+ volume_name: str) -> Tuple[bool, Optional[str]]:
1084
+ """Validates that the volume name is valid for this cloud.
1085
+
1086
+ Follows Kubernetes DNS-1123 subdomain rules:
1087
+ - must be <= 253 characters
1088
+ - must match: '[a-z0-9]([-a-z0-9]*[a-z0-9])?(.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*' # pylint: disable=line-too-long
1089
+ """
1090
+ # Max length per DNS-1123 subdomain
1091
+ if len(volume_name) > cls._MAX_VOLUME_NAME_LEN_LIMIT:
1092
+ return (False, f'Volume name exceeds the maximum length of '
1093
+ f'{cls._MAX_VOLUME_NAME_LEN_LIMIT} characters '
1094
+ '(DNS-1123 subdomain).')
1095
+
1096
+ # DNS-1123 label: [a-z0-9]([-a-z0-9]*[a-z0-9])?
1097
+ label = r'[a-z0-9]([-a-z0-9]*[a-z0-9])?'
1098
+ # DNS-1123 subdomain: label(\.-separated label)*
1099
+ subdomain_pattern = rf'^{label}(\.{label})*$'
1100
+ if re.fullmatch(subdomain_pattern, volume_name) is None:
1101
+ return (False, 'Volume name must be a valid DNS-1123 subdomain: '
1102
+ 'lowercase alphanumeric, "-", and "."; start/end with '
1103
+ 'alphanumeric.')
1104
+ return True, None
1105
+
828
1106
  @classmethod
829
1107
  def is_label_valid(cls, label_key: str,
830
1108
  label_value: str) -> Tuple[bool, Optional[str]]:
@@ -854,3 +1132,133 @@ class Kubernetes(clouds.Cloud):
854
1132
  if not key_valid or not value_valid:
855
1133
  return False, error_msg
856
1134
  return True, None
1135
+
1136
+ @classmethod
1137
+ def expand_infras(cls) -> List[str]:
1138
+ return [
1139
+ f'{cls.canonical_name()}/{c}'
1140
+ for c in cls.existing_allowed_contexts(silent=True)
1141
+ ]
1142
+
1143
+ @classmethod
1144
+ def _detect_network_type(
1145
+ cls,
1146
+ context: str,
1147
+ network_tier: Optional['resources_utils.NetworkTier'] = None
1148
+ ) -> Tuple[KubernetesHighPerformanceNetworkType, str]:
1149
+ """Detect the type of Kubernetes network based on node labels.
1150
+
1151
+ Args:
1152
+ context: The Kubernetes context to check.
1153
+ network_tier: The network tier requested. If None or not BEST,
1154
+ returns NONE (no high-performance networking).
1155
+
1156
+ Returns:
1157
+ A tuple of the detected network type and the instance type.
1158
+ """
1159
+ # If network_tier is None or not BEST, return NONE
1160
+ if (network_tier is None or
1161
+ network_tier != resources_utils.NetworkTier.BEST):
1162
+ return KubernetesHighPerformanceNetworkType.NONE, ''
1163
+
1164
+ try:
1165
+ nodes = kubernetes_utils.get_kubernetes_nodes(context=context)
1166
+ for node in nodes:
1167
+ if node.metadata.labels:
1168
+ # Check for Nebius clusters
1169
+ for label_key, _ in node.metadata.labels.items():
1170
+ if label_key.startswith('nebius.com/'):
1171
+ return (KubernetesHighPerformanceNetworkType.NEBIUS,
1172
+ '')
1173
+ if label_key.startswith('ib.coreweave.cloud/'):
1174
+ return (
1175
+ KubernetesHighPerformanceNetworkType.COREWEAVE,
1176
+ '')
1177
+
1178
+ # Check for GKE clusters with specific GPUDirect variants
1179
+ machine_family = node.metadata.labels.get(
1180
+ 'cloud.google.com/machine-family', '')
1181
+ instance_type = node.metadata.labels.get(
1182
+ 'node.kubernetes.io/instance-type', '')
1183
+ gke_accelerator = node.metadata.labels.get(
1184
+ 'cloud.google.com/gke-accelerator', '')
1185
+
1186
+ # Check if this is a GKE cluster with A3/A4 machine family
1187
+ if machine_family in ['a3', 'a4']:
1188
+ # Check instance type to determine specific GPUDirect
1189
+ # variant
1190
+ if 'a3-highgpu-8g' in instance_type:
1191
+ return (
1192
+ KubernetesHighPerformanceNetworkType.GCP_TCPX,
1193
+ 'a3-highgpu-8g')
1194
+ elif 'a3-edgegpu-8g' in instance_type:
1195
+ return (
1196
+ KubernetesHighPerformanceNetworkType.GCP_TCPX,
1197
+ 'a3-edgegpu-8g')
1198
+ elif 'a3-megagpu-8g' in instance_type:
1199
+ return (
1200
+ KubernetesHighPerformanceNetworkType.GCP_TCPXO,
1201
+ 'a3-megagpu-8g')
1202
+ elif 'a4-highgpu-8g' in instance_type:
1203
+ return (KubernetesHighPerformanceNetworkType.
1204
+ GCP_GPUDIRECT_RDMA, 'a4-highgpu-8g')
1205
+ elif 'a3-ultragpu-8g' in instance_type:
1206
+ return (KubernetesHighPerformanceNetworkType.
1207
+ GCP_GPUDIRECT_RDMA, 'a3-ultragpu-8g')
1208
+ # Generic A3/A4 detection as fallback
1209
+ elif machine_family == 'a4':
1210
+ return (KubernetesHighPerformanceNetworkType.
1211
+ GCP_GPUDIRECT_RDMA, 'a4')
1212
+
1213
+ # Fallback: Check for GPU Direct TCPX capable instance
1214
+ # types with high-perf GPUs
1215
+ is_gpu_direct_tcpx_instance = (
1216
+ instance_type
1217
+ in gcp_constants.GPU_DIRECT_TCPX_INSTANCE_TYPES)
1218
+ has_high_perf_gpu = ('nvidia-h100' in gke_accelerator or
1219
+ 'nvidia-h200' in gke_accelerator or
1220
+ 'nvidia-b200' in gke_accelerator)
1221
+
1222
+ if is_gpu_direct_tcpx_instance and has_high_perf_gpu:
1223
+ # Default to TCPX if we can't determine the specific
1224
+ # variant
1225
+ return (KubernetesHighPerformanceNetworkType.GCP_TCPX,
1226
+ instance_type)
1227
+
1228
+ except exceptions.KubeAPIUnreachableError:
1229
+ # If we can't reach the cluster, assume no high perf networking
1230
+ pass
1231
+
1232
+ # If we cannot determine the network type based on nodes
1233
+ # Check if the cluster has any node pools with autoscaling enabled
1234
+ # with machine types that support high perf networking for GKE.
1235
+ autoscaler_type = skypilot_config.get_effective_region_config(
1236
+ cloud='kubernetes',
1237
+ region=context,
1238
+ keys=('autoscaler',),
1239
+ default_value=None)
1240
+ if (autoscaler_type !=
1241
+ kubernetes_enums.KubernetesAutoscalerType.GKE.value):
1242
+ return KubernetesHighPerformanceNetworkType.NONE, ''
1243
+ autoscaler = kubernetes_utils.get_autoscaler(
1244
+ kubernetes_enums.KubernetesAutoscalerType(autoscaler_type))
1245
+ logger.debug(f'{context} has autoscaler of type: {autoscaler_type}')
1246
+ machine_types = autoscaler.get_available_machine_types(context)
1247
+ # Check if any machine type supports high perf networking for GKE.
1248
+ if 'a3-highgpu-8g' in machine_types:
1249
+ return (KubernetesHighPerformanceNetworkType.GCP_TCPX,
1250
+ 'a3-highgpu-8g')
1251
+ elif 'a3-edgegpu-8g' in machine_types:
1252
+ return (KubernetesHighPerformanceNetworkType.GCP_TCPX,
1253
+ 'a3-edgegpu-8g')
1254
+ elif 'a3-megagpu-8g' in machine_types:
1255
+ return (KubernetesHighPerformanceNetworkType.GCP_TCPXO,
1256
+ 'a3-megagpu-8g')
1257
+ elif 'a4-highgpu-8g' in machine_types:
1258
+ return (KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA,
1259
+ 'a4-highgpu-8g')
1260
+ elif 'a3-ultragpu-8g' in machine_types:
1261
+ return (KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA,
1262
+ 'a3-ultragpu-8g')
1263
+
1264
+ return KubernetesHighPerformanceNetworkType.NONE, ''