skypilot-nightly 1.0.0.dev20250509__py3-none-any.whl → 1.0.0.dev20251107__py3-none-any.whl

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of skypilot-nightly might be problematic. Click here for more details.

Files changed (512)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +25 -7
  3. sky/adaptors/common.py +24 -1
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/hyperbolic.py +8 -0
  7. sky/adaptors/kubernetes.py +149 -18
  8. sky/adaptors/nebius.py +170 -17
  9. sky/adaptors/primeintellect.py +1 -0
  10. sky/adaptors/runpod.py +68 -0
  11. sky/adaptors/seeweb.py +167 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +187 -4
  14. sky/authentication.py +179 -225
  15. sky/backends/__init__.py +4 -2
  16. sky/backends/backend.py +22 -9
  17. sky/backends/backend_utils.py +1299 -380
  18. sky/backends/cloud_vm_ray_backend.py +1715 -518
  19. sky/backends/docker_utils.py +1 -1
  20. sky/backends/local_docker_backend.py +11 -6
  21. sky/backends/wheel_utils.py +37 -9
  22. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  23. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  24. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  25. sky/{clouds/service_catalog → catalog}/common.py +89 -48
  26. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  27. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  28. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +30 -40
  29. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +42 -15
  31. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  33. sky/catalog/data_fetchers/fetch_nebius.py +335 -0
  34. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  35. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  36. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  37. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  38. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  39. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  40. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  41. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  42. sky/catalog/hyperbolic_catalog.py +136 -0
  43. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  44. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  45. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  46. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  47. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  48. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  49. sky/catalog/primeintellect_catalog.py +95 -0
  50. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  51. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  52. sky/catalog/seeweb_catalog.py +184 -0
  53. sky/catalog/shadeform_catalog.py +165 -0
  54. sky/catalog/ssh_catalog.py +167 -0
  55. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  56. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  57. sky/check.py +491 -203
  58. sky/cli.py +5 -6005
  59. sky/client/{cli.py → cli/command.py} +2477 -1885
  60. sky/client/cli/deprecation_utils.py +99 -0
  61. sky/client/cli/flags.py +359 -0
  62. sky/client/cli/table_utils.py +320 -0
  63. sky/client/common.py +70 -32
  64. sky/client/oauth.py +82 -0
  65. sky/client/sdk.py +1203 -297
  66. sky/client/sdk_async.py +833 -0
  67. sky/client/service_account_auth.py +47 -0
  68. sky/cloud_stores.py +73 -0
  69. sky/clouds/__init__.py +13 -0
  70. sky/clouds/aws.py +358 -93
  71. sky/clouds/azure.py +105 -83
  72. sky/clouds/cloud.py +127 -36
  73. sky/clouds/cudo.py +68 -50
  74. sky/clouds/do.py +66 -48
  75. sky/clouds/fluidstack.py +63 -44
  76. sky/clouds/gcp.py +339 -110
  77. sky/clouds/hyperbolic.py +293 -0
  78. sky/clouds/ibm.py +70 -49
  79. sky/clouds/kubernetes.py +563 -162
  80. sky/clouds/lambda_cloud.py +74 -54
  81. sky/clouds/nebius.py +206 -80
  82. sky/clouds/oci.py +88 -66
  83. sky/clouds/paperspace.py +61 -44
  84. sky/clouds/primeintellect.py +317 -0
  85. sky/clouds/runpod.py +164 -74
  86. sky/clouds/scp.py +89 -83
  87. sky/clouds/seeweb.py +466 -0
  88. sky/clouds/shadeform.py +400 -0
  89. sky/clouds/ssh.py +263 -0
  90. sky/clouds/utils/aws_utils.py +10 -4
  91. sky/clouds/utils/gcp_utils.py +87 -11
  92. sky/clouds/utils/oci_utils.py +38 -14
  93. sky/clouds/utils/scp_utils.py +177 -124
  94. sky/clouds/vast.py +99 -77
  95. sky/clouds/vsphere.py +51 -40
  96. sky/core.py +349 -139
  97. sky/dag.py +15 -0
  98. sky/dashboard/out/404.html +1 -1
  99. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  100. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  101. sky/dashboard/out/_next/static/chunks/1871-74503c8e80fd253b.js +6 -0
  102. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  105. sky/dashboard/out/_next/static/chunks/2755.fff53c4a3fcae910.js +26 -0
  106. sky/dashboard/out/_next/static/chunks/3294.72362fa129305b19.js +1 -0
  107. sky/dashboard/out/_next/static/chunks/3785.ad6adaa2a0fa9768.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  110. sky/dashboard/out/_next/static/chunks/4725.a830b5c9e7867c92.js +1 -0
  111. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  112. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  113. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  115. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  116. sky/dashboard/out/_next/static/chunks/6601-06114c982db410b6.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/6856-ef8ba11f96d8c4a3.js +1 -0
  118. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  119. sky/dashboard/out/_next/static/chunks/6990-32b6e2d3822301fa.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  121. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  122. sky/dashboard/out/_next/static/chunks/7615-3301e838e5f25772.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  124. sky/dashboard/out/_next/static/chunks/8969-1e4613c651bf4051.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  126. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  127. sky/dashboard/out/_next/static/chunks/9360.7310982cf5a0dc79.js +31 -0
  128. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  129. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  131. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  133. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  134. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-c736ead69c2d86ec.js +16 -0
  136. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-a37d2063af475a1c.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/pages/clusters-d44859594e6f8064.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  139. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  141. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-5796e8d6aea291a0.js +16 -0
  143. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-6edeb7d06032adfc.js +21 -0
  144. sky/dashboard/out/_next/static/chunks/pages/jobs-479dde13399cf270.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/users-5ab3b907622cf0fe.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  148. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-c5a3eeee1c218af1.js +1 -0
  149. sky/dashboard/out/_next/static/chunks/pages/workspaces-22b23febb3e89ce1.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/webpack-2679be77fc08a2f8.js +1 -0
  151. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  152. sky/dashboard/out/_next/static/zB0ed6ge_W1MDszVHhijS/_buildManifest.js +1 -0
  153. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  154. sky/dashboard/out/clusters/[cluster].html +1 -1
  155. sky/dashboard/out/clusters.html +1 -1
  156. sky/dashboard/out/config.html +1 -0
  157. sky/dashboard/out/index.html +1 -1
  158. sky/dashboard/out/infra/[context].html +1 -0
  159. sky/dashboard/out/infra.html +1 -0
  160. sky/dashboard/out/jobs/[job].html +1 -1
  161. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  162. sky/dashboard/out/jobs.html +1 -1
  163. sky/dashboard/out/users.html +1 -0
  164. sky/dashboard/out/volumes.html +1 -0
  165. sky/dashboard/out/workspace/new.html +1 -0
  166. sky/dashboard/out/workspaces/[name].html +1 -0
  167. sky/dashboard/out/workspaces.html +1 -0
  168. sky/data/data_utils.py +137 -1
  169. sky/data/mounting_utils.py +269 -84
  170. sky/data/storage.py +1451 -1807
  171. sky/data/storage_utils.py +43 -57
  172. sky/exceptions.py +132 -2
  173. sky/execution.py +206 -63
  174. sky/global_user_state.py +2374 -586
  175. sky/jobs/__init__.py +5 -0
  176. sky/jobs/client/sdk.py +242 -65
  177. sky/jobs/client/sdk_async.py +143 -0
  178. sky/jobs/constants.py +9 -8
  179. sky/jobs/controller.py +839 -277
  180. sky/jobs/file_content_utils.py +80 -0
  181. sky/jobs/log_gc.py +201 -0
  182. sky/jobs/recovery_strategy.py +398 -152
  183. sky/jobs/scheduler.py +315 -189
  184. sky/jobs/server/core.py +829 -255
  185. sky/jobs/server/server.py +156 -115
  186. sky/jobs/server/utils.py +136 -0
  187. sky/jobs/state.py +2092 -701
  188. sky/jobs/utils.py +1242 -160
  189. sky/logs/__init__.py +21 -0
  190. sky/logs/agent.py +108 -0
  191. sky/logs/aws.py +243 -0
  192. sky/logs/gcp.py +91 -0
  193. sky/metrics/__init__.py +0 -0
  194. sky/metrics/utils.py +443 -0
  195. sky/models.py +78 -1
  196. sky/optimizer.py +164 -70
  197. sky/provision/__init__.py +90 -4
  198. sky/provision/aws/config.py +147 -26
  199. sky/provision/aws/instance.py +135 -50
  200. sky/provision/azure/instance.py +10 -5
  201. sky/provision/common.py +13 -1
  202. sky/provision/cudo/cudo_machine_type.py +1 -1
  203. sky/provision/cudo/cudo_utils.py +14 -8
  204. sky/provision/cudo/cudo_wrapper.py +72 -71
  205. sky/provision/cudo/instance.py +10 -6
  206. sky/provision/do/instance.py +10 -6
  207. sky/provision/do/utils.py +4 -3
  208. sky/provision/docker_utils.py +114 -23
  209. sky/provision/fluidstack/instance.py +13 -8
  210. sky/provision/gcp/__init__.py +1 -0
  211. sky/provision/gcp/config.py +301 -19
  212. sky/provision/gcp/constants.py +218 -0
  213. sky/provision/gcp/instance.py +36 -8
  214. sky/provision/gcp/instance_utils.py +18 -4
  215. sky/provision/gcp/volume_utils.py +247 -0
  216. sky/provision/hyperbolic/__init__.py +12 -0
  217. sky/provision/hyperbolic/config.py +10 -0
  218. sky/provision/hyperbolic/instance.py +437 -0
  219. sky/provision/hyperbolic/utils.py +373 -0
  220. sky/provision/instance_setup.py +93 -14
  221. sky/provision/kubernetes/__init__.py +5 -0
  222. sky/provision/kubernetes/config.py +9 -52
  223. sky/provision/kubernetes/constants.py +17 -0
  224. sky/provision/kubernetes/instance.py +789 -247
  225. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  226. sky/provision/kubernetes/network.py +27 -17
  227. sky/provision/kubernetes/network_utils.py +40 -43
  228. sky/provision/kubernetes/utils.py +1192 -531
  229. sky/provision/kubernetes/volume.py +282 -0
  230. sky/provision/lambda_cloud/instance.py +22 -16
  231. sky/provision/nebius/constants.py +50 -0
  232. sky/provision/nebius/instance.py +19 -6
  233. sky/provision/nebius/utils.py +196 -91
  234. sky/provision/oci/instance.py +10 -5
  235. sky/provision/paperspace/instance.py +10 -7
  236. sky/provision/paperspace/utils.py +1 -1
  237. sky/provision/primeintellect/__init__.py +10 -0
  238. sky/provision/primeintellect/config.py +11 -0
  239. sky/provision/primeintellect/instance.py +454 -0
  240. sky/provision/primeintellect/utils.py +398 -0
  241. sky/provision/provisioner.py +110 -36
  242. sky/provision/runpod/__init__.py +5 -0
  243. sky/provision/runpod/instance.py +27 -6
  244. sky/provision/runpod/utils.py +51 -18
  245. sky/provision/runpod/volume.py +180 -0
  246. sky/provision/scp/__init__.py +15 -0
  247. sky/provision/scp/config.py +93 -0
  248. sky/provision/scp/instance.py +531 -0
  249. sky/provision/seeweb/__init__.py +11 -0
  250. sky/provision/seeweb/config.py +13 -0
  251. sky/provision/seeweb/instance.py +807 -0
  252. sky/provision/shadeform/__init__.py +11 -0
  253. sky/provision/shadeform/config.py +12 -0
  254. sky/provision/shadeform/instance.py +351 -0
  255. sky/provision/shadeform/shadeform_utils.py +83 -0
  256. sky/provision/ssh/__init__.py +18 -0
  257. sky/provision/vast/instance.py +13 -8
  258. sky/provision/vast/utils.py +10 -7
  259. sky/provision/vsphere/common/vim_utils.py +1 -2
  260. sky/provision/vsphere/instance.py +15 -10
  261. sky/provision/vsphere/vsphere_utils.py +9 -19
  262. sky/py.typed +0 -0
  263. sky/resources.py +844 -118
  264. sky/schemas/__init__.py +0 -0
  265. sky/schemas/api/__init__.py +0 -0
  266. sky/schemas/api/responses.py +225 -0
  267. sky/schemas/db/README +4 -0
  268. sky/schemas/db/env.py +90 -0
  269. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  270. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  271. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  272. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  273. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  274. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  275. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  276. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  277. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  278. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  279. sky/schemas/db/script.py.mako +28 -0
  280. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  281. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  282. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  283. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  284. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  285. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  286. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  287. sky/schemas/generated/__init__.py +0 -0
  288. sky/schemas/generated/autostopv1_pb2.py +36 -0
  289. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  290. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  291. sky/schemas/generated/jobsv1_pb2.py +86 -0
  292. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  293. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  294. sky/schemas/generated/managed_jobsv1_pb2.py +74 -0
  295. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  296. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  297. sky/schemas/generated/servev1_pb2.py +58 -0
  298. sky/schemas/generated/servev1_pb2.pyi +115 -0
  299. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  300. sky/serve/autoscalers.py +357 -5
  301. sky/serve/client/impl.py +310 -0
  302. sky/serve/client/sdk.py +47 -139
  303. sky/serve/client/sdk_async.py +130 -0
  304. sky/serve/constants.py +10 -8
  305. sky/serve/controller.py +64 -19
  306. sky/serve/load_balancer.py +106 -60
  307. sky/serve/load_balancing_policies.py +115 -1
  308. sky/serve/replica_managers.py +273 -162
  309. sky/serve/serve_rpc_utils.py +179 -0
  310. sky/serve/serve_state.py +554 -251
  311. sky/serve/serve_utils.py +733 -220
  312. sky/serve/server/core.py +66 -711
  313. sky/serve/server/impl.py +1093 -0
  314. sky/serve/server/server.py +21 -18
  315. sky/serve/service.py +133 -48
  316. sky/serve/service_spec.py +135 -16
  317. sky/serve/spot_placer.py +3 -0
  318. sky/server/auth/__init__.py +0 -0
  319. sky/server/auth/authn.py +50 -0
  320. sky/server/auth/loopback.py +38 -0
  321. sky/server/auth/oauth2_proxy.py +200 -0
  322. sky/server/common.py +475 -181
  323. sky/server/config.py +81 -23
  324. sky/server/constants.py +44 -6
  325. sky/server/daemons.py +229 -0
  326. sky/server/html/token_page.html +185 -0
  327. sky/server/metrics.py +160 -0
  328. sky/server/requests/executor.py +528 -138
  329. sky/server/requests/payloads.py +351 -17
  330. sky/server/requests/preconditions.py +21 -17
  331. sky/server/requests/process.py +112 -29
  332. sky/server/requests/request_names.py +120 -0
  333. sky/server/requests/requests.py +817 -224
  334. sky/server/requests/serializers/decoders.py +82 -31
  335. sky/server/requests/serializers/encoders.py +140 -22
  336. sky/server/requests/threads.py +106 -0
  337. sky/server/rest.py +417 -0
  338. sky/server/server.py +1290 -284
  339. sky/server/state.py +20 -0
  340. sky/server/stream_utils.py +345 -57
  341. sky/server/uvicorn.py +217 -3
  342. sky/server/versions.py +270 -0
  343. sky/setup_files/MANIFEST.in +5 -0
  344. sky/setup_files/alembic.ini +156 -0
  345. sky/setup_files/dependencies.py +136 -31
  346. sky/setup_files/setup.py +44 -42
  347. sky/sky_logging.py +102 -5
  348. sky/skylet/attempt_skylet.py +1 -0
  349. sky/skylet/autostop_lib.py +129 -8
  350. sky/skylet/configs.py +27 -20
  351. sky/skylet/constants.py +171 -19
  352. sky/skylet/events.py +105 -21
  353. sky/skylet/job_lib.py +335 -104
  354. sky/skylet/log_lib.py +297 -18
  355. sky/skylet/log_lib.pyi +44 -1
  356. sky/skylet/ray_patches/__init__.py +17 -3
  357. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  358. sky/skylet/ray_patches/cli.py.diff +19 -0
  359. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  360. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  361. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  362. sky/skylet/ray_patches/updater.py.diff +18 -0
  363. sky/skylet/ray_patches/worker.py.diff +41 -0
  364. sky/skylet/services.py +564 -0
  365. sky/skylet/skylet.py +63 -4
  366. sky/skylet/subprocess_daemon.py +103 -29
  367. sky/skypilot_config.py +506 -99
  368. sky/ssh_node_pools/__init__.py +1 -0
  369. sky/ssh_node_pools/core.py +135 -0
  370. sky/ssh_node_pools/server.py +233 -0
  371. sky/task.py +621 -137
  372. sky/templates/aws-ray.yml.j2 +10 -3
  373. sky/templates/azure-ray.yml.j2 +1 -1
  374. sky/templates/do-ray.yml.j2 +1 -1
  375. sky/templates/gcp-ray.yml.j2 +57 -0
  376. sky/templates/hyperbolic-ray.yml.j2 +67 -0
  377. sky/templates/jobs-controller.yaml.j2 +27 -24
  378. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  379. sky/templates/kubernetes-ray.yml.j2 +607 -51
  380. sky/templates/lambda-ray.yml.j2 +1 -1
  381. sky/templates/nebius-ray.yml.j2 +33 -12
  382. sky/templates/paperspace-ray.yml.j2 +1 -1
  383. sky/templates/primeintellect-ray.yml.j2 +71 -0
  384. sky/templates/runpod-ray.yml.j2 +9 -1
  385. sky/templates/scp-ray.yml.j2 +3 -50
  386. sky/templates/seeweb-ray.yml.j2 +108 -0
  387. sky/templates/shadeform-ray.yml.j2 +72 -0
  388. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  389. sky/templates/websocket_proxy.py +178 -18
  390. sky/usage/usage_lib.py +18 -11
  391. sky/users/__init__.py +0 -0
  392. sky/users/model.conf +15 -0
  393. sky/users/permission.py +387 -0
  394. sky/users/rbac.py +121 -0
  395. sky/users/server.py +720 -0
  396. sky/users/token_service.py +218 -0
  397. sky/utils/accelerator_registry.py +34 -5
  398. sky/utils/admin_policy_utils.py +84 -38
  399. sky/utils/annotations.py +16 -5
  400. sky/utils/asyncio_utils.py +78 -0
  401. sky/utils/auth_utils.py +153 -0
  402. sky/utils/benchmark_utils.py +60 -0
  403. sky/utils/cli_utils/status_utils.py +159 -86
  404. sky/utils/cluster_utils.py +31 -9
  405. sky/utils/command_runner.py +354 -68
  406. sky/utils/command_runner.pyi +93 -3
  407. sky/utils/common.py +35 -8
  408. sky/utils/common_utils.py +310 -87
  409. sky/utils/config_utils.py +87 -5
  410. sky/utils/context.py +402 -0
  411. sky/utils/context_utils.py +222 -0
  412. sky/utils/controller_utils.py +264 -89
  413. sky/utils/dag_utils.py +31 -12
  414. sky/utils/db/__init__.py +0 -0
  415. sky/utils/db/db_utils.py +470 -0
  416. sky/utils/db/migration_utils.py +133 -0
  417. sky/utils/directory_utils.py +12 -0
  418. sky/utils/env_options.py +13 -0
  419. sky/utils/git.py +567 -0
  420. sky/utils/git_clone.sh +460 -0
  421. sky/utils/infra_utils.py +195 -0
  422. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  423. sky/utils/kubernetes/config_map_utils.py +133 -0
  424. sky/utils/kubernetes/create_cluster.sh +13 -27
  425. sky/utils/kubernetes/delete_cluster.sh +10 -7
  426. sky/utils/kubernetes/deploy_remote_cluster.py +1299 -0
  427. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  428. sky/utils/kubernetes/generate_kind_config.py +6 -66
  429. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  430. sky/utils/kubernetes/gpu_labeler.py +5 -5
  431. sky/utils/kubernetes/kubernetes_deploy_utils.py +354 -47
  432. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  433. sky/utils/kubernetes/ssh_utils.py +221 -0
  434. sky/utils/kubernetes_enums.py +8 -15
  435. sky/utils/lock_events.py +94 -0
  436. sky/utils/locks.py +368 -0
  437. sky/utils/log_utils.py +300 -6
  438. sky/utils/perf_utils.py +22 -0
  439. sky/utils/resource_checker.py +298 -0
  440. sky/utils/resources_utils.py +249 -32
  441. sky/utils/rich_utils.py +213 -37
  442. sky/utils/schemas.py +905 -147
  443. sky/utils/serialize_utils.py +16 -0
  444. sky/utils/status_lib.py +10 -0
  445. sky/utils/subprocess_utils.py +38 -15
  446. sky/utils/tempstore.py +70 -0
  447. sky/utils/timeline.py +24 -52
  448. sky/utils/ux_utils.py +84 -15
  449. sky/utils/validator.py +11 -1
  450. sky/utils/volume.py +86 -0
  451. sky/utils/yaml_utils.py +111 -0
  452. sky/volumes/__init__.py +13 -0
  453. sky/volumes/client/__init__.py +0 -0
  454. sky/volumes/client/sdk.py +149 -0
  455. sky/volumes/server/__init__.py +0 -0
  456. sky/volumes/server/core.py +258 -0
  457. sky/volumes/server/server.py +122 -0
  458. sky/volumes/volume.py +212 -0
  459. sky/workspaces/__init__.py +0 -0
  460. sky/workspaces/core.py +655 -0
  461. sky/workspaces/server.py +101 -0
  462. sky/workspaces/utils.py +56 -0
  463. skypilot_nightly-1.0.0.dev20251107.dist-info/METADATA +675 -0
  464. skypilot_nightly-1.0.0.dev20251107.dist-info/RECORD +594 -0
  465. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/WHEEL +1 -1
  466. sky/benchmark/benchmark_state.py +0 -256
  467. sky/benchmark/benchmark_utils.py +0 -641
  468. sky/clouds/service_catalog/constants.py +0 -7
  469. sky/dashboard/out/_next/static/LksQgChY5izXjokL3LcEu/_buildManifest.js +0 -1
  470. sky/dashboard/out/_next/static/chunks/236-f49500b82ad5392d.js +0 -6
  471. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  472. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  473. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  474. sky/dashboard/out/_next/static/chunks/845-0f8017370869e269.js +0 -1
  475. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  476. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  477. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  478. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  479. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  480. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  481. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  482. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-e15db85d0ea1fbe1.js +0 -1
  483. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  484. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  485. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  486. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-03f279c6741fb48b.js +0 -1
  487. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  488. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  489. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  490. sky/jobs/dashboard/dashboard.py +0 -223
  491. sky/jobs/dashboard/static/favicon.ico +0 -0
  492. sky/jobs/dashboard/templates/index.html +0 -831
  493. sky/jobs/server/dashboard_utils.py +0 -69
  494. sky/skylet/providers/scp/__init__.py +0 -2
  495. sky/skylet/providers/scp/config.py +0 -149
  496. sky/skylet/providers/scp/node_provider.py +0 -578
  497. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  498. sky/utils/db_utils.py +0 -100
  499. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  500. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  501. skypilot_nightly-1.0.0.dev20250509.dist-info/METADATA +0 -361
  502. skypilot_nightly-1.0.0.dev20250509.dist-info/RECORD +0 -396
  503. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  504. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  505. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  506. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  507. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  508. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  509. /sky/dashboard/out/_next/static/{LksQgChY5izXjokL3LcEu → zB0ed6ge_W1MDszVHhijS}/_ssgManifest.js +0 -0
  510. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/entry_points.txt +0 -0
  511. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/licenses/LICENSE +0 -0
  512. {skypilot_nightly-1.0.0.dev20250509.dist-info → skypilot_nightly-1.0.0.dev20251107.dist-info}/top_level.txt +0 -0
sky/clouds/kubernetes.py CHANGED
@@ -1,35 +1,40 @@
1
1
  """Kubernetes."""
2
+ import concurrent.futures
2
3
  import os
3
4
  import re
4
- import typing
5
+ import subprocess
6
+ import tempfile
5
7
  from typing import Dict, Iterator, List, Optional, Set, Tuple, Union
6
8
 
9
+ import colorama
10
+
11
+ from sky import catalog
7
12
  from sky import clouds
8
13
  from sky import exceptions
14
+ from sky import resources as resources_lib
9
15
  from sky import sky_logging
10
16
  from sky import skypilot_config
11
17
  from sky.adaptors import kubernetes
12
- from sky.clouds import service_catalog
18
+ from sky.clouds.utils import gcp_utils
13
19
  from sky.provision import instance_setup
20
+ from sky.provision.gcp import constants as gcp_constants
14
21
  from sky.provision.kubernetes import network_utils
15
22
  from sky.provision.kubernetes import utils as kubernetes_utils
23
+ from sky.provision.kubernetes.utils import is_tpu_on_gke
24
+ from sky.provision.kubernetes.utils import KubernetesHighPerformanceNetworkType
25
+ from sky.provision.kubernetes.utils import normalize_tpu_accelerator_name
16
26
  from sky.skylet import constants
17
27
  from sky.utils import annotations
18
28
  from sky.utils import common_utils
29
+ from sky.utils import env_options
30
+ from sky.utils import kubernetes_enums
19
31
  from sky.utils import registry
20
32
  from sky.utils import resources_utils
21
33
  from sky.utils import schemas
22
-
23
- if typing.TYPE_CHECKING:
24
- # Renaming to avoid shadowing variables.
25
- from sky import resources as resources_lib
34
+ from sky.utils import volume as volume_lib
26
35
 
27
36
  logger = sky_logging.init_logger(__name__)
28
37
 
29
- # Check if KUBECONFIG is set, and use it if it is.
30
- DEFAULT_KUBECONFIG_PATH = '~/.kube/config'
31
- CREDENTIAL_PATH = os.environ.get('KUBECONFIG', DEFAULT_KUBECONFIG_PATH)
32
-
33
38
  # Namespace for SkyPilot resources shared across multiple tenants on the
34
39
  # same cluster (even if they might be running in different namespaces).
35
40
  # E.g., FUSE device manager daemonset is run in this namespace.
@@ -44,9 +49,6 @@ _FUSERMOUNT_SHARED_DIR = '/var/run/fusermount'
44
49
  class Kubernetes(clouds.Cloud):
45
50
  """Kubernetes."""
46
51
 
47
- SKY_SSH_KEY_SECRET_NAME = 'sky-ssh-keys'
48
- SKY_SSH_JUMP_NAME = 'sky-ssh-jump-pod'
49
-
50
52
  # Limit the length of the cluster name to avoid exceeding the limit of 63
51
53
  # characters for Kubernetes resources. We limit to 42 characters (63-21) to
52
54
  # allow additional characters for creating ingress services to expose ports.
@@ -54,9 +56,12 @@ class Kubernetes(clouds.Cloud):
54
56
  # where the suffix is 21 characters long.
55
57
  _MAX_CLUSTER_NAME_LEN_LIMIT = 42
56
58
 
59
+ _MAX_VOLUME_NAME_LEN_LIMIT = 253
60
+
57
61
  _SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = True
58
62
 
59
63
  _DEFAULT_NUM_VCPUS = 2
64
+ _DEFAULT_NUM_VCPUS_WITH_GPU = 4
60
65
  _DEFAULT_MEMORY_CPU_RATIO = 1
61
66
  _DEFAULT_MEMORY_CPU_RATIO_WITH_GPU = 4 # Allocate more memory for GPU tasks
62
67
  _REPR = 'Kubernetes'
@@ -73,6 +78,12 @@ class Kubernetes(clouds.Cloud):
73
78
  'tiers are not '
74
79
  'supported in '
75
80
  'Kubernetes.',
81
+ clouds.CloudImplementationFeatures.CUSTOM_MULTI_NETWORK:
82
+ ('Customized multiple network interfaces are not supported in '
83
+ 'Kubernetes.'),
84
+ clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER:
85
+ ('Custom network tier is not supported in this Kubernetes '
86
+ 'cluster.'),
76
87
  }
77
88
 
78
89
  IMAGE_CPU = 'skypilot:custom-cpu-ubuntu-2004'
@@ -86,47 +97,52 @@ class Kubernetes(clouds.Cloud):
86
97
  # Set of contexts that has logged as temporarily unreachable
87
98
  logged_unreachable_contexts: Set[str] = set()
88
99
 
89
- @property
90
- def ssh_key_secret_field_name(self):
91
- # Use a fresh user hash to avoid conflicts in the secret object naming.
92
- # This can happen when the controller is reusing the same user hash
93
- # through USER_ID_ENV_VAR but has a different SSH key.
94
- fresh_user_hash = common_utils.generate_user_hash()
95
- return f'ssh-publickey-{fresh_user_hash}'
96
-
97
100
  @classmethod
98
101
  def _unsupported_features_for_resources(
99
- cls, resources: 'resources_lib.Resources'
102
+ cls,
103
+ resources: 'resources_lib.Resources',
104
+ region: Optional[str] = None,
100
105
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
101
106
  # TODO(aylei): features need to be regional (per context) to make
102
107
  # multi-kubernetes selection/failover work.
103
108
  unsupported_features = cls._CLOUD_UNSUPPORTED_FEATURES.copy()
104
- context = resources.region
109
+ context = region if region is not None else resources.region
105
110
  if context is None:
106
- context = kubernetes_utils.get_current_kube_config_context_name()
107
- # Features to be disabled for exec auth
108
- is_exec_auth, message = kubernetes_utils.is_kubeconfig_exec_auth(
109
- context)
110
- if is_exec_auth:
111
- assert isinstance(message, str), message
112
- # Controllers cannot spin up new pods with exec auth.
113
- unsupported_features[
114
- clouds.CloudImplementationFeatures.HOST_CONTROLLERS] = message
115
- # Pod does not have permissions to down itself with exec auth.
116
- unsupported_features[
117
- clouds.CloudImplementationFeatures.AUTODOWN] = message
111
+ contexts = cls.existing_allowed_contexts()
112
+ else:
113
+ contexts = [context]
118
114
  unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
119
115
  'Stopping clusters is not supported on Kubernetes.')
120
116
  unsupported_features[clouds.CloudImplementationFeatures.AUTOSTOP] = (
121
117
  'Auto-stop is not supported on Kubernetes.')
122
- # Allow spot instances if supported by the cluster
123
- try:
124
- spot_label_key, _ = kubernetes_utils.get_spot_label(context)
125
- if spot_label_key is not None:
126
- unsupported_features.pop(
127
- clouds.CloudImplementationFeatures.SPOT_INSTANCE, None)
128
- except exceptions.KubeAPIUnreachableError as e:
129
- cls._log_unreachable_context(context, str(e))
118
+ for context in contexts:
119
+ # Allow spot instances if supported by the cluster
120
+ try:
121
+ # Run spot label check and network type detection concurrently
122
+ # as they are independent operations
123
+ with concurrent.futures.ThreadPoolExecutor(
124
+ max_workers=2) as executor:
125
+ spot_future = executor.submit(
126
+ kubernetes_utils.get_spot_label, context)
127
+ network_future = executor.submit(cls._detect_network_type,
128
+ context,
129
+ resources.network_tier)
130
+
131
+ spot_label_key, _ = spot_future.result()
132
+ if spot_label_key is not None:
133
+ unsupported_features.pop(
134
+ clouds.CloudImplementationFeatures.SPOT_INSTANCE,
135
+ None)
136
+
137
+ # Allow custom network tier if supported by the cluster
138
+ # (e.g., Nebius clusters with high performance networking)
139
+ network_type, _ = network_future.result()
140
+ if network_type.supports_high_performance_networking():
141
+ unsupported_features.pop(
142
+ clouds.CloudImplementationFeatures.
143
+ CUSTOM_NETWORK_TIER, None)
144
+ except exceptions.KubeAPIUnreachableError as e:
145
+ cls._log_unreachable_context(context, str(e))
130
146
  return unsupported_features
131
147
 
132
148
  @classmethod
@@ -149,7 +165,7 @@ class Kubernetes(clouds.Cloud):
149
165
  'Ignoring these contexts.')
150
166
 
151
167
  @classmethod
152
- def existing_allowed_contexts(cls) -> List[str]:
168
+ def existing_allowed_contexts(cls, silent: bool = False) -> List[str]:
153
169
  """Get existing allowed contexts.
154
170
 
155
171
  If None is returned in the list, it means that we are running in a pod
@@ -162,15 +178,35 @@ class Kubernetes(clouds.Cloud):
162
178
 
163
179
  all_contexts = set(all_contexts)
164
180
 
165
- allowed_contexts = skypilot_config.get_nested(
166
- ('kubernetes', 'allowed_contexts'), None)
181
+ # Allowed_contexts specified for workspace should take precedence over
182
+ # the global allowed_contexts.
183
+ allowed_contexts = skypilot_config.get_workspace_cloud(
184
+ 'kubernetes').get('allowed_contexts', None)
185
+ if allowed_contexts is None:
186
+ allowed_contexts = skypilot_config.get_effective_region_config(
187
+ cloud='kubernetes',
188
+ region=None,
189
+ keys=('allowed_contexts',),
190
+ default_value=None)
191
+
192
+ # Exclude contexts starting with `ssh-`
193
+ # TODO(romilb): Remove when SSH Node Pools use a separate kubeconfig.
194
+ all_contexts = [
195
+ ctx for ctx in all_contexts if not ctx.startswith('ssh-')
196
+ ]
197
+
198
+ allow_all_contexts = allowed_contexts == 'all' or (
199
+ allowed_contexts is None and
200
+ env_options.Options.ALLOW_ALL_KUBERNETES_CONTEXTS.get())
201
+ if allow_all_contexts:
202
+ allowed_contexts = all_contexts
167
203
 
168
204
  if allowed_contexts is None:
169
205
  # Try kubeconfig if present
170
206
  current_context = (
171
207
  kubernetes_utils.get_current_kube_config_context_name())
172
- if (current_context is None and
173
- kubernetes_utils.is_incluster_config_available()):
208
+ if ((current_context is None or current_context.startswith('ssh-'))
209
+ and kubernetes_utils.is_incluster_config_available()):
174
210
  # If no kubeconfig contexts found, use in-cluster if available
175
211
  current_context = kubernetes.in_cluster_context_name()
176
212
  allowed_contexts = []
@@ -183,8 +219,12 @@ class Kubernetes(clouds.Cloud):
183
219
  if context in all_contexts:
184
220
  existing_contexts.append(context)
185
221
  else:
222
+ # Skip SSH Node Pool contexts
223
+ if context.startswith('ssh-'):
224
+ continue
186
225
  skipped_contexts.append(context)
187
- cls._log_skipped_contexts_once(tuple(skipped_contexts))
226
+ if not silent:
227
+ cls._log_skipped_contexts_once(tuple(skipped_contexts))
188
228
  return existing_contexts
189
229
 
190
230
  @classmethod
@@ -218,10 +258,15 @@ class Kubernetes(clouds.Cloud):
218
258
  'refresh Kubernetes availability if permanent.')
219
259
 
220
260
  @classmethod
221
- def regions_with_offering(cls, instance_type: Optional[str],
222
- accelerators: Optional[Dict[str, int]],
223
- use_spot: bool, region: Optional[str],
224
- zone: Optional[str]) -> List[clouds.Region]:
261
+ def regions_with_offering(
262
+ cls,
263
+ instance_type: Optional[str],
264
+ accelerators: Optional[Dict[str, int]],
265
+ use_spot: bool,
266
+ region: Optional[str],
267
+ zone: Optional[str],
268
+ resources: Optional['resources_lib.Resources'] = None,
269
+ ) -> List[clouds.Region]:
225
270
  del accelerators, zone, use_spot # unused
226
271
  existing_contexts = cls.existing_allowed_contexts()
227
272
 
@@ -231,6 +276,19 @@ class Kubernetes(clouds.Cloud):
231
276
 
232
277
  if region is not None:
233
278
  regions = [r for r in regions if r.name == region]
279
+ if resources is not None:
280
+ filtered_regions = []
281
+ resources_required_features = resources.get_required_cloud_features(
282
+ )
283
+ for r in regions:
284
+ try:
285
+ cls.check_features_are_supported(
286
+ resources, resources_required_features, r.name)
287
+ filtered_regions.append(r)
288
+ except exceptions.NotSupportedError as e:
289
+ logger.info(f'Filter out context: {r.name}, reason: {e}')
290
+ continue
291
+ regions = filtered_regions
234
292
 
235
293
  # Check if requested instance type will fit in the cluster.
236
294
  # TODO(zhwu,romilb): autoscaler type needs to be regional (per
@@ -238,22 +296,6 @@ class Kubernetes(clouds.Cloud):
238
296
  if instance_type is None:
239
297
  return regions
240
298
 
241
- autoscaler_type = kubernetes_utils.get_autoscaler_type()
242
- if (autoscaler_type is not None and not kubernetes_utils.get_autoscaler(
243
- autoscaler_type).can_query_backend):
244
- # Unsupported autoscaler type. Rely on the autoscaler to
245
- # provision the right instance type without running checks.
246
- # Worst case, if autoscaling fails, the pod will be stuck in
247
- # pending state until provision_timeout, after which failover
248
- # will be triggered.
249
- #
250
- # Removing this if statement produces the same behavior,
251
- # because can_create_new_instance_of_type() always returns True
252
- # for unsupported autoscaler types.
253
- # This check is here as a performance optimization to avoid
254
- # further code executions that is known to return this result.
255
- return regions
256
-
257
299
  regions_to_return = []
258
300
  for r in regions:
259
301
  context = r.name
@@ -270,9 +312,34 @@ class Kubernetes(clouds.Cloud):
270
312
  'not fit in the existing Kubernetes cluster '
271
313
  'with context: '
272
314
  f'{context}. Reason: {reason}')
315
+
316
+ autoscaler_type = skypilot_config.get_effective_region_config(
317
+ cloud='kubernetes',
318
+ region=context,
319
+ keys=('autoscaler',),
320
+ default_value=None)
321
+ if (autoscaler_type is not None and
322
+ not kubernetes_utils.get_autoscaler(
323
+ kubernetes_enums.KubernetesAutoscalerType(
324
+ autoscaler_type)).can_query_backend):
325
+ # Unsupported autoscaler type. Rely on the autoscaler to
326
+ # provision the right instance type without running checks.
327
+ # Worst case, if autoscaling fails, the pod will be stuck in
328
+ # pending state until provision_timeout, after which failover
329
+ # will be triggered.
330
+ #
331
+ # Removing this if statement produces the same behavior,
332
+ # because can_create_new_instance_of_type() always returns True
333
+ # for unsupported autoscaler types.
334
+ # This check is here as a performance optimization to avoid
335
+ # further code executions that is known to return this result.
336
+ regions_to_return.append(r)
337
+ continue
338
+
273
339
  if autoscaler_type is None:
274
340
  continue
275
- autoscaler = kubernetes_utils.get_autoscaler(autoscaler_type)
341
+ autoscaler = kubernetes_utils.get_autoscaler(
342
+ kubernetes_enums.KubernetesAutoscalerType(autoscaler_type))
276
343
  logger.debug(f'{context} has autoscaler of type: {autoscaler_type}')
277
344
  if autoscaler.can_create_new_instance_of_type(
278
345
  context, instance_type):
@@ -312,10 +379,12 @@ class Kubernetes(clouds.Cloud):
312
379
  cls,
313
380
  cpus: Optional[str] = None,
314
381
  memory: Optional[str] = None,
315
- disk_tier: Optional['resources_utils.DiskTier'] = None) -> str:
382
+ disk_tier: Optional['resources_utils.DiskTier'] = None,
383
+ region: Optional[str] = None,
384
+ zone: Optional[str] = None) -> str:
316
385
  # TODO(romilb): In the future, we may want to move the instance type
317
386
  # selection + availability checking to a kubernetes_catalog module.
318
- del disk_tier # Unused.
387
+ del disk_tier, region, zone # Unused.
319
388
  # We strip '+' from resource requests since Kubernetes can provision
320
389
  # exactly the requested resources.
321
390
  instance_cpus = float(
@@ -379,7 +448,11 @@ class Kubernetes(clouds.Cloud):
379
448
  return 0
380
449
 
381
450
  @staticmethod
382
- def _calculate_provision_timeout(num_nodes: int) -> int:
451
+ def _calculate_provision_timeout(
452
+ num_nodes: int,
453
+ volume_mounts: Optional[List['volume_lib.VolumeMount']],
454
+ enable_flex_start: bool,
455
+ ) -> int:
383
456
  """Calculate provision timeout based on number of nodes.
384
457
 
385
458
  The timeout scales linearly with the number of nodes to account for
@@ -387,6 +460,8 @@ class Kubernetes(clouds.Cloud):
387
460
 
388
461
  Args:
389
462
  num_nodes: Number of nodes being provisioned
463
+ volume_mounts: Volume mounts for the pod
464
+ enable_flex_start: Whether flex start is enabled
390
465
 
391
466
  Returns:
392
467
  Timeout in seconds
@@ -394,19 +469,38 @@ class Kubernetes(clouds.Cloud):
394
469
  base_timeout = 10 # Base timeout for single node
395
470
  per_node_timeout = 0.2 # Additional seconds per node
396
471
  max_timeout = 60 # Cap at 1 minute
472
+ if enable_flex_start:
473
+ # Flex start takes longer to provision.
474
+ base_timeout = 1200
475
+ per_node_timeout = 10
476
+ max_timeout = 2400
477
+ elif volume_mounts is not None:
478
+ for volume_mount in volume_mounts:
479
+ if (volume_mount.volume_config.type ==
480
+ volume_lib.VolumeType.PVC.value):
481
+ if (volume_mount.volume_config.config.get(
482
+ 'access_mode', '') ==
483
+ volume_lib.VolumeAccessMode.READ_WRITE_MANY.value):
484
+ # GKE may take several minutes to provision a PV
485
+ # supporting READ_WRITE_MANY with filestore.
486
+ base_timeout = 180
487
+ max_timeout = 240
488
+ break
397
489
 
398
490
  return int(
399
491
  min(base_timeout + (per_node_timeout * (num_nodes - 1)),
400
492
  max_timeout))
401
493
 
402
494
  def make_deploy_resources_variables(
403
- self,
404
- resources: 'resources_lib.Resources',
405
- cluster_name: 'resources_utils.ClusterName',
406
- region: Optional['clouds.Region'],
407
- zones: Optional[List['clouds.Zone']],
408
- num_nodes: int,
409
- dryrun: bool = False) -> Dict[str, Optional[str]]:
495
+ self,
496
+ resources: 'resources_lib.Resources',
497
+ cluster_name: 'resources_utils.ClusterName',
498
+ region: Optional['clouds.Region'],
499
+ zones: Optional[List['clouds.Zone']],
500
+ num_nodes: int,
501
+ dryrun: bool = False,
502
+ volume_mounts: Optional[List['volume_lib.VolumeMount']] = None,
503
+ ) -> Dict[str, Optional[str]]:
410
504
  del cluster_name, zones, dryrun # Unused.
411
505
  if region is None:
412
506
  context = kubernetes_utils.get_current_kube_config_context_name()
@@ -414,8 +508,9 @@ class Kubernetes(clouds.Cloud):
414
508
  context = region.name
415
509
  assert context is not None, 'No context found in kubeconfig'
416
510
 
417
- r = resources
418
- acc_dict = self.get_accelerators_from_instance_type(r.instance_type)
511
+ resources = resources.assert_launchable()
512
+ acc_dict = self.get_accelerators_from_instance_type(
513
+ resources.instance_type)
419
514
  custom_resources = resources_utils.make_ray_custom_resources_str(
420
515
  acc_dict)
421
516
 
@@ -426,8 +521,12 @@ class Kubernetes(clouds.Cloud):
426
521
  cpus = k.cpus
427
522
  mem = k.memory
428
523
  # Optionally populate accelerator information.
429
- acc_count = k.accelerator_count if k.accelerator_count else 0
430
- acc_type = k.accelerator_type if k.accelerator_type else None
524
+ acc_type = k.accelerator_type
525
+ acc_count = k.accelerator_count
526
+ if acc_type is not None and is_tpu_on_gke(acc_type):
527
+ acc_type, acc_count = normalize_tpu_accelerator_name(acc_type)
528
+ else:
529
+ acc_count = acc_count or 0
431
530
 
432
531
  def _get_image_id(resources: 'resources_lib.Resources') -> str:
433
532
  image_id_dict = resources.image_id
@@ -444,15 +543,18 @@ class Kubernetes(clouds.Cloud):
444
543
  # Select image based on whether we are using GPUs or not.
445
544
  image_id = self.IMAGE_GPU if acc_count > 0 else self.IMAGE_CPU
446
545
  # Get the container image ID from the service catalog.
447
- image_id = service_catalog.get_image_id_from_tag(
448
- image_id, clouds='kubernetes')
546
+ image_id = catalog.get_image_id_from_tag(image_id,
547
+ clouds='kubernetes')
449
548
  return image_id
450
549
 
451
550
  image_id = _get_image_id(resources)
452
- # TODO(romilb): Create a lightweight image for SSH jump host
453
- ssh_jump_image = service_catalog.get_image_id_from_tag(
454
- self.IMAGE_CPU, clouds='kubernetes')
455
551
 
552
+ # Set environment variables for the pod. Note that SkyPilot env vars
553
+ # are set separately when the task is run. These env vars are
554
+ # independent of the SkyPilot task to be run.
555
+ k8s_env_vars = {kubernetes.IN_CLUSTER_CONTEXT_NAME_ENV_VAR: context}
556
+
557
+ # Setup GPU/TPU labels and resource keys.
456
558
  k8s_acc_label_key = None
457
559
  k8s_acc_label_values = None
458
560
  k8s_topology_label_key = None
@@ -472,17 +574,31 @@ class Kubernetes(clouds.Cloud):
472
574
  tpu_requested = True
473
575
  k8s_resource_key = kubernetes_utils.TPU_RESOURCE_KEY
474
576
  else:
475
- k8s_resource_key = kubernetes_utils.get_gpu_resource_key()
577
+ k8s_resource_key = kubernetes_utils.get_gpu_resource_key(
578
+ context)
476
579
  else:
580
+ # If no GPUs are requested, we set NVIDIA_VISIBLE_DEVICES=none to
581
+ # maintain GPU isolation. This is to override the default behavior
582
+ # of Nvidia device plugin which would expose all GPUs to the pod
583
+ # when no GPUs are requested.
584
+ # Note that NVIDIA_VISIBLE_DEVICES is different from
585
+ # CUDA_VISIBLE_DEVICES - the latter is used to control which GPUs
586
+ # are visible to the application and is set inside the pod, while
587
+ # the former is used to control which GPUs are visible to the pod
588
+ # through the nvidia runtime.
589
+ # See: https://github.com/NVIDIA/k8s-device-plugin/issues/61
590
+ k8s_env_vars['NVIDIA_VISIBLE_DEVICES'] = 'none'
477
591
  avoid_label_keys = kubernetes_utils.get_accelerator_label_keys(
478
592
  context)
479
593
  if len(avoid_label_keys) == 0:
480
594
  avoid_label_keys = None
481
- port_mode = network_utils.get_port_mode(None)
595
+ port_mode = network_utils.get_port_mode(None, context)
482
596
 
483
- remote_identity = skypilot_config.get_nested(
484
- ('kubernetes', 'remote_identity'),
485
- schemas.get_default_remote_identity('kubernetes'))
597
+ remote_identity = skypilot_config.get_effective_region_config(
598
+ cloud='kubernetes',
599
+ region=context,
600
+ keys=('remote_identity',),
601
+ default_value=schemas.get_default_remote_identity('kubernetes'))
486
602
 
487
603
  if isinstance(remote_identity, dict):
488
604
  # If remote_identity is a dict, use the service account for the
@@ -496,20 +612,17 @@ class Kubernetes(clouds.Cloud):
496
612
  # If remote_identity is not a dict, use
497
613
  k8s_service_account_name = remote_identity
498
614
 
499
- if (k8s_service_account_name ==
500
- schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value):
501
- # SA name doesn't matter since automounting credentials is disabled
502
- k8s_service_account_name = 'default'
503
- k8s_automount_sa_token = 'false'
504
- elif (k8s_service_account_name ==
505
- schemas.RemoteIdentityOptions.SERVICE_ACCOUNT.value):
506
- # Use the default service account
615
+ lc = schemas.RemoteIdentityOptions.LOCAL_CREDENTIALS.value
616
+ sa = schemas.RemoteIdentityOptions.SERVICE_ACCOUNT.value
617
+
618
+ if k8s_service_account_name == lc or k8s_service_account_name == sa:
619
+ # Use the default service account if remote identity is not set.
620
+ # For LOCAL_CREDENTIALS, this is for in-cluster authentication
621
+ # which needs a serviceaccount (specifically for SSH node pools
622
+ # which uses in-cluster authentication internally, and we would
623
+ # like to support exec-auth when the user is also using SSH infra)
507
624
  k8s_service_account_name = (
508
625
  kubernetes_utils.DEFAULT_SERVICE_ACCOUNT_NAME)
509
- k8s_automount_sa_token = 'true'
510
- else:
511
- # User specified a custom service account
512
- k8s_automount_sa_token = 'true'
513
626
 
514
627
  fuse_device_required = bool(resources.requires_fuse)
515
628
 
@@ -518,26 +631,22 @@ class Kubernetes(clouds.Cloud):
518
631
  if resources.use_spot:
519
632
  spot_label_key, spot_label_value = kubernetes_utils.get_spot_label()
520
633
 
521
- # Timeout for resource provisioning. This timeout determines how long to
522
- # wait for pod to be in pending status before giving up.
523
- # Larger timeout may be required for autoscaling clusters, since
524
- # autoscaler may take some time to provision new nodes.
525
- # Note that this timeout includes time taken by the Kubernetes scheduler
526
- # itself, which can be upto 2-3 seconds, and up to 10-15 seconds when
527
- # scheduling 100s of pods.
528
- # We use a linear scaling formula to determine the timeout based on the
529
- # number of nodes.
530
-
531
- timeout = self._calculate_provision_timeout(num_nodes)
532
- timeout = skypilot_config.get_nested(
533
- ('kubernetes', 'provision_timeout'),
534
- timeout,
535
- override_configs=resources.cluster_config_overrides)
536
-
537
- # Set environment variables for the pod. Note that SkyPilot env vars
538
- # are set separately when the task is run. These env vars are
539
- # independent of the SkyPilot task to be run.
540
- k8s_env_vars = {kubernetes.IN_CLUSTER_CONTEXT_NAME_ENV_VAR: context}
634
+ network_type, machine_type = self._detect_network_type(
635
+ context, resources.network_tier)
636
+
637
+ # Check if this cluster supports high performance networking and
638
+ # configure appropriate settings for different cluster types
639
+ if (resources.network_tier is not None and
640
+ resources.network_tier == resources_utils.NetworkTier.BEST):
641
+ # Only proceed if CUSTOM_NETWORK_TIER is supported by this cluster
642
+ unsupported_features = self._unsupported_features_for_resources(
643
+ resources)
644
+ if clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER \
645
+ not in unsupported_features:
646
+ # Add high-performance networking environment variables for
647
+ # clusters with high performance networking
648
+ network_env_vars = network_type.get_network_env_vars()
649
+ k8s_env_vars.update(network_env_vars)
541
650
 
542
651
  # We specify object-store-memory to be 500MB to avoid taking up too
543
652
  # much memory on the head node. 'num-cpus' should be set to limit
@@ -551,9 +660,57 @@ class Kubernetes(clouds.Cloud):
551
660
  }
552
661
 
553
662
  # Get the storage class name for high availability controller's PVC
554
- k8s_ha_storage_class_name = skypilot_config.get_nested(
555
- ('kubernetes', 'high_availability', 'storage_class_name'),
556
- None,
663
+ k8s_ha_storage_class_name = (
664
+ skypilot_config.get_effective_region_config(
665
+ cloud='kubernetes',
666
+ region=context,
667
+ keys=('high_availability', 'storage_class_name'),
668
+ default_value=None))
669
+
670
+ k8s_kueue_local_queue_name = (
671
+ skypilot_config.get_effective_region_config(
672
+ cloud='kubernetes',
673
+ region=context,
674
+ keys=('kueue', 'local_queue_name'),
675
+ default_value=None,
676
+ override_configs=resources.cluster_config_overrides))
677
+
678
+ # Check DWS configuration for GKE.
679
+ (enable_flex_start, enable_flex_start_queued_provisioning,
680
+ max_run_duration_seconds) = gcp_utils.get_dws_config(
681
+ context, k8s_kueue_local_queue_name,
682
+ resources.cluster_config_overrides)
683
+ if enable_flex_start_queued_provisioning or enable_flex_start:
684
+ # DWS is only supported in GKE, check the autoscaler type.
685
+ autoscaler_type = skypilot_config.get_effective_region_config(
686
+ cloud='kubernetes',
687
+ region=context,
688
+ keys=('autoscaler',),
689
+ default_value=None)
690
+ if (autoscaler_type !=
691
+ kubernetes_enums.KubernetesAutoscalerType.GKE.value):
692
+ raise ValueError(
693
+ f'DWS is only supported in GKE, but the autoscaler type '
694
+ f'for context {context} is {autoscaler_type}')
695
+
696
+ # Timeout for resource provisioning. This timeout determines how long to
697
+ # wait for pod to be in pending status before giving up.
698
+ # Larger timeout may be required for autoscaling clusters, since
699
+ # autoscaler may take some time to provision new nodes.
700
+ # Note that this timeout includes time taken by the Kubernetes scheduler
701
+ # itself, which can be upto 2-3 seconds, and up to 10-15 seconds when
702
+ # scheduling 100s of pods.
703
+ # We use a linear scaling formula to determine the timeout based on the
704
+ # number of nodes.
705
+
706
+ timeout = self._calculate_provision_timeout(
707
+ num_nodes, volume_mounts, enable_flex_start or
708
+ enable_flex_start_queued_provisioning)
709
+ timeout = skypilot_config.get_effective_region_config(
710
+ cloud='kubernetes',
711
+ region=context,
712
+ keys=('provision_timeout',),
713
+ default_value=timeout,
557
714
  override_configs=resources.cluster_config_overrides)
558
715
 
559
716
  deploy_vars = {
@@ -564,15 +721,12 @@ class Kubernetes(clouds.Cloud):
564
721
  'accelerator_count': str(acc_count),
565
722
  'timeout': str(timeout),
566
723
  'k8s_port_mode': port_mode.value,
567
- 'k8s_networking_mode': network_utils.get_networking_mode().value,
568
- 'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME,
569
724
  'k8s_acc_label_key': k8s_acc_label_key,
570
725
  'k8s_acc_label_values': k8s_acc_label_values,
571
- 'k8s_ssh_jump_name': self.SKY_SSH_JUMP_NAME,
572
- 'k8s_ssh_jump_image': ssh_jump_image,
573
726
  'k8s_service_account_name': k8s_service_account_name,
574
- 'k8s_automount_sa_token': k8s_automount_sa_token,
727
+ 'k8s_automount_sa_token': 'true',
575
728
  'k8s_fuse_device_required': fuse_device_required,
729
+ 'k8s_kueue_local_queue_name': k8s_kueue_local_queue_name,
576
730
  # Namespace to run the fusermount-server daemonset in
577
731
  'k8s_skypilot_system_namespace': _SKYPILOT_SYSTEM_NAMESPACE,
578
732
  'k8s_fusermount_shared_dir': _FUSERMOUNT_SHARED_DIR,
@@ -600,9 +754,17 @@ class Kubernetes(clouds.Cloud):
600
754
  (constants.PERSISTENT_SETUP_SCRIPT_PATH),
601
755
  'k8s_high_availability_deployment_run_script_dir':
602
756
  (constants.PERSISTENT_RUN_SCRIPT_DIR),
757
+ 'k8s_high_availability_restarting_signal_file':
758
+ (constants.PERSISTENT_RUN_RESTARTING_SIGNAL_FILE),
759
+ 'ha_recovery_log_path':
760
+ constants.HA_PERSISTENT_RECOVERY_LOG_PATH.format(''),
761
+ 'sky_python_cmd': constants.SKY_PYTHON_CMD,
603
762
  'k8s_high_availability_storage_class_name':
604
763
  (k8s_ha_storage_class_name),
605
764
  'avoid_label_keys': avoid_label_keys,
765
+ 'k8s_enable_flex_start': enable_flex_start,
766
+ 'k8s_max_run_duration_seconds': max_run_duration_seconds,
767
+ 'k8s_network_type': network_type.value,
606
768
  }
607
769
 
608
770
  # Add kubecontext if it is set. It may be None if SkyPilot is running
@@ -613,13 +775,43 @@ class Kubernetes(clouds.Cloud):
613
775
  namespace = kubernetes_utils.get_kube_config_context_namespace(context)
614
776
  deploy_vars['k8s_namespace'] = namespace
615
777
 
778
+ # Add backward compatibility template variables for GPUDirect variants
779
+ deploy_vars['k8s_enable_gpudirect_tcpx'] = (
780
+ network_type == KubernetesHighPerformanceNetworkType.GCP_TCPX)
781
+ deploy_vars['k8s_enable_gpudirect_tcpxo'] = (
782
+ network_type == KubernetesHighPerformanceNetworkType.GCP_TCPXO)
783
+ rdma_enabled = (network_type ==
784
+ KubernetesHighPerformanceNetworkType.GCP_GPUDIRECT_RDMA)
785
+ deploy_vars['k8s_enable_gpudirect_rdma'] = rdma_enabled
786
+ if rdma_enabled and machine_type.startswith('a4'):
787
+ deploy_vars['k8s_enable_gpudirect_rdma_a4'] = True
788
+ else:
789
+ deploy_vars['k8s_enable_gpudirect_rdma_a4'] = False
790
+
791
+ deploy_vars['k8s_ipc_lock_capability'] = (
792
+ network_type.requires_ipc_lock_capability())
793
+
616
794
  return deploy_vars
617
795
 
796
+ @staticmethod
797
+ def _warn_on_disk_size(resources: 'resources_lib.Resources'):
798
+ if resources.disk_size != resources_lib.DEFAULT_DISK_SIZE_GB:
799
+ logger.info(f'{colorama.Style.DIM}Disk size {resources.disk_size} '
800
+ 'is not supported by Kubernetes. '
801
+ 'To add additional disk, use volumes.'
802
+ f'{colorama.Style.RESET_ALL}')
803
+ if resources.disk_tier is not None:
804
+ logger.info(f'{colorama.Style.DIM}Disk tier {resources.disk_tier} '
805
+ 'is not supported by Kubernetes. '
806
+ 'To add additional disk, use volumes.'
807
+ f'{colorama.Style.RESET_ALL}')
808
+
618
809
  def _get_feasible_launchable_resources(
619
810
  self, resources: 'resources_lib.Resources'
620
811
  ) -> 'resources_utils.FeasibleResources':
621
812
  # TODO(zhwu): This needs to be updated to return the correct region
622
813
  # (context) that has enough resources.
814
+ self._warn_on_disk_size(resources)
623
815
  fuzzy_candidate_list: List[str] = []
624
816
  if resources.instance_type is not None:
625
817
  assert resources.is_launchable(), resources
@@ -628,7 +820,8 @@ class Kubernetes(clouds.Cloud):
628
820
  accelerators=resources.accelerators,
629
821
  use_spot=resources.use_spot,
630
822
  region=resources.region,
631
- zone=resources.zone)
823
+ zone=resources.zone,
824
+ resources=resources)
632
825
  if not regions:
633
826
  return resources_utils.FeasibleResources([], [], None)
634
827
  resources = resources.copy(accelerators=None)
@@ -639,7 +832,7 @@ class Kubernetes(clouds.Cloud):
639
832
  resource_list = []
640
833
  for instance_type in instance_list:
641
834
  r = resources.copy(
642
- cloud=Kubernetes(),
835
+ cloud=self.__class__(),
643
836
  instance_type=instance_type,
644
837
  accelerators=None,
645
838
  )
@@ -652,7 +845,9 @@ class Kubernetes(clouds.Cloud):
652
845
  default_instance_type = Kubernetes.get_default_instance_type(
653
846
  cpus=resources.cpus,
654
847
  memory=resources.memory,
655
- disk_tier=resources.disk_tier)
848
+ disk_tier=resources.disk_tier,
849
+ region=resources.region,
850
+ zone=resources.zone)
656
851
 
657
852
  if accelerators is None:
658
853
  # For CPU only clusters, need no special handling
@@ -661,12 +856,18 @@ class Kubernetes(clouds.Cloud):
661
856
  assert len(accelerators) == 1, resources
662
857
  # GPUs requested - build instance type.
663
858
  acc_type, acc_count = list(accelerators.items())[0]
859
+ # If acc_type contains spaces, return empty list since Kubernetes
860
+ # does not support spaces in label values
861
+ if ' ' in acc_type:
862
+ return resources_utils.FeasibleResources([], [], None)
664
863
 
665
864
  # Parse into KubernetesInstanceType
666
865
  k8s_instance_type = (kubernetes_utils.KubernetesInstanceType.
667
866
  from_instance_type(default_instance_type))
668
867
 
669
868
  gpu_task_cpus = k8s_instance_type.cpus
869
+ if resources.cpus is None:
870
+ gpu_task_cpus = self._DEFAULT_NUM_VCPUS_WITH_GPU * acc_count
670
871
  # Special handling to bump up memory multiplier for GPU instances
671
872
  gpu_task_memory = (float(resources.memory.strip('+')) if
672
873
  resources.memory is not None else gpu_task_cpus *
@@ -680,7 +881,8 @@ class Kubernetes(clouds.Cloud):
680
881
  accelerators=None,
681
882
  use_spot=resources.use_spot,
682
883
  region=resources.region,
683
- zone=resources.zone)
884
+ zone=resources.zone,
885
+ resources=resources)
684
886
  if not available_regions:
685
887
  return resources_utils.FeasibleResources([], [], None)
686
888
  # No fuzzy lists for Kubernetes
@@ -691,10 +893,47 @@ class Kubernetes(clouds.Cloud):
691
893
  [], None)
692
894
 
693
895
  @classmethod
694
- def _check_compute_credentials(cls) -> Tuple[bool, Optional[str]]:
896
+ def _check_single_context(cls, context: str) -> Tuple[bool, str]:
897
+ """Check if the user has access credentials to a single SSH context."""
898
+
899
+ def _red_color(str_to_format: str) -> str:
900
+ return (f'{colorama.Fore.LIGHTRED_EX}'
901
+ f'{str_to_format}'
902
+ f'{colorama.Style.RESET_ALL}')
903
+
904
+ def _dim_color(str_to_format: str) -> str:
905
+ return (f'{colorama.Style.DIM}'
906
+ f'{str_to_format}'
907
+ f'{colorama.Style.RESET_ALL}')
908
+
909
+ def _bright_green_color(str_to_format: str) -> str:
910
+ return (f'{colorama.Fore.GREEN}'
911
+ f'{str_to_format}'
912
+ f'{colorama.Style.RESET_ALL}')
913
+
914
+ try:
915
+ check_result = kubernetes_utils.check_credentials(
916
+ context, run_optional_checks=True)
917
+ if check_result[0]:
918
+ if check_result[1] is not None:
919
+ return True, (_bright_green_color('enabled.') +
920
+ _dim_color(f' Note: {check_result[1]}'))
921
+ else:
922
+ return True, _bright_green_color('enabled.')
923
+ else:
924
+ assert check_result[1] is not None
925
+ return False, (_red_color('disabled.') +
926
+ _dim_color(f' Reason: {check_result[1]}'))
927
+ except Exception as e: # pylint: disable=broad-except
928
+ return False, _red_color(str(e))
929
+
930
+ @classmethod
931
+ def _check_compute_credentials(
932
+ cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
695
933
  """Checks if the user has access credentials to
696
934
  Kubernetes."""
697
935
  # Check for port forward dependencies
936
+ logger.debug(f'Checking compute credentials for {cls.canonical_name()}')
698
937
  reasons = kubernetes_utils.check_port_forward_mode_dependencies(False)
699
938
  if reasons is not None:
700
939
  formatted = '\n'.join(
@@ -718,26 +957,15 @@ class Kubernetes(clouds.Cloud):
718
957
  return (False, 'No available context found in kubeconfig. '
719
958
  'Check if you have a valid kubeconfig file' +
720
959
  check_skypilot_config_msg)
721
- reasons = []
722
- hints = []
960
+
961
+ ctx2text = {}
723
962
  success = False
724
963
  for context in existing_allowed_contexts:
725
- try:
726
- check_result = kubernetes_utils.check_credentials(
727
- context, run_optional_checks=True)
728
- if check_result[0]:
729
- success = True
730
- if check_result[1] is not None:
731
- hints.append(f'Context {context}: {check_result[1]}')
732
- else:
733
- reasons.append(f'Context {context}: {check_result[1]}')
734
- except Exception as e: # pylint: disable=broad-except
735
- return (False, f'Credential check failed for {context}: '
736
- f'{common_utils.format_exception(e)}')
737
- if success:
738
- return (True, cls._format_credential_check_results(hints, reasons))
739
- return (False, 'Failed to find available context with working '
740
- 'credentials. Details:\n' + '\n'.join(reasons))
964
+ suc, text = cls._check_single_context(context)
965
+ success = success or suc
966
+ ctx2text[context] = text
967
+
968
+ return success, ctx2text
741
969
 
742
970
  @classmethod
743
971
  def _format_credential_check_results(cls, hints: List[str],
@@ -768,10 +996,28 @@ class Kubernetes(clouds.Cloud):
768
996
  return ''.join(message_parts)
769
997
 
770
998
  def get_credential_file_mounts(self) -> Dict[str, str]:
771
- if os.path.exists(os.path.expanduser(CREDENTIAL_PATH)):
999
+ credential_paths = kubernetes_utils.get_kubeconfig_paths()
1000
+ if credential_paths:
1001
+ # For single kubeconfig path, keep the original path.
1002
+ kubeconfig_file = credential_paths[0]
1003
+ if len(credential_paths) > 1:
1004
+ # For multiple kubeconfig paths, merge them into a single file.
1005
+ # TODO(aylei): GC merged kubeconfig files.
1006
+ kubeconfig_file = tempfile.NamedTemporaryFile(
1007
+ prefix='merged-kubeconfig-', suffix='.yaml',
1008
+ delete=False).name
1009
+ subprocess.run(
1010
+ 'kubectl config view --flatten '
1011
+ f'> {kubeconfig_file}',
1012
+ shell=True,
1013
+ check=True)
1014
+ if os.path.exists(kubeconfig_file):
1015
+ # convert auth plugin paths (e.g.: gke-gcloud-auth-plugin)
1016
+ kubeconfig_file = kubernetes_utils.format_kubeconfig_exec_auth_with_cache(kubeconfig_file) # pylint: disable=line-too-long
1017
+
772
1018
  # Upload kubeconfig to the default path to avoid having to set
773
1019
  # KUBECONFIG in the environment.
774
- return {DEFAULT_KUBECONFIG_PATH: CREDENTIAL_PATH}
1020
+ return {kubernetes.DEFAULT_KUBECONFIG_PATH: kubeconfig_file}
775
1021
  else:
776
1022
  return {}
777
1023
 
@@ -787,7 +1033,7 @@ class Kubernetes(clouds.Cloud):
787
1033
 
788
1034
  all_contexts = kubernetes_utils.get_all_kube_context_names()
789
1035
 
790
- if region not in all_contexts:
1036
+ if region and region not in all_contexts:
791
1037
  raise ValueError(
792
1038
  f'Context {region} not found in kubeconfig. Kubernetes only '
793
1039
  'supports context names as regions. Available '
@@ -810,11 +1056,11 @@ class Kubernetes(clouds.Cloud):
810
1056
 
811
1057
  @classmethod
812
1058
  def get_user_identities(cls) -> Optional[List[List[str]]]:
813
- k8s = kubernetes.kubernetes
814
1059
  identities = []
1060
+ k8s = kubernetes.kubernetes
815
1061
  try:
816
1062
  all_contexts, current_context = (
817
- k8s.config.list_kube_config_contexts())
1063
+ kubernetes.list_kube_config_contexts())
818
1064
  except k8s.config.config_exception.ConfigException:
819
1065
  return None
820
1066
  # Add current context at the head of the list
@@ -825,6 +1071,31 @@ class Kubernetes(clouds.Cloud):
825
1071
  identities.append(identity)
826
1072
  return identities
827
1073
 
1074
+ @classmethod
1075
+ def is_volume_name_valid(cls,
1076
+ volume_name: str) -> Tuple[bool, Optional[str]]:
1077
+ """Validates that the volume name is valid for this cloud.
1078
+
1079
+ Follows Kubernetes DNS-1123 subdomain rules:
1080
+ - must be <= 253 characters
1081
+ - must match: '[a-z0-9]([-a-z0-9]*[a-z0-9])?(.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*' # pylint: disable=line-too-long
1082
+ """
1083
+ # Max length per DNS-1123 subdomain
1084
+ if len(volume_name) > cls._MAX_VOLUME_NAME_LEN_LIMIT:
1085
+ return (False, f'Volume name exceeds the maximum length of '
1086
+ f'{cls._MAX_VOLUME_NAME_LEN_LIMIT} characters '
1087
+ '(DNS-1123 subdomain).')
1088
+
1089
+ # DNS-1123 label: [a-z0-9]([-a-z0-9]*[a-z0-9])?
1090
+ label = r'[a-z0-9]([-a-z0-9]*[a-z0-9])?'
1091
+ # DNS-1123 subdomain: label(\.-separated label)*
1092
+ subdomain_pattern = rf'^{label}(\.{label})*$'
1093
+ if re.fullmatch(subdomain_pattern, volume_name) is None:
1094
+ return (False, 'Volume name must be a valid DNS-1123 subdomain: '
1095
+ 'lowercase alphanumeric, "-", and "."; start/end with '
1096
+ 'alphanumeric.')
1097
+ return True, None
1098
+
828
1099
  @classmethod
829
1100
  def is_label_valid(cls, label_key: str,
830
1101
  label_value: str) -> Tuple[bool, Optional[str]]:
@@ -854,3 +1125,133 @@ class Kubernetes(clouds.Cloud):
854
1125
  if not key_valid or not value_valid:
855
1126
  return False, error_msg
856
1127
  return True, None
1128
+
1129
+ @classmethod
1130
def expand_infras(cls) -> List[str]:
    """List one ``<canonical cloud name>/<context>`` entry per context.

    Contexts come from ``cls.existing_allowed_contexts(silent=True)`` and
    keep the order in which that call yields them.
    """
    infras: List[str] = []
    for ctx in cls.existing_allowed_contexts(silent=True):
        infras.append(f'{cls.canonical_name()}/{ctx}')
    return infras
1135
+
1136
+ @classmethod
1137
def _detect_network_type(
    cls,
    context: str,
    network_tier: Optional['resources_utils.NetworkTier'] = None
) -> Tuple[KubernetesHighPerformanceNetworkType, str]:
    """Work out which high-performance network a Kubernetes context offers.

    Node labels are inspected first. If they are inconclusive (or the
    cluster API is unreachable) and the context is configured with the GKE
    autoscaler, the machine types the autoscaler reports are checked too.

    Args:
        context: The Kubernetes context to check.
        network_tier: The requested network tier. Anything other than
            ``NetworkTier.BEST`` (including None) yields ``NONE``.

    Returns:
        A ``(network type, instance type)`` tuple. The instance type is an
        empty string when it is not relevant or nothing was detected.
    """
    net = KubernetesHighPerformanceNetworkType

    # High-performance networking is only considered for the BEST tier.
    if network_tier is None:
        return net.NONE, ''
    if network_tier != resources_utils.NetworkTier.BEST:
        return net.NONE, ''

    # GKE machine type -> GPUDirect variant. Order matters: the first
    # entry that matches wins, for both node labels and autoscaler types.
    gke_variants = (
        ('a3-highgpu-8g', net.GCP_TCPX),
        ('a3-edgegpu-8g', net.GCP_TCPX),
        ('a3-megagpu-8g', net.GCP_TCPXO),
        ('a4-highgpu-8g', net.GCP_GPUDIRECT_RDMA),
        ('a3-ultragpu-8g', net.GCP_GPUDIRECT_RDMA),
    )
    high_perf_gpus = ('nvidia-h100', 'nvidia-h200', 'nvidia-b200')

    try:
        for node in kubernetes_utils.get_kubernetes_nodes(context=context):
            labels = node.metadata.labels
            if not labels:
                continue

            # Provider-specific label prefixes identify Nebius and
            # CoreWeave clusters.
            for label_key in labels:
                if label_key.startswith('nebius.com/'):
                    return net.NEBIUS, ''
                if label_key.startswith('ib.coreweave.cloud/'):
                    return net.COREWEAVE, ''

            # GKE clusters: pick the GPUDirect variant from node labels.
            machine_family = labels.get('cloud.google.com/machine-family',
                                        '')
            instance_type = labels.get('node.kubernetes.io/instance-type',
                                       '')
            gke_accelerator = labels.get('cloud.google.com/gke-accelerator',
                                         '')

            if machine_family in ('a3', 'a4'):
                for variant, network_type in gke_variants:
                    if variant in instance_type:
                        return network_type, variant
                # Generic fallback for A4 nodes with an unrecognized
                # instance type.
                if machine_family == 'a4':
                    return net.GCP_GPUDIRECT_RDMA, 'a4'

            # Fallback: TCPX-capable instance type with a high-perf GPU.
            is_tcpx_instance = (
                instance_type
                in gcp_constants.GPU_DIRECT_TCPX_INSTANCE_TYPES)
            has_high_perf_gpu = any(
                gpu in gke_accelerator for gpu in high_perf_gpus)
            if is_tcpx_instance and has_high_perf_gpu:
                # Default to TCPX when the exact variant is unknown.
                return net.GCP_TCPX, instance_type

    except exceptions.KubeAPIUnreachableError:
        # If we can't reach the cluster, fall through to the autoscaler
        # configuration check below.
        pass

    # Nodes were inconclusive: see whether a GKE autoscaler could bring up
    # machine types that support high-performance networking.
    autoscaler_type = skypilot_config.get_effective_region_config(
        cloud='kubernetes',
        region=context,
        keys=('autoscaler',),
        default_value=None)
    if (autoscaler_type !=
            kubernetes_enums.KubernetesAutoscalerType.GKE.value):
        return net.NONE, ''

    autoscaler = kubernetes_utils.get_autoscaler(
        kubernetes_enums.KubernetesAutoscalerType(autoscaler_type))
    logger.debug(f'{context} has autoscaler of type: {autoscaler_type}')
    machine_types = autoscaler.get_available_machine_types(context)
    for variant, network_type in gke_variants:
        if variant in machine_types:
            return network_type, variant

    return net.NONE, ''