skypilot-nightly 1.0.0.dev20250905__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (397)
  1. sky/__init__.py +10 -2
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/ibm.py +5 -2
  8. sky/adaptors/kubernetes.py +64 -0
  9. sky/adaptors/nebius.py +3 -1
  10. sky/adaptors/primeintellect.py +1 -0
  11. sky/adaptors/seeweb.py +183 -0
  12. sky/adaptors/shadeform.py +89 -0
  13. sky/admin_policy.py +20 -0
  14. sky/authentication.py +157 -263
  15. sky/backends/__init__.py +3 -2
  16. sky/backends/backend.py +11 -3
  17. sky/backends/backend_utils.py +588 -184
  18. sky/backends/cloud_vm_ray_backend.py +1088 -904
  19. sky/backends/local_docker_backend.py +9 -5
  20. sky/backends/task_codegen.py +633 -0
  21. sky/backends/wheel_utils.py +18 -0
  22. sky/catalog/__init__.py +8 -0
  23. sky/catalog/aws_catalog.py +4 -0
  24. sky/catalog/common.py +19 -1
  25. sky/catalog/data_fetchers/fetch_aws.py +102 -80
  26. sky/catalog/data_fetchers/fetch_gcp.py +30 -3
  27. sky/catalog/data_fetchers/fetch_nebius.py +9 -6
  28. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  29. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  30. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  31. sky/catalog/kubernetes_catalog.py +24 -28
  32. sky/catalog/primeintellect_catalog.py +95 -0
  33. sky/catalog/runpod_catalog.py +5 -1
  34. sky/catalog/seeweb_catalog.py +184 -0
  35. sky/catalog/shadeform_catalog.py +165 -0
  36. sky/check.py +73 -43
  37. sky/client/cli/command.py +675 -412
  38. sky/client/cli/flags.py +4 -2
  39. sky/{volumes/utils.py → client/cli/table_utils.py} +111 -13
  40. sky/client/cli/utils.py +79 -0
  41. sky/client/common.py +12 -2
  42. sky/client/sdk.py +132 -63
  43. sky/client/sdk_async.py +34 -33
  44. sky/cloud_stores.py +82 -3
  45. sky/clouds/__init__.py +6 -0
  46. sky/clouds/aws.py +337 -129
  47. sky/clouds/azure.py +24 -18
  48. sky/clouds/cloud.py +40 -13
  49. sky/clouds/cudo.py +16 -13
  50. sky/clouds/do.py +9 -7
  51. sky/clouds/fluidstack.py +12 -5
  52. sky/clouds/gcp.py +14 -7
  53. sky/clouds/hyperbolic.py +12 -5
  54. sky/clouds/ibm.py +12 -5
  55. sky/clouds/kubernetes.py +80 -45
  56. sky/clouds/lambda_cloud.py +12 -5
  57. sky/clouds/nebius.py +23 -9
  58. sky/clouds/oci.py +19 -12
  59. sky/clouds/paperspace.py +4 -1
  60. sky/clouds/primeintellect.py +317 -0
  61. sky/clouds/runpod.py +85 -24
  62. sky/clouds/scp.py +12 -8
  63. sky/clouds/seeweb.py +477 -0
  64. sky/clouds/shadeform.py +400 -0
  65. sky/clouds/ssh.py +4 -2
  66. sky/clouds/utils/scp_utils.py +61 -50
  67. sky/clouds/vast.py +33 -27
  68. sky/clouds/vsphere.py +14 -16
  69. sky/core.py +174 -165
  70. sky/dashboard/out/404.html +1 -1
  71. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  72. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  73. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  74. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  75. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  76. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  77. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  78. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  79. sky/dashboard/out/_next/static/chunks/{6601-06114c982db410b6.js → 3800-7b45f9fbb6308557.js} +1 -1
  80. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  81. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  82. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  83. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  84. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  85. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  86. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  87. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  88. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  89. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  90. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  91. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  92. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  93. sky/dashboard/out/_next/static/chunks/pages/{_app-ce361c6959bc2001.js → _app-bde01e4a2beec258.js} +1 -1
  94. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  95. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  96. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  97. sky/dashboard/out/_next/static/chunks/pages/infra/{[context]-6563820e094f68ca.js → [context]-c0b5935149902e6f.js} +1 -1
  98. sky/dashboard/out/_next/static/chunks/pages/{infra-aabba60d57826e0f.js → infra-aed0ea19df7cf961.js} +1 -1
  99. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  100. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  101. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  102. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  103. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/pages/workspaces/{[name]-af76bb06dbb3954f.js → [name]-84a40f8c7c627fe4.js} +1 -1
  105. sky/dashboard/out/_next/static/chunks/pages/{workspaces-7598c33a746cdc91.js → workspaces-531b2f8c4bf89f82.js} +1 -1
  106. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  107. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  108. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  109. sky/dashboard/out/clusters/[cluster].html +1 -1
  110. sky/dashboard/out/clusters.html +1 -1
  111. sky/dashboard/out/config.html +1 -1
  112. sky/dashboard/out/index.html +1 -1
  113. sky/dashboard/out/infra/[context].html +1 -1
  114. sky/dashboard/out/infra.html +1 -1
  115. sky/dashboard/out/jobs/[job].html +1 -1
  116. sky/dashboard/out/jobs/pools/[pool].html +1 -1
  117. sky/dashboard/out/jobs.html +1 -1
  118. sky/dashboard/out/users.html +1 -1
  119. sky/dashboard/out/volumes.html +1 -1
  120. sky/dashboard/out/workspace/new.html +1 -1
  121. sky/dashboard/out/workspaces/[name].html +1 -1
  122. sky/dashboard/out/workspaces.html +1 -1
  123. sky/data/data_utils.py +92 -1
  124. sky/data/mounting_utils.py +162 -29
  125. sky/data/storage.py +200 -19
  126. sky/data/storage_utils.py +10 -45
  127. sky/exceptions.py +18 -7
  128. sky/execution.py +74 -31
  129. sky/global_user_state.py +605 -191
  130. sky/jobs/__init__.py +2 -0
  131. sky/jobs/client/sdk.py +101 -4
  132. sky/jobs/client/sdk_async.py +31 -5
  133. sky/jobs/constants.py +15 -8
  134. sky/jobs/controller.py +726 -284
  135. sky/jobs/file_content_utils.py +128 -0
  136. sky/jobs/log_gc.py +193 -0
  137. sky/jobs/recovery_strategy.py +250 -100
  138. sky/jobs/scheduler.py +271 -173
  139. sky/jobs/server/core.py +367 -114
  140. sky/jobs/server/server.py +81 -35
  141. sky/jobs/server/utils.py +89 -35
  142. sky/jobs/state.py +1498 -620
  143. sky/jobs/utils.py +771 -306
  144. sky/logs/agent.py +40 -5
  145. sky/logs/aws.py +9 -19
  146. sky/metrics/utils.py +282 -39
  147. sky/optimizer.py +1 -1
  148. sky/provision/__init__.py +37 -1
  149. sky/provision/aws/config.py +34 -13
  150. sky/provision/aws/instance.py +5 -2
  151. sky/provision/azure/instance.py +5 -3
  152. sky/provision/common.py +2 -0
  153. sky/provision/cudo/instance.py +4 -3
  154. sky/provision/do/instance.py +4 -3
  155. sky/provision/docker_utils.py +97 -26
  156. sky/provision/fluidstack/instance.py +6 -5
  157. sky/provision/gcp/config.py +6 -1
  158. sky/provision/gcp/instance.py +4 -2
  159. sky/provision/hyperbolic/instance.py +4 -2
  160. sky/provision/instance_setup.py +66 -20
  161. sky/provision/kubernetes/__init__.py +2 -0
  162. sky/provision/kubernetes/config.py +7 -44
  163. sky/provision/kubernetes/constants.py +0 -1
  164. sky/provision/kubernetes/instance.py +609 -213
  165. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  166. sky/provision/kubernetes/network.py +12 -8
  167. sky/provision/kubernetes/network_utils.py +8 -25
  168. sky/provision/kubernetes/utils.py +382 -418
  169. sky/provision/kubernetes/volume.py +150 -18
  170. sky/provision/lambda_cloud/instance.py +16 -13
  171. sky/provision/nebius/instance.py +6 -2
  172. sky/provision/nebius/utils.py +103 -86
  173. sky/provision/oci/instance.py +4 -2
  174. sky/provision/paperspace/instance.py +4 -3
  175. sky/provision/primeintellect/__init__.py +10 -0
  176. sky/provision/primeintellect/config.py +11 -0
  177. sky/provision/primeintellect/instance.py +454 -0
  178. sky/provision/primeintellect/utils.py +398 -0
  179. sky/provision/provisioner.py +30 -9
  180. sky/provision/runpod/__init__.py +2 -0
  181. sky/provision/runpod/instance.py +4 -3
  182. sky/provision/runpod/volume.py +69 -13
  183. sky/provision/scp/instance.py +307 -130
  184. sky/provision/seeweb/__init__.py +11 -0
  185. sky/provision/seeweb/config.py +13 -0
  186. sky/provision/seeweb/instance.py +812 -0
  187. sky/provision/shadeform/__init__.py +11 -0
  188. sky/provision/shadeform/config.py +12 -0
  189. sky/provision/shadeform/instance.py +351 -0
  190. sky/provision/shadeform/shadeform_utils.py +83 -0
  191. sky/provision/vast/instance.py +5 -3
  192. sky/provision/volume.py +164 -0
  193. sky/provision/vsphere/common/ssl_helper.py +1 -1
  194. sky/provision/vsphere/common/vapiconnect.py +2 -1
  195. sky/provision/vsphere/common/vim_utils.py +3 -2
  196. sky/provision/vsphere/instance.py +8 -6
  197. sky/provision/vsphere/vsphere_utils.py +8 -1
  198. sky/resources.py +11 -3
  199. sky/schemas/api/responses.py +107 -6
  200. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  201. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  202. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  203. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  204. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  205. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  206. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  207. sky/schemas/db/spot_jobs/002_cluster_pool.py +3 -3
  208. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  209. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  210. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  211. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  212. sky/schemas/generated/jobsv1_pb2.py +86 -0
  213. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  214. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  215. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  216. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  217. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  218. sky/schemas/generated/servev1_pb2.py +58 -0
  219. sky/schemas/generated/servev1_pb2.pyi +115 -0
  220. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  221. sky/serve/autoscalers.py +2 -0
  222. sky/serve/client/impl.py +55 -21
  223. sky/serve/constants.py +4 -3
  224. sky/serve/controller.py +17 -11
  225. sky/serve/load_balancing_policies.py +1 -1
  226. sky/serve/replica_managers.py +219 -142
  227. sky/serve/serve_rpc_utils.py +179 -0
  228. sky/serve/serve_state.py +63 -54
  229. sky/serve/serve_utils.py +145 -109
  230. sky/serve/server/core.py +46 -25
  231. sky/serve/server/impl.py +311 -162
  232. sky/serve/server/server.py +21 -19
  233. sky/serve/service.py +84 -68
  234. sky/serve/service_spec.py +45 -7
  235. sky/server/auth/loopback.py +38 -0
  236. sky/server/auth/oauth2_proxy.py +12 -7
  237. sky/server/common.py +47 -24
  238. sky/server/config.py +62 -28
  239. sky/server/constants.py +9 -1
  240. sky/server/daemons.py +109 -38
  241. sky/server/metrics.py +76 -96
  242. sky/server/middleware_utils.py +166 -0
  243. sky/server/requests/executor.py +381 -145
  244. sky/server/requests/payloads.py +71 -18
  245. sky/server/requests/preconditions.py +15 -13
  246. sky/server/requests/request_names.py +121 -0
  247. sky/server/requests/requests.py +507 -157
  248. sky/server/requests/serializers/decoders.py +48 -17
  249. sky/server/requests/serializers/encoders.py +85 -20
  250. sky/server/requests/threads.py +117 -0
  251. sky/server/rest.py +116 -24
  252. sky/server/server.py +420 -172
  253. sky/server/stream_utils.py +219 -45
  254. sky/server/uvicorn.py +30 -19
  255. sky/setup_files/MANIFEST.in +6 -1
  256. sky/setup_files/alembic.ini +8 -0
  257. sky/setup_files/dependencies.py +62 -19
  258. sky/setup_files/setup.py +44 -44
  259. sky/sky_logging.py +13 -5
  260. sky/skylet/attempt_skylet.py +106 -24
  261. sky/skylet/configs.py +3 -1
  262. sky/skylet/constants.py +111 -26
  263. sky/skylet/events.py +64 -10
  264. sky/skylet/job_lib.py +141 -104
  265. sky/skylet/log_lib.py +233 -5
  266. sky/skylet/log_lib.pyi +40 -2
  267. sky/skylet/providers/ibm/node_provider.py +12 -8
  268. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  269. sky/skylet/runtime_utils.py +21 -0
  270. sky/skylet/services.py +524 -0
  271. sky/skylet/skylet.py +22 -1
  272. sky/skylet/subprocess_daemon.py +104 -29
  273. sky/skypilot_config.py +99 -79
  274. sky/ssh_node_pools/server.py +9 -8
  275. sky/task.py +221 -104
  276. sky/templates/aws-ray.yml.j2 +1 -0
  277. sky/templates/azure-ray.yml.j2 +1 -0
  278. sky/templates/cudo-ray.yml.j2 +1 -0
  279. sky/templates/do-ray.yml.j2 +1 -0
  280. sky/templates/fluidstack-ray.yml.j2 +1 -0
  281. sky/templates/gcp-ray.yml.j2 +1 -0
  282. sky/templates/hyperbolic-ray.yml.j2 +1 -0
  283. sky/templates/ibm-ray.yml.j2 +2 -1
  284. sky/templates/jobs-controller.yaml.j2 +3 -0
  285. sky/templates/kubernetes-ray.yml.j2 +196 -55
  286. sky/templates/lambda-ray.yml.j2 +1 -0
  287. sky/templates/nebius-ray.yml.j2 +3 -0
  288. sky/templates/oci-ray.yml.j2 +1 -0
  289. sky/templates/paperspace-ray.yml.j2 +1 -0
  290. sky/templates/primeintellect-ray.yml.j2 +72 -0
  291. sky/templates/runpod-ray.yml.j2 +1 -0
  292. sky/templates/scp-ray.yml.j2 +1 -0
  293. sky/templates/seeweb-ray.yml.j2 +171 -0
  294. sky/templates/shadeform-ray.yml.j2 +73 -0
  295. sky/templates/vast-ray.yml.j2 +1 -0
  296. sky/templates/vsphere-ray.yml.j2 +1 -0
  297. sky/templates/websocket_proxy.py +188 -43
  298. sky/usage/usage_lib.py +16 -4
  299. sky/users/permission.py +60 -43
  300. sky/utils/accelerator_registry.py +6 -3
  301. sky/utils/admin_policy_utils.py +18 -5
  302. sky/utils/annotations.py +22 -0
  303. sky/utils/asyncio_utils.py +78 -0
  304. sky/utils/atomic.py +1 -1
  305. sky/utils/auth_utils.py +153 -0
  306. sky/utils/cli_utils/status_utils.py +12 -7
  307. sky/utils/cluster_utils.py +28 -6
  308. sky/utils/command_runner.py +88 -27
  309. sky/utils/command_runner.pyi +36 -3
  310. sky/utils/common.py +3 -1
  311. sky/utils/common_utils.py +37 -4
  312. sky/utils/config_utils.py +1 -14
  313. sky/utils/context.py +127 -40
  314. sky/utils/context_utils.py +73 -18
  315. sky/utils/controller_utils.py +229 -70
  316. sky/utils/db/db_utils.py +95 -18
  317. sky/utils/db/kv_cache.py +149 -0
  318. sky/utils/db/migration_utils.py +24 -7
  319. sky/utils/env_options.py +4 -0
  320. sky/utils/git.py +559 -1
  321. sky/utils/kubernetes/create_cluster.sh +15 -30
  322. sky/utils/kubernetes/delete_cluster.sh +10 -7
  323. sky/utils/kubernetes/{deploy_remote_cluster.py → deploy_ssh_node_pools.py} +258 -380
  324. sky/utils/kubernetes/generate_kind_config.py +6 -66
  325. sky/utils/kubernetes/gpu_labeler.py +13 -3
  326. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  327. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  328. sky/utils/kubernetes/kubernetes_deploy_utils.py +213 -194
  329. sky/utils/kubernetes/rsync_helper.sh +11 -3
  330. sky/utils/kubernetes_enums.py +7 -15
  331. sky/utils/lock_events.py +4 -4
  332. sky/utils/locks.py +128 -31
  333. sky/utils/log_utils.py +0 -319
  334. sky/utils/resource_checker.py +13 -10
  335. sky/utils/resources_utils.py +53 -29
  336. sky/utils/rich_utils.py +8 -4
  337. sky/utils/schemas.py +107 -52
  338. sky/utils/subprocess_utils.py +17 -4
  339. sky/utils/thread_utils.py +91 -0
  340. sky/utils/timeline.py +2 -1
  341. sky/utils/ux_utils.py +35 -1
  342. sky/utils/volume.py +88 -4
  343. sky/utils/yaml_utils.py +9 -0
  344. sky/volumes/client/sdk.py +48 -10
  345. sky/volumes/server/core.py +59 -22
  346. sky/volumes/server/server.py +46 -17
  347. sky/volumes/volume.py +54 -42
  348. sky/workspaces/core.py +57 -21
  349. sky/workspaces/server.py +13 -12
  350. sky_templates/README.md +3 -0
  351. sky_templates/__init__.py +3 -0
  352. sky_templates/ray/__init__.py +0 -0
  353. sky_templates/ray/start_cluster +183 -0
  354. sky_templates/ray/stop_cluster +75 -0
  355. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/METADATA +331 -65
  356. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  357. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  358. sky/client/cli/git.py +0 -549
  359. sky/dashboard/out/_next/static/chunks/1121-408ed10b2f9fce17.js +0 -1
  360. sky/dashboard/out/_next/static/chunks/1141-943efc7aff0f0c06.js +0 -1
  361. sky/dashboard/out/_next/static/chunks/1836-37fede578e2da5f8.js +0 -40
  362. sky/dashboard/out/_next/static/chunks/3015-86cabed5d4669ad0.js +0 -1
  363. sky/dashboard/out/_next/static/chunks/3294.c80326aec9bfed40.js +0 -6
  364. sky/dashboard/out/_next/static/chunks/3785.4872a2f3aa489880.js +0 -1
  365. sky/dashboard/out/_next/static/chunks/4045.b30465273dc5e468.js +0 -21
  366. sky/dashboard/out/_next/static/chunks/4676-9da7fdbde90b5549.js +0 -10
  367. sky/dashboard/out/_next/static/chunks/4725.10f7a9a5d3ea8208.js +0 -1
  368. sky/dashboard/out/_next/static/chunks/5339.3fda4a4010ff4e06.js +0 -51
  369. sky/dashboard/out/_next/static/chunks/6135-4b4d5e824b7f9d3c.js +0 -1
  370. sky/dashboard/out/_next/static/chunks/649.b9d7f7d10c1b8c53.js +0 -45
  371. sky/dashboard/out/_next/static/chunks/6856-dca7962af4814e1b.js +0 -1
  372. sky/dashboard/out/_next/static/chunks/6990-08b2a1cae076a943.js +0 -1
  373. sky/dashboard/out/_next/static/chunks/7325.b4bc99ce0892dcd5.js +0 -6
  374. sky/dashboard/out/_next/static/chunks/754-d0da8ab45f9509e9.js +0 -18
  375. sky/dashboard/out/_next/static/chunks/7669.1f5d9a402bf5cc42.js +0 -36
  376. sky/dashboard/out/_next/static/chunks/8969-0be3036bf86f8256.js +0 -1
  377. sky/dashboard/out/_next/static/chunks/9025.c12318fb6a1a9093.js +0 -6
  378. sky/dashboard/out/_next/static/chunks/9037-fa1737818d0a0969.js +0 -6
  379. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-1cbba24bd1bd35f8.js +0 -16
  380. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-0b4b35dc1dfe046c.js +0 -16
  381. sky/dashboard/out/_next/static/chunks/pages/clusters-469814d711d63b1b.js +0 -1
  382. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-dd64309c3fe67ed2.js +0 -11
  383. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-07349868f7905d37.js +0 -16
  384. sky/dashboard/out/_next/static/chunks/pages/jobs-1f70d9faa564804f.js +0 -1
  385. sky/dashboard/out/_next/static/chunks/pages/users-018bf31cda52e11b.js +0 -1
  386. sky/dashboard/out/_next/static/chunks/pages/volumes-739726d6b823f532.js +0 -1
  387. sky/dashboard/out/_next/static/chunks/webpack-4fe903277b57b523.js +0 -1
  388. sky/dashboard/out/_next/static/css/4614e06482d7309e.css +0 -3
  389. sky/dashboard/out/_next/static/mS-4qZPSkRuA1u-g2wQhg/_buildManifest.js +0 -1
  390. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  391. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  392. skypilot_nightly-1.0.0.dev20250905.dist-info/RECORD +0 -547
  393. skypilot_nightly-1.0.0.dev20250905.dist-info/top_level.txt +0 -1
  394. /sky/dashboard/out/_next/static/{mS-4qZPSkRuA1u-g2wQhg → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  395. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +0 -0
  396. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  397. {skypilot_nightly-1.0.0.dev20250905.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/clouds/azure.py CHANGED
@@ -15,6 +15,7 @@ from sky import exceptions
15
15
  from sky import sky_logging
16
16
  from sky import skypilot_config
17
17
  from sky.adaptors import azure
18
+ from sky.adaptors import common as adaptors_common
18
19
  from sky.clouds.utils import azure_utils
19
20
  from sky.utils import annotations
20
21
  from sky.utils import common_utils
@@ -86,7 +87,9 @@ class Azure(clouds.Cloud):
86
87
 
87
88
  @classmethod
88
89
  def _unsupported_features_for_resources(
89
- cls, resources: 'resources.Resources'
90
+ cls,
91
+ resources: 'resources.Resources',
92
+ region: Optional[str] = None,
90
93
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
91
94
  features = {
92
95
  clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
@@ -263,10 +266,15 @@ class Azure(clouds.Cloud):
263
266
  return _DEFAULT_GPU_IMAGE_ID
264
267
 
265
268
  @classmethod
266
- def regions_with_offering(cls, instance_type: str,
267
- accelerators: Optional[Dict[str, int]],
268
- use_spot: bool, region: Optional[str],
269
- zone: Optional[str]) -> List[clouds.Region]:
269
+ def regions_with_offering(
270
+ cls,
271
+ instance_type: str,
272
+ accelerators: Optional[Dict[str, int]],
273
+ use_spot: bool,
274
+ region: Optional[str],
275
+ zone: Optional[str],
276
+ resources: Optional['resources.Resources'] = None,
277
+ ) -> List[clouds.Region]:
270
278
  del accelerators # unused
271
279
  assert zone is None, 'Azure does not support zones'
272
280
  regions = catalog.get_region_zones_for_instance_type(
@@ -546,6 +554,7 @@ class Azure(clouds.Cloud):
546
554
  @classmethod
547
555
  def _check_credentials(cls) -> Tuple[bool, Optional[str]]:
548
556
  """Checks if the user has access credentials to this cloud."""
557
+
549
558
  help_str = (
550
559
  ' Run the following commands:'
551
560
  f'\n{cls._INDENT_PREFIX} $ az login'
@@ -561,6 +570,16 @@ class Azure(clouds.Cloud):
561
570
  return (False,
562
571
  f'{azure_token_cache_file} does not exist.' + help_str)
563
572
 
573
+ dependency_installation_hints = (
574
+ 'Azure dependencies are not installed. '
575
+ 'Run the following commands:'
576
+ f'\n{cls._INDENT_PREFIX} $ pip install skypilot[azure]'
577
+ f'\n{cls._INDENT_PREFIX}Credentials may also need to be set.')
578
+ # Check if the azure blob storage dependencies are installed.
579
+ if not adaptors_common.can_import_modules(
580
+ ['azure.storage.blob', 'msgraph']):
581
+ return False, dependency_installation_hints
582
+
564
583
  try:
565
584
  _run_output('az --version')
566
585
  except subprocess.CalledProcessError as e:
@@ -580,19 +599,6 @@ class Azure(clouds.Cloud):
580
599
  return False, (f'Getting user\'s Azure identity failed.{help_str}\n'
581
600
  f'{cls._INDENT_PREFIX}Details: '
582
601
  f'{common_utils.format_exception(e)}')
583
-
584
- # Check if the azure blob storage dependencies are installed.
585
- try:
586
- # pylint: disable=redefined-outer-name, import-outside-toplevel, unused-import
587
- from azure.storage import blob
588
- import msgraph
589
- except ImportError as e:
590
- return False, (
591
- f'Azure blob storage depdencies are not installed. '
592
- 'Run the following commands:'
593
- f'\n{cls._INDENT_PREFIX} $ pip install skypilot[azure]'
594
- f'\n{cls._INDENT_PREFIX}Details: '
595
- f'{common_utils.format_exception(e)}')
596
602
  return True, None
597
603
 
598
604
  def get_credential_file_mounts(self) -> Dict[str, str]:
sky/clouds/cloud.py CHANGED
@@ -185,10 +185,15 @@ class Cloud:
185
185
  #### Regions/Zones ####
186
186
 
187
187
  @classmethod
188
- def regions_with_offering(cls, instance_type: str,
189
- accelerators: Optional[Dict[str, int]],
190
- use_spot: bool, region: Optional[str],
191
- zone: Optional[str]) -> List[Region]:
188
+ def regions_with_offering(
189
+ cls,
190
+ instance_type: str,
191
+ accelerators: Optional[Dict[str, int]],
192
+ use_spot: bool,
193
+ region: Optional[str],
194
+ zone: Optional[str],
195
+ resources: Optional['resources_lib.Resources'] = None,
196
+ ) -> List[Region]:
192
197
  """Returns the regions that offer the specified resources.
193
198
 
194
199
  The order of the regions follow the order of the regions returned by
@@ -340,6 +345,14 @@ class Cloud:
340
345
  """Returns {acc: acc_count} held by 'instance_type', if any."""
341
346
  raise NotImplementedError
342
347
 
348
+ @classmethod
349
+ def get_arch_from_instance_type(
350
+ cls,
351
+ instance_type: str,
352
+ ) -> Optional[str]:
353
+ """Returns the arch of the instance type, if any."""
354
+ raise NotImplementedError
355
+
343
356
  @classmethod
344
357
  def get_default_instance_type(cls,
345
358
  cpus: Optional[str] = None,
@@ -666,8 +679,11 @@ class Cloud:
666
679
 
667
680
  @classmethod
668
681
  def check_features_are_supported(
669
- cls, resources: 'resources_lib.Resources',
670
- requested_features: Set[CloudImplementationFeatures]) -> None:
682
+ cls,
683
+ resources: 'resources_lib.Resources',
684
+ requested_features: Set[CloudImplementationFeatures],
685
+ region: Optional[str] = None,
686
+ ) -> None:
671
687
  """Errors out if the cloud does not support all requested features.
672
688
 
673
689
  For instance, Lambda Cloud does not support stop, so
@@ -685,7 +701,7 @@ class Cloud:
685
701
  requested features.
686
702
  """
687
703
  unsupported_features2reason = cls._unsupported_features_for_resources(
688
- resources)
704
+ resources, region)
689
705
 
690
706
  # Docker image is not compatible with ssh proxy command.
691
707
  if skypilot_config.get_effective_region_config(
@@ -715,7 +731,9 @@ class Cloud:
715
731
 
716
732
  @classmethod
717
733
  def _unsupported_features_for_resources(
718
- cls, resources: 'resources_lib.Resources'
734
+ cls,
735
+ resources: 'resources_lib.Resources',
736
+ region: Optional[str] = None,
719
737
  ) -> Dict[CloudImplementationFeatures, str]:
720
738
  """The features not supported based on the resources provided.
721
739
 
@@ -726,7 +744,7 @@ class Cloud:
726
744
  A dict of {feature: reason} for the features not supported by the
727
745
  cloud implementation.
728
746
  """
729
- del resources
747
+ del resources, region
730
748
  raise NotImplementedError
731
749
 
732
750
  @classmethod
@@ -800,12 +818,21 @@ class Cloud:
800
818
  if acc_from_instance_type is None:
801
819
  return False
802
820
 
803
- for acc in acc_requested:
804
- if acc not in acc_from_instance_type:
821
+ for requested_acc in acc_requested:
822
+ for instance_acc in acc_from_instance_type:
823
+ # The requested accelerator can be canonicalized based on
824
+ # the accelerator registry, which may not has the same case
825
+ # as the cloud's catalog, e.g., 'RTXPro6000' in Shadeform
826
+ # catalog, and 'RTXPRO6000' in RunPod catalog.
827
+ if requested_acc.lower() == instance_acc.lower():
828
+ # Found the requested accelerator in the instance type.
829
+ break
830
+ else:
831
+ # Requested accelerator not found in instance type.
805
832
  return False
806
833
  # Avoid float point precision issue.
807
- if not math.isclose(acc_requested[acc],
808
- acc_from_instance_type[acc]):
834
+ if not math.isclose(acc_requested[requested_acc],
835
+ acc_from_instance_type[instance_acc]):
809
836
  return False
810
837
  return True
811
838
 
sky/clouds/cudo.py CHANGED
@@ -5,6 +5,7 @@ from typing import Dict, Iterator, List, Optional, Tuple, Union
5
5
 
6
6
  from sky import catalog
7
7
  from sky import clouds
8
+ from sky.adaptors import common
8
9
  from sky.utils import common_utils
9
10
  from sky.utils import registry
10
11
  from sky.utils import resources_utils
@@ -86,7 +87,9 @@ class Cudo(clouds.Cloud):
86
87
 
87
88
  @classmethod
88
89
  def _unsupported_features_for_resources(
89
- cls, resources: 'resources_lib.Resources'
90
+ cls,
91
+ resources: 'resources_lib.Resources',
92
+ region: Optional[str] = None,
90
93
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
91
94
  """The features not supported based on the resources provided.
92
95
 
@@ -105,10 +108,15 @@ class Cudo(clouds.Cloud):
105
108
  return cls._MAX_CLUSTER_NAME_LEN_LIMIT
106
109
 
107
110
  @classmethod
108
- def regions_with_offering(cls, instance_type,
109
- accelerators: Optional[Dict[str, int]],
110
- use_spot: bool, region: Optional[str],
111
- zone: Optional[str]) -> List[clouds.Region]:
111
+ def regions_with_offering(
112
+ cls,
113
+ instance_type,
114
+ accelerators: Optional[Dict[str, int]],
115
+ use_spot: bool,
116
+ region: Optional[str],
117
+ zone: Optional[str],
118
+ resources: Optional['resources_lib.Resources'] = None,
119
+ ) -> List[clouds.Region]:
112
120
  assert zone is None, 'Cudo does not support zones.'
113
121
  del accelerators, zone # unused
114
122
  if use_spot:
@@ -287,14 +295,9 @@ class Cudo(clouds.Cloud):
287
295
  cls) -> Tuple[bool, Optional[Union[str, Dict[str, str]]]]:
288
296
  """Checks if the user has access credentials to
289
297
  Cudo's compute service."""
290
- try:
291
- # pylint: disable=import-outside-toplevel,unused-import
292
- from cudo_compute import cudo_api
293
- except (ImportError, subprocess.CalledProcessError) as e:
294
- return False, (
295
- f'{cls._DEPENDENCY_HINT}\n'
296
- f'{cls._INDENT_PREFIX}'
297
- f'{common_utils.format_exception(e, use_bracket=True)}')
298
+ if not common.can_import_modules(['cudo_compute']):
299
+ return False, (f'{cls._DEPENDENCY_HINT}\n'
300
+ f'{cls._INDENT_PREFIX}')
298
301
 
299
302
  try:
300
303
  _run_output('cudoctl --version')
sky/clouds/do.py CHANGED
@@ -57,7 +57,9 @@ class DO(clouds.Cloud):
57
57
 
58
58
  @classmethod
59
59
  def _unsupported_features_for_resources(
60
- cls, resources: 'resources_lib.Resources'
60
+ cls,
61
+ resources: 'resources_lib.Resources',
62
+ region: Optional[str] = None,
61
63
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
62
64
  """The features not supported based on the resources provided.
63
65
 
@@ -83,6 +85,7 @@ class DO(clouds.Cloud):
83
85
  use_spot: bool,
84
86
  region: Optional[str],
85
87
  zone: Optional[str],
88
+ resources: Optional['resources_lib.Resources'] = None,
86
89
  ) -> List[clouds.Region]:
87
90
  assert zone is None, 'DO does not support zones.'
88
91
  del accelerators, zone # unused
@@ -283,18 +286,17 @@ class DO(clouds.Cloud):
283
286
  """Verify that the user has valid credentials for
284
287
  DO's compute service."""
285
288
 
286
- try:
287
- do.exceptions()
288
- except ImportError as err:
289
- return False, str(err)
289
+ installed, err_msg = do.check_exceptions_dependencies_installed()
290
+ if not installed:
291
+ return False, err_msg
290
292
 
291
293
  try:
292
294
  # attempt to make a CURL request for listing instances
293
295
  do_utils.client().droplets.list()
294
- except do.exceptions().HttpResponseError as err:
295
- return False, str(err)
296
296
  except do_utils.DigitalOceanError as err:
297
297
  return False, str(err)
298
+ except do.exceptions().HttpResponseError as err:
299
+ return False, str(err)
298
300
 
299
301
  return True, None
300
302
 
sky/clouds/fluidstack.py CHANGED
@@ -73,7 +73,9 @@ class Fluidstack(clouds.Cloud):
73
73
 
74
74
  @classmethod
75
75
  def _unsupported_features_for_resources(
76
- cls, resources: 'resources_lib.Resources'
76
+ cls,
77
+ resources: 'resources_lib.Resources',
78
+ region: Optional[str] = None,
77
79
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
78
80
  """The features not supported based on the resources provided.
79
81
 
@@ -92,10 +94,15 @@ class Fluidstack(clouds.Cloud):
92
94
  return cls._MAX_CLUSTER_NAME_LEN_LIMIT
93
95
 
94
96
  @classmethod
95
- def regions_with_offering(cls, instance_type: str,
96
- accelerators: Optional[Dict[str, int]],
97
- use_spot: bool, region: Optional[str],
98
- zone: Optional[str]) -> List[clouds.Region]:
97
+ def regions_with_offering(
98
+ cls,
99
+ instance_type: str,
100
+ accelerators: Optional[Dict[str, int]],
101
+ use_spot: bool,
102
+ region: Optional[str],
103
+ zone: Optional[str],
104
+ resources: Optional['resources_lib.Resources'] = None,
105
+ ) -> List[clouds.Region]:
99
106
  assert zone is None, 'FluidStack does not support zones.'
100
107
  del accelerators, zone # unused
101
108
  if use_spot:
sky/clouds/gcp.py CHANGED
@@ -211,7 +211,9 @@ class GCP(clouds.Cloud):
211
211
 
212
212
  @classmethod
213
213
  def _unsupported_features_for_resources(
214
- cls, resources: 'resources.Resources'
214
+ cls,
215
+ resources: 'resources.Resources',
216
+ region: Optional[str] = None,
215
217
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
216
218
  unsupported = {}
217
219
  if gcp_utils.is_tpu_vm_pod(resources):
@@ -255,10 +257,15 @@ class GCP(clouds.Cloud):
255
257
 
256
258
  #### Regions/Zones ####
257
259
  @classmethod
258
- def regions_with_offering(cls, instance_type: str,
259
- accelerators: Optional[Dict[str, int]],
260
- use_spot: bool, region: Optional[str],
261
- zone: Optional[str]) -> List[clouds.Region]:
260
+ def regions_with_offering(
261
+ cls,
262
+ instance_type: str,
263
+ accelerators: Optional[Dict[str, int]],
264
+ use_spot: bool,
265
+ region: Optional[str],
266
+ zone: Optional[str],
267
+ resources: Optional['resources.Resources'] = None,
268
+ ) -> List[clouds.Region]:
262
269
  if accelerators is None:
263
270
  regions = catalog.get_region_zones_for_instance_type(instance_type,
264
271
  use_spot,
@@ -1179,8 +1186,8 @@ class GCP(clouds.Cloud):
1179
1186
  # These series don't support pd-standard, use pd-balanced for LOW.
1180
1187
  _propagate_disk_type(
1181
1188
  lowest=tier2name[resources_utils.DiskTier.MEDIUM])
1182
- if instance_type.startswith('a3-ultragpu') or series == 'n4':
1183
- # a3-ultragpu instances only support hyperdisk-balanced.
1189
+ if instance_type.startswith('a3-ultragpu') or series in ('n4', 'a4'):
1190
+ # a3-ultragpu, n4, and a4 instances only support hyperdisk-balanced.
1184
1191
  _propagate_disk_type(all='hyperdisk-balanced')
1185
1192
 
1186
1193
  # Series specific handling
sky/clouds/hyperbolic.py CHANGED
@@ -65,7 +65,9 @@ class Hyperbolic(clouds.Cloud):
65
65
 
66
66
  @classmethod
67
67
  def _unsupported_features_for_resources(
68
- cls, resources: 'resources_lib.Resources'
68
+ cls,
69
+ resources: 'resources_lib.Resources',
70
+ region: Optional[str] = None,
69
71
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
70
72
  del resources
71
73
  return cls._CLOUD_UNSUPPORTED_FEATURES
@@ -78,10 +80,15 @@ class Hyperbolic(clouds.Cloud):
78
80
  return catalog.instance_type_exists(instance_type, 'hyperbolic')
79
81
 
80
82
  @classmethod
81
- def regions_with_offering(cls, instance_type: str,
82
- accelerators: Optional[Dict[str, int]],
83
- use_spot: bool, region: Optional[str],
84
- zone: Optional[str]) -> List[clouds.Region]:
83
+ def regions_with_offering(
84
+ cls,
85
+ instance_type: str,
86
+ accelerators: Optional[Dict[str, int]],
87
+ use_spot: bool,
88
+ region: Optional[str],
89
+ zone: Optional[str],
90
+ resources: Optional['resources_lib.Resources'] = None,
91
+ ) -> List[clouds.Region]:
85
92
  assert zone is None, 'Hyperbolic does not support zones.'
86
93
  del accelerators, zone # unused
87
94
 
sky/clouds/ibm.py CHANGED
@@ -37,7 +37,9 @@ class IBM(clouds.Cloud):
37
37
 
38
38
  @classmethod
39
39
  def _unsupported_features_for_resources(
40
- cls, resources: 'resources_lib.Resources'
40
+ cls,
41
+ resources: 'resources_lib.Resources',
42
+ region: Optional[str] = None,
41
43
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
42
44
  features = {
43
45
  clouds.CloudImplementationFeatures.CLONE_DISK_FROM_CLUSTER:
@@ -68,10 +70,15 @@ class IBM(clouds.Cloud):
68
70
  return cls._MAX_CLUSTER_NAME_LEN_LIMIT
69
71
 
70
72
  @classmethod
71
- def regions_with_offering(cls, instance_type: str,
72
- accelerators: Optional[Dict[str, int]],
73
- use_spot: bool, region: Optional[str],
74
- zone: Optional[str]) -> List[clouds.Region]:
73
+ def regions_with_offering(
74
+ cls,
75
+ instance_type: str,
76
+ accelerators: Optional[Dict[str, int]],
77
+ use_spot: bool,
78
+ region: Optional[str],
79
+ zone: Optional[str],
80
+ resources: Optional['resources_lib.Resources'] = None,
81
+ ) -> List[clouds.Region]:
75
82
  del accelerators # unused
76
83
  if use_spot:
77
84
  return []
sky/clouds/kubernetes.py CHANGED
@@ -1,4 +1,5 @@
1
1
  """Kubernetes."""
2
+ import concurrent.futures
2
3
  import os
3
4
  import re
4
5
  import subprocess
@@ -25,6 +26,7 @@ from sky.provision.kubernetes.utils import normalize_tpu_accelerator_name
25
26
  from sky.skylet import constants
26
27
  from sky.utils import annotations
27
28
  from sky.utils import common_utils
29
+ from sky.utils import env_options
28
30
  from sky.utils import kubernetes_enums
29
31
  from sky.utils import registry
30
32
  from sky.utils import resources_utils
@@ -47,9 +49,6 @@ _FUSERMOUNT_SHARED_DIR = '/var/run/fusermount'
47
49
  class Kubernetes(clouds.Cloud):
48
50
  """Kubernetes."""
49
51
 
50
- SKY_SSH_KEY_SECRET_NAME = 'sky-ssh-keys'
51
- SKY_SSH_JUMP_NAME = 'sky-ssh-jump-pod'
52
-
53
52
  # Limit the length of the cluster name to avoid exceeding the limit of 63
54
53
  # characters for Kubernetes resources. We limit to 42 characters (63-21) to
55
54
  # allow additional characters for creating ingress services to expose ports.
@@ -62,6 +61,7 @@ class Kubernetes(clouds.Cloud):
62
61
  _SUPPORTS_SERVICE_ACCOUNT_ON_REMOTE = True
63
62
 
64
63
  _DEFAULT_NUM_VCPUS = 2
64
+ _DEFAULT_NUM_VCPUS_WITH_GPU = 4
65
65
  _DEFAULT_MEMORY_CPU_RATIO = 1
66
66
  _DEFAULT_MEMORY_CPU_RATIO_WITH_GPU = 4 # Allocate more memory for GPU tasks
67
67
  _REPR = 'Kubernetes'
@@ -97,44 +97,52 @@ class Kubernetes(clouds.Cloud):
97
97
  # Set of contexts that has logged as temporarily unreachable
98
98
  logged_unreachable_contexts: Set[str] = set()
99
99
 
100
- @property
101
- def ssh_key_secret_field_name(self):
102
- # Use a fresh user hash to avoid conflicts in the secret object naming.
103
- # This can happen when the controller is reusing the same user hash
104
- # through USER_ID_ENV_VAR but has a different SSH key.
105
- fresh_user_hash = common_utils.generate_user_hash()
106
- return f'ssh-publickey-{fresh_user_hash}'
107
-
108
100
  @classmethod
109
101
  def _unsupported_features_for_resources(
110
- cls, resources: 'resources_lib.Resources'
102
+ cls,
103
+ resources: 'resources_lib.Resources',
104
+ region: Optional[str] = None,
111
105
  ) -> Dict[clouds.CloudImplementationFeatures, str]:
112
106
  # TODO(aylei): features need to be regional (per context) to make
113
107
  # multi-kubernetes selection/failover work.
114
108
  unsupported_features = cls._CLOUD_UNSUPPORTED_FEATURES.copy()
115
- context = resources.region
109
+ context = region if region is not None else resources.region
116
110
  if context is None:
117
- context = kubernetes_utils.get_current_kube_config_context_name()
111
+ contexts = cls.existing_allowed_contexts()
112
+ else:
113
+ contexts = [context]
118
114
  unsupported_features[clouds.CloudImplementationFeatures.STOP] = (
119
115
  'Stopping clusters is not supported on Kubernetes.')
120
116
  unsupported_features[clouds.CloudImplementationFeatures.AUTOSTOP] = (
121
117
  'Auto-stop is not supported on Kubernetes.')
122
- # Allow spot instances if supported by the cluster
123
- try:
124
- spot_label_key, _ = kubernetes_utils.get_spot_label(context)
125
- if spot_label_key is not None:
126
- unsupported_features.pop(
127
- clouds.CloudImplementationFeatures.SPOT_INSTANCE, None)
128
- # Allow custom network tier if supported by the cluster
129
- # (e.g., Nebius clusters with high performance networking)
130
- network_type, _ = cls._detect_network_type(context,
131
- resources.network_tier)
132
- if network_type.supports_high_performance_networking():
133
- unsupported_features.pop(
134
- clouds.CloudImplementationFeatures.CUSTOM_NETWORK_TIER,
135
- None)
136
- except exceptions.KubeAPIUnreachableError as e:
137
- cls._log_unreachable_context(context, str(e))
118
+ for context in contexts:
119
+ # Allow spot instances if supported by the cluster
120
+ try:
121
+ # Run spot label check and network type detection concurrently
122
+ # as they are independent operations
123
+ with concurrent.futures.ThreadPoolExecutor(
124
+ max_workers=2) as executor:
125
+ spot_future = executor.submit(
126
+ kubernetes_utils.get_spot_label, context)
127
+ network_future = executor.submit(cls._detect_network_type,
128
+ context,
129
+ resources.network_tier)
130
+
131
+ spot_label_key, _ = spot_future.result()
132
+ if spot_label_key is not None:
133
+ unsupported_features.pop(
134
+ clouds.CloudImplementationFeatures.SPOT_INSTANCE,
135
+ None)
136
+
137
+ # Allow custom network tier if supported by the cluster
138
+ # (e.g., Nebius clusters with high performance networking)
139
+ network_type, _ = network_future.result()
140
+ if network_type.supports_high_performance_networking():
141
+ unsupported_features.pop(
142
+ clouds.CloudImplementationFeatures.
143
+ CUSTOM_NETWORK_TIER, None)
144
+ except exceptions.KubeAPIUnreachableError as e:
145
+ cls._log_unreachable_context(context, str(e))
138
146
  return unsupported_features
139
147
 
140
148
  @classmethod
@@ -187,6 +195,12 @@ class Kubernetes(clouds.Cloud):
187
195
  ctx for ctx in all_contexts if not ctx.startswith('ssh-')
188
196
  ]
189
197
 
198
+ allow_all_contexts = allowed_contexts == 'all' or (
199
+ allowed_contexts is None and
200
+ env_options.Options.ALLOW_ALL_KUBERNETES_CONTEXTS.get())
201
+ if allow_all_contexts:
202
+ allowed_contexts = all_contexts
203
+
190
204
  if allowed_contexts is None:
191
205
  # Try kubeconfig if present
192
206
  current_context = (
@@ -244,10 +258,15 @@ class Kubernetes(clouds.Cloud):
244
258
  'refresh Kubernetes availability if permanent.')
245
259
 
246
260
  @classmethod
247
- def regions_with_offering(cls, instance_type: Optional[str],
248
- accelerators: Optional[Dict[str, int]],
249
- use_spot: bool, region: Optional[str],
250
- zone: Optional[str]) -> List[clouds.Region]:
261
+ def regions_with_offering(
262
+ cls,
263
+ instance_type: Optional[str],
264
+ accelerators: Optional[Dict[str, int]],
265
+ use_spot: bool,
266
+ region: Optional[str],
267
+ zone: Optional[str],
268
+ resources: Optional['resources_lib.Resources'] = None,
269
+ ) -> List[clouds.Region]:
251
270
  del accelerators, zone, use_spot # unused
252
271
  existing_contexts = cls.existing_allowed_contexts()
253
272
 
@@ -257,6 +276,19 @@ class Kubernetes(clouds.Cloud):
257
276
 
258
277
  if region is not None:
259
278
  regions = [r for r in regions if r.name == region]
279
+ if resources is not None:
280
+ filtered_regions = []
281
+ resources_required_features = resources.get_required_cloud_features(
282
+ )
283
+ for r in regions:
284
+ try:
285
+ cls.check_features_are_supported(
286
+ resources, resources_required_features, r.name)
287
+ filtered_regions.append(r)
288
+ except exceptions.NotSupportedError as e:
289
+ logger.info(f'Filter out context: {r.name}, reason: {e}')
290
+ continue
291
+ regions = filtered_regions
260
292
 
261
293
  # Check if requested instance type will fit in the cluster.
262
294
  # TODO(zhwu,romilb): autoscaler type needs to be regional (per
@@ -516,9 +548,6 @@ class Kubernetes(clouds.Cloud):
516
548
  return image_id
517
549
 
518
550
  image_id = _get_image_id(resources)
519
- # TODO(romilb): Create a lightweight image for SSH jump host
520
- ssh_jump_image = catalog.get_image_id_from_tag(self.IMAGE_CPU,
521
- clouds='kubernetes')
522
551
 
523
552
  # Set environment variables for the pod. Note that SkyPilot env vars
524
553
  # are set separately when the task is run. These env vars are
@@ -566,6 +595,7 @@ class Kubernetes(clouds.Cloud):
566
595
  port_mode = network_utils.get_port_mode(None, context)
567
596
 
568
597
  remote_identity = skypilot_config.get_effective_region_config(
598
+ # TODO(kyuds): Support SSH node pools as well.
569
599
  cloud='kubernetes',
570
600
  region=context,
571
601
  keys=('remote_identity',),
@@ -640,6 +670,7 @@ class Kubernetes(clouds.Cloud):
640
670
 
641
671
  k8s_kueue_local_queue_name = (
642
672
  skypilot_config.get_effective_region_config(
673
+ # TODO(kyuds): Support SSH node pools as well.
643
674
  cloud='kubernetes',
644
675
  region=context,
645
676
  keys=('kueue', 'local_queue_name'),
@@ -654,6 +685,7 @@ class Kubernetes(clouds.Cloud):
654
685
  if enable_flex_start_queued_provisioning or enable_flex_start:
655
686
  # DWS is only supported in GKE, check the autoscaler type.
656
687
  autoscaler_type = skypilot_config.get_effective_region_config(
688
+ # TODO(kyuds): Support SSH node pools as well.
657
689
  cloud='kubernetes',
658
690
  region=context,
659
691
  keys=('autoscaler',),
@@ -677,8 +709,12 @@ class Kubernetes(clouds.Cloud):
677
709
  timeout = self._calculate_provision_timeout(
678
710
  num_nodes, volume_mounts, enable_flex_start or
679
711
  enable_flex_start_queued_provisioning)
712
+
713
+ # Use _REPR, instead of directly using 'kubernetes' as the config key,
714
+ # because it could be SSH node pool as well.
715
+ cloud_config_str = self._REPR.lower()
680
716
  timeout = skypilot_config.get_effective_region_config(
681
- cloud='kubernetes',
717
+ cloud=cloud_config_str,
682
718
  region=context,
683
719
  keys=('provision_timeout',),
684
720
  default_value=timeout,
@@ -692,13 +728,8 @@ class Kubernetes(clouds.Cloud):
692
728
  'accelerator_count': str(acc_count),
693
729
  'timeout': str(timeout),
694
730
  'k8s_port_mode': port_mode.value,
695
- 'k8s_networking_mode': network_utils.get_networking_mode(
696
- None, context=context).value,
697
- 'k8s_ssh_key_secret_name': self.SKY_SSH_KEY_SECRET_NAME,
698
731
  'k8s_acc_label_key': k8s_acc_label_key,
699
732
  'k8s_acc_label_values': k8s_acc_label_values,
700
- 'k8s_ssh_jump_name': self.SKY_SSH_JUMP_NAME,
701
- 'k8s_ssh_jump_image': ssh_jump_image,
702
733
  'k8s_service_account_name': k8s_service_account_name,
703
734
  'k8s_automount_sa_token': 'true',
704
735
  'k8s_fuse_device_required': fuse_device_required,
@@ -796,7 +827,8 @@ class Kubernetes(clouds.Cloud):
796
827
  accelerators=resources.accelerators,
797
828
  use_spot=resources.use_spot,
798
829
  region=resources.region,
799
- zone=resources.zone)
830
+ zone=resources.zone,
831
+ resources=resources)
800
832
  if not regions:
801
833
  return resources_utils.FeasibleResources([], [], None)
802
834
  resources = resources.copy(accelerators=None)
@@ -841,6 +873,8 @@ class Kubernetes(clouds.Cloud):
841
873
  from_instance_type(default_instance_type))
842
874
 
843
875
  gpu_task_cpus = k8s_instance_type.cpus
876
+ if resources.cpus is None:
877
+ gpu_task_cpus = self._DEFAULT_NUM_VCPUS_WITH_GPU * acc_count
844
878
  # Special handling to bump up memory multiplier for GPU instances
845
879
  gpu_task_memory = (float(resources.memory.strip('+')) if
846
880
  resources.memory is not None else gpu_task_cpus *
@@ -854,7 +888,8 @@ class Kubernetes(clouds.Cloud):
854
888
  accelerators=None,
855
889
  use_spot=resources.use_spot,
856
890
  region=resources.region,
857
- zone=resources.zone)
891
+ zone=resources.zone,
892
+ resources=resources)
858
893
  if not available_regions:
859
894
  return resources_utils.FeasibleResources([], [], None)
860
895
  # No fuzzy lists for Kubernetes