skypilot-nightly 1.0.0.dev20250502__py3-none-any.whl → 1.0.0.dev20251203__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (546)
  1. sky/__init__.py +22 -6
  2. sky/adaptors/aws.py +81 -16
  3. sky/adaptors/common.py +25 -2
  4. sky/adaptors/coreweave.py +278 -0
  5. sky/adaptors/do.py +8 -2
  6. sky/adaptors/gcp.py +11 -0
  7. sky/adaptors/hyperbolic.py +8 -0
  8. sky/adaptors/ibm.py +5 -2
  9. sky/adaptors/kubernetes.py +149 -18
  10. sky/adaptors/nebius.py +173 -30
  11. sky/adaptors/primeintellect.py +1 -0
  12. sky/adaptors/runpod.py +68 -0
  13. sky/adaptors/seeweb.py +183 -0
  14. sky/adaptors/shadeform.py +89 -0
  15. sky/admin_policy.py +187 -4
  16. sky/authentication.py +179 -225
  17. sky/backends/__init__.py +4 -2
  18. sky/backends/backend.py +22 -9
  19. sky/backends/backend_utils.py +1323 -397
  20. sky/backends/cloud_vm_ray_backend.py +1749 -1029
  21. sky/backends/docker_utils.py +1 -1
  22. sky/backends/local_docker_backend.py +11 -6
  23. sky/backends/task_codegen.py +633 -0
  24. sky/backends/wheel_utils.py +55 -9
  25. sky/{clouds/service_catalog → catalog}/__init__.py +21 -19
  26. sky/{clouds/service_catalog → catalog}/aws_catalog.py +27 -8
  27. sky/{clouds/service_catalog → catalog}/azure_catalog.py +10 -7
  28. sky/{clouds/service_catalog → catalog}/common.py +90 -49
  29. sky/{clouds/service_catalog → catalog}/cudo_catalog.py +8 -5
  30. sky/{clouds/service_catalog → catalog}/data_fetchers/analyze.py +1 -1
  31. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_aws.py +116 -80
  32. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_cudo.py +38 -38
  33. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_gcp.py +70 -16
  34. sky/catalog/data_fetchers/fetch_hyperbolic.py +136 -0
  35. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_lambda_cloud.py +1 -0
  36. sky/catalog/data_fetchers/fetch_nebius.py +338 -0
  37. sky/catalog/data_fetchers/fetch_runpod.py +698 -0
  38. sky/catalog/data_fetchers/fetch_seeweb.py +329 -0
  39. sky/catalog/data_fetchers/fetch_shadeform.py +142 -0
  40. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vast.py +1 -1
  41. sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_vsphere.py +1 -1
  42. sky/{clouds/service_catalog → catalog}/do_catalog.py +5 -2
  43. sky/{clouds/service_catalog → catalog}/fluidstack_catalog.py +6 -3
  44. sky/{clouds/service_catalog → catalog}/gcp_catalog.py +41 -15
  45. sky/catalog/hyperbolic_catalog.py +136 -0
  46. sky/{clouds/service_catalog → catalog}/ibm_catalog.py +9 -6
  47. sky/{clouds/service_catalog → catalog}/kubernetes_catalog.py +36 -24
  48. sky/{clouds/service_catalog → catalog}/lambda_catalog.py +9 -6
  49. sky/{clouds/service_catalog → catalog}/nebius_catalog.py +9 -7
  50. sky/{clouds/service_catalog → catalog}/oci_catalog.py +9 -6
  51. sky/{clouds/service_catalog → catalog}/paperspace_catalog.py +5 -2
  52. sky/catalog/primeintellect_catalog.py +95 -0
  53. sky/{clouds/service_catalog → catalog}/runpod_catalog.py +11 -4
  54. sky/{clouds/service_catalog → catalog}/scp_catalog.py +9 -6
  55. sky/catalog/seeweb_catalog.py +184 -0
  56. sky/catalog/shadeform_catalog.py +165 -0
  57. sky/catalog/ssh_catalog.py +167 -0
  58. sky/{clouds/service_catalog → catalog}/vast_catalog.py +6 -3
  59. sky/{clouds/service_catalog → catalog}/vsphere_catalog.py +5 -2
  60. sky/check.py +533 -185
  61. sky/cli.py +5 -5975
  62. sky/client/{cli.py → cli/command.py} +2591 -1956
  63. sky/client/cli/deprecation_utils.py +99 -0
  64. sky/client/cli/flags.py +359 -0
  65. sky/client/cli/table_utils.py +322 -0
  66. sky/client/cli/utils.py +79 -0
  67. sky/client/common.py +78 -32
  68. sky/client/oauth.py +82 -0
  69. sky/client/sdk.py +1219 -319
  70. sky/client/sdk_async.py +827 -0
  71. sky/client/service_account_auth.py +47 -0
  72. sky/cloud_stores.py +82 -3
  73. sky/clouds/__init__.py +13 -0
  74. sky/clouds/aws.py +564 -164
  75. sky/clouds/azure.py +105 -83
  76. sky/clouds/cloud.py +140 -40
  77. sky/clouds/cudo.py +68 -50
  78. sky/clouds/do.py +66 -48
  79. sky/clouds/fluidstack.py +63 -44
  80. sky/clouds/gcp.py +339 -110
  81. sky/clouds/hyperbolic.py +293 -0
  82. sky/clouds/ibm.py +70 -49
  83. sky/clouds/kubernetes.py +570 -162
  84. sky/clouds/lambda_cloud.py +74 -54
  85. sky/clouds/nebius.py +210 -81
  86. sky/clouds/oci.py +88 -66
  87. sky/clouds/paperspace.py +61 -44
  88. sky/clouds/primeintellect.py +317 -0
  89. sky/clouds/runpod.py +164 -74
  90. sky/clouds/scp.py +89 -86
  91. sky/clouds/seeweb.py +477 -0
  92. sky/clouds/shadeform.py +400 -0
  93. sky/clouds/ssh.py +263 -0
  94. sky/clouds/utils/aws_utils.py +10 -4
  95. sky/clouds/utils/gcp_utils.py +87 -11
  96. sky/clouds/utils/oci_utils.py +38 -14
  97. sky/clouds/utils/scp_utils.py +231 -167
  98. sky/clouds/vast.py +99 -77
  99. sky/clouds/vsphere.py +51 -40
  100. sky/core.py +375 -173
  101. sky/dag.py +15 -0
  102. sky/dashboard/out/404.html +1 -1
  103. sky/dashboard/out/_next/static/96_E2yl3QAiIJGOYCkSpB/_buildManifest.js +1 -0
  104. sky/dashboard/out/_next/static/chunks/1141-e6aa9ab418717c59.js +11 -0
  105. sky/dashboard/out/_next/static/chunks/1272-1ef0bf0237faccdb.js +1 -0
  106. sky/dashboard/out/_next/static/chunks/1871-7e202677c42f43fe.js +6 -0
  107. sky/dashboard/out/_next/static/chunks/2260-7703229c33c5ebd5.js +1 -0
  108. sky/dashboard/out/_next/static/chunks/2350.fab69e61bac57b23.js +1 -0
  109. sky/dashboard/out/_next/static/chunks/2369.fc20f0c2c8ed9fe7.js +15 -0
  110. sky/dashboard/out/_next/static/chunks/2755.edd818326d489a1d.js +26 -0
  111. sky/dashboard/out/_next/static/chunks/3294.20a8540fe697d5ee.js +1 -0
  112. sky/dashboard/out/_next/static/chunks/3785.7e245f318f9d1121.js +1 -0
  113. sky/dashboard/out/_next/static/chunks/3800-7b45f9fbb6308557.js +1 -0
  114. sky/dashboard/out/_next/static/chunks/3850-ff4a9a69d978632b.js +1 -0
  115. sky/dashboard/out/_next/static/chunks/3937.210053269f121201.js +1 -0
  116. sky/dashboard/out/_next/static/chunks/4725.172ede95d1b21022.js +1 -0
  117. sky/dashboard/out/_next/static/chunks/4937.a2baa2df5572a276.js +15 -0
  118. sky/dashboard/out/_next/static/chunks/5739-d67458fcb1386c92.js +8 -0
  119. sky/dashboard/out/_next/static/chunks/6130-2be46d70a38f1e82.js +1 -0
  120. sky/dashboard/out/_next/static/chunks/616-3d59f75e2ccf9321.js +39 -0
  121. sky/dashboard/out/_next/static/chunks/6212-7bd06f60ba693125.js +13 -0
  122. sky/dashboard/out/_next/static/chunks/6856-8f27d1c10c98def8.js +1 -0
  123. sky/dashboard/out/_next/static/chunks/6989-01359c57e018caa4.js +1 -0
  124. sky/dashboard/out/_next/static/chunks/6990-9146207c4567fdfd.js +1 -0
  125. sky/dashboard/out/_next/static/chunks/7359-c8d04e06886000b3.js +30 -0
  126. sky/dashboard/out/_next/static/chunks/7411-b15471acd2cba716.js +41 -0
  127. sky/dashboard/out/_next/static/chunks/7615-019513abc55b3b47.js +1 -0
  128. sky/dashboard/out/_next/static/chunks/8640.5b9475a2d18c5416.js +16 -0
  129. sky/dashboard/out/_next/static/chunks/8969-452f9d5cbdd2dc73.js +1 -0
  130. sky/dashboard/out/_next/static/chunks/9025.fa408f3242e9028d.js +6 -0
  131. sky/dashboard/out/_next/static/chunks/9353-cff34f7e773b2e2b.js +1 -0
  132. sky/dashboard/out/_next/static/chunks/9360.a536cf6b1fa42355.js +31 -0
  133. sky/dashboard/out/_next/static/chunks/9847.3aaca6bb33455140.js +30 -0
  134. sky/dashboard/out/_next/static/chunks/fd9d1056-86323a29a8f7e46a.js +1 -0
  135. sky/dashboard/out/_next/static/chunks/framework-cf60a09ccd051a10.js +33 -0
  136. sky/dashboard/out/_next/static/chunks/main-app-587214043926b3cc.js +1 -0
  137. sky/dashboard/out/_next/static/chunks/main-f15ccb73239a3bf1.js +1 -0
  138. sky/dashboard/out/_next/static/chunks/pages/_app-bde01e4a2beec258.js +34 -0
  139. sky/dashboard/out/_next/static/chunks/pages/_error-c66a4e8afc46f17b.js +1 -0
  140. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-792db96d918c98c9.js +16 -0
  141. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-abfcac9c137aa543.js +1 -0
  142. sky/dashboard/out/_next/static/chunks/pages/clusters-ee39056f9851a3ff.js +1 -0
  143. sky/dashboard/out/_next/static/chunks/pages/config-dfb9bf07b13045f4.js +1 -0
  144. sky/dashboard/out/_next/static/chunks/pages/index-444f1804401f04ea.js +1 -0
  145. sky/dashboard/out/_next/static/chunks/pages/infra/[context]-c0b5935149902e6f.js +1 -0
  146. sky/dashboard/out/_next/static/chunks/pages/infra-aed0ea19df7cf961.js +1 -0
  147. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-d66997e2bfc837cf.js +16 -0
  148. sky/dashboard/out/_next/static/chunks/pages/jobs/pools/[pool]-9faf940b253e3e06.js +21 -0
  149. sky/dashboard/out/_next/static/chunks/pages/jobs-2072b48b617989c9.js +1 -0
  150. sky/dashboard/out/_next/static/chunks/pages/users-f42674164aa73423.js +1 -0
  151. sky/dashboard/out/_next/static/chunks/pages/volumes-b84b948ff357c43e.js +1 -0
  152. sky/dashboard/out/_next/static/chunks/pages/workspace/new-3f88a1c7e86a3f86.js +1 -0
  153. sky/dashboard/out/_next/static/chunks/pages/workspaces/[name]-84a40f8c7c627fe4.js +1 -0
  154. sky/dashboard/out/_next/static/chunks/pages/workspaces-531b2f8c4bf89f82.js +1 -0
  155. sky/dashboard/out/_next/static/chunks/webpack-64e05f17bf2cf8ce.js +1 -0
  156. sky/dashboard/out/_next/static/css/0748ce22df867032.css +3 -0
  157. sky/dashboard/out/clusters/[cluster]/[job].html +1 -1
  158. sky/dashboard/out/clusters/[cluster].html +1 -1
  159. sky/dashboard/out/clusters.html +1 -1
  160. sky/dashboard/out/config.html +1 -0
  161. sky/dashboard/out/index.html +1 -1
  162. sky/dashboard/out/infra/[context].html +1 -0
  163. sky/dashboard/out/infra.html +1 -0
  164. sky/dashboard/out/jobs/[job].html +1 -1
  165. sky/dashboard/out/jobs/pools/[pool].html +1 -0
  166. sky/dashboard/out/jobs.html +1 -1
  167. sky/dashboard/out/users.html +1 -0
  168. sky/dashboard/out/volumes.html +1 -0
  169. sky/dashboard/out/workspace/new.html +1 -0
  170. sky/dashboard/out/workspaces/[name].html +1 -0
  171. sky/dashboard/out/workspaces.html +1 -0
  172. sky/data/data_utils.py +137 -1
  173. sky/data/mounting_utils.py +269 -84
  174. sky/data/storage.py +1460 -1807
  175. sky/data/storage_utils.py +43 -57
  176. sky/exceptions.py +126 -2
  177. sky/execution.py +216 -63
  178. sky/global_user_state.py +2390 -586
  179. sky/jobs/__init__.py +7 -0
  180. sky/jobs/client/sdk.py +300 -58
  181. sky/jobs/client/sdk_async.py +161 -0
  182. sky/jobs/constants.py +15 -8
  183. sky/jobs/controller.py +848 -275
  184. sky/jobs/file_content_utils.py +128 -0
  185. sky/jobs/log_gc.py +193 -0
  186. sky/jobs/recovery_strategy.py +402 -152
  187. sky/jobs/scheduler.py +314 -189
  188. sky/jobs/server/core.py +836 -255
  189. sky/jobs/server/server.py +156 -115
  190. sky/jobs/server/utils.py +136 -0
  191. sky/jobs/state.py +2109 -706
  192. sky/jobs/utils.py +1306 -215
  193. sky/logs/__init__.py +21 -0
  194. sky/logs/agent.py +108 -0
  195. sky/logs/aws.py +243 -0
  196. sky/logs/gcp.py +91 -0
  197. sky/metrics/__init__.py +0 -0
  198. sky/metrics/utils.py +453 -0
  199. sky/models.py +78 -1
  200. sky/optimizer.py +164 -70
  201. sky/provision/__init__.py +90 -4
  202. sky/provision/aws/config.py +147 -26
  203. sky/provision/aws/instance.py +136 -50
  204. sky/provision/azure/instance.py +11 -6
  205. sky/provision/common.py +13 -1
  206. sky/provision/cudo/cudo_machine_type.py +1 -1
  207. sky/provision/cudo/cudo_utils.py +14 -8
  208. sky/provision/cudo/cudo_wrapper.py +72 -71
  209. sky/provision/cudo/instance.py +10 -6
  210. sky/provision/do/instance.py +10 -6
  211. sky/provision/do/utils.py +4 -3
  212. sky/provision/docker_utils.py +140 -33
  213. sky/provision/fluidstack/instance.py +13 -8
  214. sky/provision/gcp/__init__.py +1 -0
  215. sky/provision/gcp/config.py +301 -19
  216. sky/provision/gcp/constants.py +218 -0
  217. sky/provision/gcp/instance.py +36 -8
  218. sky/provision/gcp/instance_utils.py +18 -4
  219. sky/provision/gcp/volume_utils.py +247 -0
  220. sky/provision/hyperbolic/__init__.py +12 -0
  221. sky/provision/hyperbolic/config.py +10 -0
  222. sky/provision/hyperbolic/instance.py +437 -0
  223. sky/provision/hyperbolic/utils.py +373 -0
  224. sky/provision/instance_setup.py +101 -20
  225. sky/provision/kubernetes/__init__.py +5 -0
  226. sky/provision/kubernetes/config.py +9 -52
  227. sky/provision/kubernetes/constants.py +17 -0
  228. sky/provision/kubernetes/instance.py +919 -280
  229. sky/provision/kubernetes/manifests/fusermount-server-daemonset.yaml +1 -2
  230. sky/provision/kubernetes/network.py +27 -17
  231. sky/provision/kubernetes/network_utils.py +44 -43
  232. sky/provision/kubernetes/utils.py +1221 -534
  233. sky/provision/kubernetes/volume.py +343 -0
  234. sky/provision/lambda_cloud/instance.py +22 -16
  235. sky/provision/nebius/constants.py +50 -0
  236. sky/provision/nebius/instance.py +19 -6
  237. sky/provision/nebius/utils.py +237 -137
  238. sky/provision/oci/instance.py +10 -5
  239. sky/provision/paperspace/instance.py +10 -7
  240. sky/provision/paperspace/utils.py +1 -1
  241. sky/provision/primeintellect/__init__.py +10 -0
  242. sky/provision/primeintellect/config.py +11 -0
  243. sky/provision/primeintellect/instance.py +454 -0
  244. sky/provision/primeintellect/utils.py +398 -0
  245. sky/provision/provisioner.py +117 -36
  246. sky/provision/runpod/__init__.py +5 -0
  247. sky/provision/runpod/instance.py +27 -6
  248. sky/provision/runpod/utils.py +51 -18
  249. sky/provision/runpod/volume.py +214 -0
  250. sky/provision/scp/__init__.py +15 -0
  251. sky/provision/scp/config.py +93 -0
  252. sky/provision/scp/instance.py +707 -0
  253. sky/provision/seeweb/__init__.py +11 -0
  254. sky/provision/seeweb/config.py +13 -0
  255. sky/provision/seeweb/instance.py +812 -0
  256. sky/provision/shadeform/__init__.py +11 -0
  257. sky/provision/shadeform/config.py +12 -0
  258. sky/provision/shadeform/instance.py +351 -0
  259. sky/provision/shadeform/shadeform_utils.py +83 -0
  260. sky/provision/ssh/__init__.py +18 -0
  261. sky/provision/vast/instance.py +13 -8
  262. sky/provision/vast/utils.py +10 -7
  263. sky/provision/volume.py +164 -0
  264. sky/provision/vsphere/common/ssl_helper.py +1 -1
  265. sky/provision/vsphere/common/vapiconnect.py +2 -1
  266. sky/provision/vsphere/common/vim_utils.py +4 -4
  267. sky/provision/vsphere/instance.py +15 -10
  268. sky/provision/vsphere/vsphere_utils.py +17 -20
  269. sky/py.typed +0 -0
  270. sky/resources.py +845 -119
  271. sky/schemas/__init__.py +0 -0
  272. sky/schemas/api/__init__.py +0 -0
  273. sky/schemas/api/responses.py +227 -0
  274. sky/schemas/db/README +4 -0
  275. sky/schemas/db/env.py +90 -0
  276. sky/schemas/db/global_user_state/001_initial_schema.py +124 -0
  277. sky/schemas/db/global_user_state/002_add_workspace_to_cluster_history.py +35 -0
  278. sky/schemas/db/global_user_state/003_fix_initial_revision.py +61 -0
  279. sky/schemas/db/global_user_state/004_is_managed.py +34 -0
  280. sky/schemas/db/global_user_state/005_cluster_event.py +32 -0
  281. sky/schemas/db/global_user_state/006_provision_log.py +41 -0
  282. sky/schemas/db/global_user_state/007_cluster_event_request_id.py +34 -0
  283. sky/schemas/db/global_user_state/008_skylet_ssh_tunnel_metadata.py +34 -0
  284. sky/schemas/db/global_user_state/009_last_activity_and_launched_at.py +89 -0
  285. sky/schemas/db/global_user_state/010_save_ssh_key.py +66 -0
  286. sky/schemas/db/global_user_state/011_is_ephemeral.py +34 -0
  287. sky/schemas/db/kv_cache/001_initial_schema.py +29 -0
  288. sky/schemas/db/script.py.mako +28 -0
  289. sky/schemas/db/serve_state/001_initial_schema.py +67 -0
  290. sky/schemas/db/serve_state/002_yaml_content.py +34 -0
  291. sky/schemas/db/skypilot_config/001_initial_schema.py +30 -0
  292. sky/schemas/db/spot_jobs/001_initial_schema.py +97 -0
  293. sky/schemas/db/spot_jobs/002_cluster_pool.py +42 -0
  294. sky/schemas/db/spot_jobs/003_pool_hash.py +34 -0
  295. sky/schemas/db/spot_jobs/004_job_file_contents.py +42 -0
  296. sky/schemas/db/spot_jobs/005_logs_gc.py +38 -0
  297. sky/schemas/db/spot_jobs/006_controller_pid_started_at.py +34 -0
  298. sky/schemas/db/spot_jobs/007_config_file_content.py +34 -0
  299. sky/schemas/generated/__init__.py +0 -0
  300. sky/schemas/generated/autostopv1_pb2.py +36 -0
  301. sky/schemas/generated/autostopv1_pb2.pyi +43 -0
  302. sky/schemas/generated/autostopv1_pb2_grpc.py +146 -0
  303. sky/schemas/generated/jobsv1_pb2.py +86 -0
  304. sky/schemas/generated/jobsv1_pb2.pyi +254 -0
  305. sky/schemas/generated/jobsv1_pb2_grpc.py +542 -0
  306. sky/schemas/generated/managed_jobsv1_pb2.py +76 -0
  307. sky/schemas/generated/managed_jobsv1_pb2.pyi +278 -0
  308. sky/schemas/generated/managed_jobsv1_pb2_grpc.py +278 -0
  309. sky/schemas/generated/servev1_pb2.py +58 -0
  310. sky/schemas/generated/servev1_pb2.pyi +115 -0
  311. sky/schemas/generated/servev1_pb2_grpc.py +322 -0
  312. sky/serve/autoscalers.py +357 -5
  313. sky/serve/client/impl.py +310 -0
  314. sky/serve/client/sdk.py +47 -139
  315. sky/serve/client/sdk_async.py +130 -0
  316. sky/serve/constants.py +12 -9
  317. sky/serve/controller.py +68 -17
  318. sky/serve/load_balancer.py +106 -60
  319. sky/serve/load_balancing_policies.py +116 -2
  320. sky/serve/replica_managers.py +434 -249
  321. sky/serve/serve_rpc_utils.py +179 -0
  322. sky/serve/serve_state.py +569 -257
  323. sky/serve/serve_utils.py +775 -265
  324. sky/serve/server/core.py +66 -711
  325. sky/serve/server/impl.py +1093 -0
  326. sky/serve/server/server.py +21 -18
  327. sky/serve/service.py +192 -89
  328. sky/serve/service_spec.py +144 -20
  329. sky/serve/spot_placer.py +3 -0
  330. sky/server/auth/__init__.py +0 -0
  331. sky/server/auth/authn.py +50 -0
  332. sky/server/auth/loopback.py +38 -0
  333. sky/server/auth/oauth2_proxy.py +202 -0
  334. sky/server/common.py +478 -182
  335. sky/server/config.py +85 -23
  336. sky/server/constants.py +44 -6
  337. sky/server/daemons.py +295 -0
  338. sky/server/html/token_page.html +185 -0
  339. sky/server/metrics.py +160 -0
  340. sky/server/middleware_utils.py +166 -0
  341. sky/server/requests/executor.py +558 -138
  342. sky/server/requests/payloads.py +364 -24
  343. sky/server/requests/preconditions.py +21 -17
  344. sky/server/requests/process.py +112 -29
  345. sky/server/requests/request_names.py +121 -0
  346. sky/server/requests/requests.py +822 -226
  347. sky/server/requests/serializers/decoders.py +82 -31
  348. sky/server/requests/serializers/encoders.py +140 -22
  349. sky/server/requests/threads.py +117 -0
  350. sky/server/rest.py +455 -0
  351. sky/server/server.py +1309 -285
  352. sky/server/state.py +20 -0
  353. sky/server/stream_utils.py +327 -61
  354. sky/server/uvicorn.py +217 -3
  355. sky/server/versions.py +270 -0
  356. sky/setup_files/MANIFEST.in +11 -1
  357. sky/setup_files/alembic.ini +160 -0
  358. sky/setup_files/dependencies.py +139 -31
  359. sky/setup_files/setup.py +44 -42
  360. sky/sky_logging.py +114 -7
  361. sky/skylet/attempt_skylet.py +106 -24
  362. sky/skylet/autostop_lib.py +129 -8
  363. sky/skylet/configs.py +29 -20
  364. sky/skylet/constants.py +216 -25
  365. sky/skylet/events.py +101 -21
  366. sky/skylet/job_lib.py +345 -164
  367. sky/skylet/log_lib.py +297 -18
  368. sky/skylet/log_lib.pyi +44 -1
  369. sky/skylet/providers/ibm/node_provider.py +12 -8
  370. sky/skylet/providers/ibm/vpc_provider.py +13 -12
  371. sky/skylet/ray_patches/__init__.py +17 -3
  372. sky/skylet/ray_patches/autoscaler.py.diff +18 -0
  373. sky/skylet/ray_patches/cli.py.diff +19 -0
  374. sky/skylet/ray_patches/command_runner.py.diff +17 -0
  375. sky/skylet/ray_patches/log_monitor.py.diff +20 -0
  376. sky/skylet/ray_patches/resource_demand_scheduler.py.diff +32 -0
  377. sky/skylet/ray_patches/updater.py.diff +18 -0
  378. sky/skylet/ray_patches/worker.py.diff +41 -0
  379. sky/skylet/runtime_utils.py +21 -0
  380. sky/skylet/services.py +568 -0
  381. sky/skylet/skylet.py +72 -4
  382. sky/skylet/subprocess_daemon.py +104 -29
  383. sky/skypilot_config.py +506 -99
  384. sky/ssh_node_pools/__init__.py +1 -0
  385. sky/ssh_node_pools/core.py +135 -0
  386. sky/ssh_node_pools/server.py +233 -0
  387. sky/task.py +685 -163
  388. sky/templates/aws-ray.yml.j2 +11 -3
  389. sky/templates/azure-ray.yml.j2 +2 -1
  390. sky/templates/cudo-ray.yml.j2 +1 -0
  391. sky/templates/do-ray.yml.j2 +2 -1
  392. sky/templates/fluidstack-ray.yml.j2 +1 -0
  393. sky/templates/gcp-ray.yml.j2 +62 -1
  394. sky/templates/hyperbolic-ray.yml.j2 +68 -0
  395. sky/templates/ibm-ray.yml.j2 +2 -1
  396. sky/templates/jobs-controller.yaml.j2 +27 -24
  397. sky/templates/kubernetes-loadbalancer.yml.j2 +2 -0
  398. sky/templates/kubernetes-ray.yml.j2 +611 -50
  399. sky/templates/lambda-ray.yml.j2 +2 -1
  400. sky/templates/nebius-ray.yml.j2 +34 -12
  401. sky/templates/oci-ray.yml.j2 +1 -0
  402. sky/templates/paperspace-ray.yml.j2 +2 -1
  403. sky/templates/primeintellect-ray.yml.j2 +72 -0
  404. sky/templates/runpod-ray.yml.j2 +10 -1
  405. sky/templates/scp-ray.yml.j2 +4 -50
  406. sky/templates/seeweb-ray.yml.j2 +171 -0
  407. sky/templates/shadeform-ray.yml.j2 +73 -0
  408. sky/templates/sky-serve-controller.yaml.j2 +22 -2
  409. sky/templates/vast-ray.yml.j2 +1 -0
  410. sky/templates/vsphere-ray.yml.j2 +1 -0
  411. sky/templates/websocket_proxy.py +212 -37
  412. sky/usage/usage_lib.py +31 -15
  413. sky/users/__init__.py +0 -0
  414. sky/users/model.conf +15 -0
  415. sky/users/permission.py +397 -0
  416. sky/users/rbac.py +121 -0
  417. sky/users/server.py +720 -0
  418. sky/users/token_service.py +218 -0
  419. sky/utils/accelerator_registry.py +35 -5
  420. sky/utils/admin_policy_utils.py +84 -38
  421. sky/utils/annotations.py +38 -5
  422. sky/utils/asyncio_utils.py +78 -0
  423. sky/utils/atomic.py +1 -1
  424. sky/utils/auth_utils.py +153 -0
  425. sky/utils/benchmark_utils.py +60 -0
  426. sky/utils/cli_utils/status_utils.py +159 -86
  427. sky/utils/cluster_utils.py +31 -9
  428. sky/utils/command_runner.py +354 -68
  429. sky/utils/command_runner.pyi +93 -3
  430. sky/utils/common.py +35 -8
  431. sky/utils/common_utils.py +314 -91
  432. sky/utils/config_utils.py +74 -5
  433. sky/utils/context.py +403 -0
  434. sky/utils/context_utils.py +242 -0
  435. sky/utils/controller_utils.py +383 -89
  436. sky/utils/dag_utils.py +31 -12
  437. sky/utils/db/__init__.py +0 -0
  438. sky/utils/db/db_utils.py +485 -0
  439. sky/utils/db/kv_cache.py +149 -0
  440. sky/utils/db/migration_utils.py +137 -0
  441. sky/utils/directory_utils.py +12 -0
  442. sky/utils/env_options.py +13 -0
  443. sky/utils/git.py +567 -0
  444. sky/utils/git_clone.sh +460 -0
  445. sky/utils/infra_utils.py +195 -0
  446. sky/utils/kubernetes/cleanup-tunnel.sh +62 -0
  447. sky/utils/kubernetes/config_map_utils.py +133 -0
  448. sky/utils/kubernetes/create_cluster.sh +15 -29
  449. sky/utils/kubernetes/delete_cluster.sh +10 -7
  450. sky/utils/kubernetes/deploy_ssh_node_pools.py +1177 -0
  451. sky/utils/kubernetes/exec_kubeconfig_converter.py +22 -31
  452. sky/utils/kubernetes/generate_kind_config.py +6 -66
  453. sky/utils/kubernetes/generate_kubeconfig.sh +4 -1
  454. sky/utils/kubernetes/gpu_labeler.py +18 -8
  455. sky/utils/kubernetes/k8s_gpu_labeler_job.yaml +2 -1
  456. sky/utils/kubernetes/k8s_gpu_labeler_setup.yaml +16 -16
  457. sky/utils/kubernetes/kubernetes_deploy_utils.py +284 -114
  458. sky/utils/kubernetes/rsync_helper.sh +11 -3
  459. sky/utils/kubernetes/ssh-tunnel.sh +379 -0
  460. sky/utils/kubernetes/ssh_utils.py +221 -0
  461. sky/utils/kubernetes_enums.py +8 -15
  462. sky/utils/lock_events.py +94 -0
  463. sky/utils/locks.py +416 -0
  464. sky/utils/log_utils.py +82 -107
  465. sky/utils/perf_utils.py +22 -0
  466. sky/utils/resource_checker.py +298 -0
  467. sky/utils/resources_utils.py +249 -32
  468. sky/utils/rich_utils.py +217 -39
  469. sky/utils/schemas.py +955 -160
  470. sky/utils/serialize_utils.py +16 -0
  471. sky/utils/status_lib.py +10 -0
  472. sky/utils/subprocess_utils.py +29 -15
  473. sky/utils/tempstore.py +70 -0
  474. sky/utils/thread_utils.py +91 -0
  475. sky/utils/timeline.py +26 -53
  476. sky/utils/ux_utils.py +84 -15
  477. sky/utils/validator.py +11 -1
  478. sky/utils/volume.py +165 -0
  479. sky/utils/yaml_utils.py +111 -0
  480. sky/volumes/__init__.py +13 -0
  481. sky/volumes/client/__init__.py +0 -0
  482. sky/volumes/client/sdk.py +150 -0
  483. sky/volumes/server/__init__.py +0 -0
  484. sky/volumes/server/core.py +270 -0
  485. sky/volumes/server/server.py +124 -0
  486. sky/volumes/volume.py +215 -0
  487. sky/workspaces/__init__.py +0 -0
  488. sky/workspaces/core.py +655 -0
  489. sky/workspaces/server.py +101 -0
  490. sky/workspaces/utils.py +56 -0
  491. sky_templates/README.md +3 -0
  492. sky_templates/__init__.py +3 -0
  493. sky_templates/ray/__init__.py +0 -0
  494. sky_templates/ray/start_cluster +183 -0
  495. sky_templates/ray/stop_cluster +75 -0
  496. skypilot_nightly-1.0.0.dev20251203.dist-info/METADATA +676 -0
  497. skypilot_nightly-1.0.0.dev20251203.dist-info/RECORD +611 -0
  498. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/WHEEL +1 -1
  499. skypilot_nightly-1.0.0.dev20251203.dist-info/top_level.txt +2 -0
  500. sky/benchmark/benchmark_state.py +0 -256
  501. sky/benchmark/benchmark_utils.py +0 -641
  502. sky/clouds/service_catalog/constants.py +0 -7
  503. sky/dashboard/out/_next/static/GWvVBSCS7FmUiVmjaL1a7/_buildManifest.js +0 -1
  504. sky/dashboard/out/_next/static/chunks/236-2db3ee3fba33dd9e.js +0 -6
  505. sky/dashboard/out/_next/static/chunks/312-c3c8845990db8ffc.js +0 -15
  506. sky/dashboard/out/_next/static/chunks/37-0a572fe0dbb89c4d.js +0 -6
  507. sky/dashboard/out/_next/static/chunks/678-206dddca808e6d16.js +0 -59
  508. sky/dashboard/out/_next/static/chunks/845-9e60713e0c441abc.js +0 -1
  509. sky/dashboard/out/_next/static/chunks/979-7bf73a4c7cea0f5c.js +0 -1
  510. sky/dashboard/out/_next/static/chunks/fd9d1056-2821b0f0cabcd8bd.js +0 -1
  511. sky/dashboard/out/_next/static/chunks/framework-87d061ee6ed71b28.js +0 -33
  512. sky/dashboard/out/_next/static/chunks/main-app-241eb28595532291.js +0 -1
  513. sky/dashboard/out/_next/static/chunks/main-e0e2335212e72357.js +0 -1
  514. sky/dashboard/out/_next/static/chunks/pages/_app-e6b013bc3f77ad60.js +0 -1
  515. sky/dashboard/out/_next/static/chunks/pages/_error-1be831200e60c5c0.js +0 -1
  516. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]/[job]-6ac338bc2239cb45.js +0 -1
  517. sky/dashboard/out/_next/static/chunks/pages/clusters/[cluster]-f383db7389368ea7.js +0 -1
  518. sky/dashboard/out/_next/static/chunks/pages/clusters-a93b93e10b8b074e.js +0 -1
  519. sky/dashboard/out/_next/static/chunks/pages/index-f9f039532ca8cbc4.js +0 -1
  520. sky/dashboard/out/_next/static/chunks/pages/jobs/[job]-1c519e1afc523dc9.js +0 -1
  521. sky/dashboard/out/_next/static/chunks/pages/jobs-a75029b67aab6a2e.js +0 -1
  522. sky/dashboard/out/_next/static/chunks/webpack-830f59b8404e96b8.js +0 -1
  523. sky/dashboard/out/_next/static/css/c6933bbb2ce7f4dd.css +0 -3
  524. sky/jobs/dashboard/dashboard.py +0 -223
  525. sky/jobs/dashboard/static/favicon.ico +0 -0
  526. sky/jobs/dashboard/templates/index.html +0 -831
  527. sky/jobs/server/dashboard_utils.py +0 -69
  528. sky/skylet/providers/scp/__init__.py +0 -2
  529. sky/skylet/providers/scp/config.py +0 -149
  530. sky/skylet/providers/scp/node_provider.py +0 -578
  531. sky/templates/kubernetes-ssh-jump.yml.j2 +0 -94
  532. sky/utils/db_utils.py +0 -100
  533. sky/utils/kubernetes/deploy_remote_cluster.sh +0 -308
  534. sky/utils/kubernetes/ssh_jump_lifecycle_manager.py +0 -191
  535. skypilot_nightly-1.0.0.dev20250502.dist-info/METADATA +0 -361
  536. skypilot_nightly-1.0.0.dev20250502.dist-info/RECORD +0 -396
  537. skypilot_nightly-1.0.0.dev20250502.dist-info/top_level.txt +0 -1
  538. /sky/{clouds/service_catalog → catalog}/config.py +0 -0
  539. /sky/{benchmark → catalog/data_fetchers}/__init__.py +0 -0
  540. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_azure.py +0 -0
  541. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_fluidstack.py +0 -0
  542. /sky/{clouds/service_catalog → catalog}/data_fetchers/fetch_ibm.py +0 -0
  543. /sky/{clouds/service_catalog/data_fetchers → client/cli}/__init__.py +0 -0
  544. /sky/dashboard/out/_next/static/{GWvVBSCS7FmUiVmjaL1a7 → 96_E2yl3QAiIJGOYCkSpB}/_ssgManifest.js +0 -0
  545. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/entry_points.txt +0 -0
  546. {skypilot_nightly-1.0.0.dev20250502.dist-info → skypilot_nightly-1.0.0.dev20251203.dist-info}/licenses/LICENSE +0 -0
sky/core.py CHANGED
@@ -1,6 +1,4 @@
1
1
  """SDK functions for cluster/job management."""
2
- import os
3
- import shlex
4
2
  import typing
5
3
  from typing import Any, Dict, List, Optional, Tuple, Union
6
4
 
@@ -8,7 +6,7 @@ import colorama
8
6
 
9
7
  from sky import admin_policy
10
8
  from sky import backends
11
- from sky import check as sky_check
9
+ from sky import catalog
12
10
  from sky import clouds
13
11
  from sky import dag as dag_lib
14
12
  from sky import data
@@ -17,21 +15,26 @@ from sky import global_user_state
17
15
  from sky import models
18
16
  from sky import optimizer
19
17
  from sky import sky_logging
18
+ from sky import skypilot_config
20
19
  from sky import task as task_lib
20
+ from sky.adaptors import common as adaptors_common
21
21
  from sky.backends import backend_utils
22
+ from sky.backends import cloud_vm_ray_backend
22
23
  from sky.clouds import cloud as sky_cloud
23
- from sky.clouds import service_catalog
24
24
  from sky.jobs.server import core as managed_jobs_core
25
25
  from sky.provision.kubernetes import constants as kubernetes_constants
26
26
  from sky.provision.kubernetes import utils as kubernetes_utils
27
+ from sky.schemas.api import responses
28
+ from sky.server.requests import request_names
29
+ from sky.skylet import autostop_lib
27
30
  from sky.skylet import constants
28
31
  from sky.skylet import job_lib
29
- from sky.skylet import log_lib
30
32
  from sky.usage import usage_lib
31
33
  from sky.utils import admin_policy_utils
32
34
  from sky.utils import common
33
35
  from sky.utils import common_utils
34
36
  from sky.utils import controller_utils
37
+ from sky.utils import resources_utils
35
38
  from sky.utils import rich_utils
36
39
  from sky.utils import status_lib
37
40
  from sky.utils import subprocess_utils
@@ -40,6 +43,9 @@ from sky.utils.kubernetes import kubernetes_deploy_utils
40
43
 
41
44
  if typing.TYPE_CHECKING:
42
45
  from sky import resources as resources_lib
46
+ from sky.schemas.generated import jobsv1_pb2
47
+ else:
48
+ jobsv1_pb2 = adaptors_common.LazyImport('sky.schemas.generated.jobsv1_pb2')
43
49
 
44
50
  logger = sky_logging.init_logger(__name__)
45
51
 
@@ -78,14 +84,15 @@ def optimize(
78
84
  # is shown on `sky launch`. The optimizer is also invoked during failover,
79
85
  # but we do not apply the admin policy there. We should apply the admin
80
86
  # policy in the optimizer, but that will require some refactoring.
81
- dag, _ = admin_policy_utils.apply(
82
- dag,
83
- use_mutated_config_in_current_request=True,
84
- request_options=request_options)
85
- return optimizer.Optimizer.optimize(dag=dag,
86
- minimize=minimize,
87
- blocked_resources=blocked_resources,
88
- quiet=quiet)
87
+ with admin_policy_utils.apply_and_use_config_in_current_request(
88
+ dag,
89
+ request_name=request_names.AdminPolicyRequestName.OPTIMIZE,
90
+ request_options=request_options) as dag:
91
+ dag.resolve_and_validate_volumes()
92
+ return optimizer.Optimizer.optimize(dag=dag,
93
+ minimize=minimize,
94
+ blocked_resources=blocked_resources,
95
+ quiet=quiet)
89
96
 
90
97
 
91
98
  @usage_lib.entrypoint
@@ -93,7 +100,10 @@ def status(
93
100
  cluster_names: Optional[Union[str, List[str]]] = None,
94
101
  refresh: common.StatusRefreshMode = common.StatusRefreshMode.NONE,
95
102
  all_users: bool = False,
96
- ) -> List[Dict[str, Any]]:
103
+ include_credentials: bool = False,
104
+ summary_response: bool = False,
105
+ include_handle: bool = True,
106
+ ) -> List[responses.StatusResponse]:
97
107
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
98
108
  """Gets cluster statuses.
99
109
 
@@ -160,22 +170,37 @@ def status(
160
170
  provided, all clusters will be queried.
161
171
  refresh: whether to query the latest cluster statuses from the cloud
162
172
  provider(s).
173
+ include_credentials: whether to fetch ssh credentials for cluster
174
+ (credentials field in responses.StatusResponse)
163
175
 
164
176
  Returns:
165
177
  A list of responses.StatusResponse objects, each containing the information of a
166
178
  cluster. If a cluster is found to be terminated or not found, it will
167
179
  be omitted from the returned list.
168
180
  """
169
- clusters = backend_utils.get_clusters(refresh=refresh,
170
- cluster_names=cluster_names,
171
- all_users=all_users)
172
- return clusters
181
+ clusters = backend_utils.get_clusters(
182
+ refresh=refresh,
183
+ cluster_names=cluster_names,
184
+ all_users=all_users,
185
+ include_credentials=include_credentials,
186
+ summary_response=summary_response,
187
+ include_handle=include_handle)
188
+
189
+ status_responses = []
190
+ for cluster in clusters:
191
+ try:
192
+ status_responses.append(
193
+ responses.StatusResponse.model_validate(cluster))
194
+ except Exception as e: # pylint: disable=broad-except
195
+ logger.warning('Failed to validate status responses for cluster '
196
+ f'{cluster.get("name")}: {e}')
197
+ return status_responses
173
198
 
174
199
 
175
200
  def status_kubernetes(
176
201
  ) -> Tuple[List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
177
202
  List['kubernetes_utils.KubernetesSkyPilotClusterInfoPayload'],
178
- List[Dict[str, Any]], Optional[str]]:
203
+ List[responses.ManagedJobRecord], Optional[str]]:
179
204
  """Gets all SkyPilot clusters and jobs in the Kubernetes cluster.
180
205
 
181
206
  Managed jobs and services are also included in the clusters returned.
@@ -250,6 +275,7 @@ all_clusters, unmanaged_clusters, all_jobs, context
250
275
  kubernetes_utils.KubernetesSkyPilotClusterInfoPayload.from_cluster(c)
251
276
  for c in unmanaged_clusters
252
277
  ]
278
+ all_jobs = [responses.ManagedJobRecord(**job) for job in all_jobs]
253
279
  return all_clusters, unmanaged_clusters, all_jobs, context
254
280
 
255
281
 
@@ -262,22 +288,26 @@ def endpoints(cluster: str,
262
288
  port: The port number to get the endpoint for. If None, endpoints
263
289
  for all ports are returned.
264
290
 
265
- Returns: A dictionary of port numbers to endpoints. If endpoint is None,
291
+ Returns: A dictionary of port numbers to endpoints. If port is None,
266
292
  the dictionary will contain all ports:endpoints exposed on the cluster.
267
293
 
268
294
  Raises:
269
- ValueError: if the cluster is not UP or the endpoint is not exposed.
295
+ ValueError: if the cluster is not UP or the endpoint is not exposed.
270
296
  RuntimeError: if the cluster has no ports to be exposed or no endpoints
271
297
  are exposed yet.
272
298
  """
273
299
  with rich_utils.safe_status(
274
300
  ux_utils.spinner_message(
275
301
  f'Fetching endpoints for cluster {cluster}')):
276
- return backend_utils.get_endpoints(cluster=cluster, port=port)
302
+ result = backend_utils.get_endpoints(cluster=cluster, port=port)
303
+ return result
277
304
 
278
305
 
279
306
  @usage_lib.entrypoint
280
- def cost_report() -> List[Dict[str, Any]]:
307
+ def cost_report(
308
+ days: Optional[int] = None,
309
+ dashboard_summary_response: bool = False,
310
+ cluster_hashes: Optional[List[str]] = None) -> List[Dict[str, Any]]:
281
311
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
282
312
  """Get all cluster cost reports, including those that have been downed.
283
313
 
@@ -295,6 +325,13 @@ def cost_report() -> List[Dict[str, Any]]:
295
325
  'cluster_hash': (str) unique hash identifying cluster,
296
326
  'usage_intervals': (List[Tuple[int, int]]) cluster usage times,
297
327
  'total_cost': (float) cost given resources and usage intervals,
328
+ 'cloud': (str) cloud of the cluster,
329
+ 'region': (str) region of the cluster,
330
+ 'cpus': (str) number of vCPUs of the cluster,
331
+ 'memory': (str) memory of the cluster,
332
+ 'accelerators': (str) accelerators of the cluster,
333
+ 'resources_str': (str) resources string of the cluster,
334
+ 'resources_str_full': (str) full resources string of the cluster,
298
335
  }
299
336
 
300
337
  The estimated cost column indicates price for the cluster based on the type
@@ -304,25 +341,103 @@ def cost_report() -> List[Dict[str, Any]]:
304
341
  cache of the cluster status, and may not be accurate for the cluster with
305
342
  autostop/use_spot set or terminated/stopped on the cloud console.
306
343
 
344
+ Args:
345
+ days: Number of days to look back from now. Active clusters are always
346
+ included. Historical clusters are only included if they were last
347
+ used within the past 'days' days. Defaults to 30 days.
348
+
307
349
  Returns:
308
350
  A list of dicts, with each dict containing the cost information of a
309
351
  cluster.
310
352
  """
311
- cluster_reports = global_user_state.get_clusters_from_history()
353
+ if days is None:
354
+ days = constants.COST_REPORT_DEFAULT_DAYS
355
+
356
+ abbreviate_response = dashboard_summary_response and cluster_hashes is None
357
+
358
+ cluster_reports = global_user_state.get_clusters_from_history(
359
+ days=days,
360
+ abbreviate_response=abbreviate_response,
361
+ cluster_hashes=cluster_hashes)
362
+ logger.debug(
363
+ f'{len(cluster_reports)} clusters found from history with {days} days.')
364
+
365
+ def _process_cluster_report(
366
+ cluster_report: Dict[str, Any]) -> Dict[str, Any]:
367
+ """Process cluster report by calculating cost and adding fields."""
368
+ # Make a copy to avoid modifying the original
369
+ report = cluster_report.copy()
312
370
 
313
- def get_total_cost(cluster_report: dict) -> float:
314
- duration = cluster_report['duration']
315
- launched_nodes = cluster_report['num_nodes']
316
- launched_resources = cluster_report['resources']
371
+ def get_total_cost(cluster_report: dict) -> float:
372
+ duration = cluster_report['duration']
373
+ launched_nodes = cluster_report['num_nodes']
374
+ launched_resources = cluster_report['resources']
317
375
 
318
- cost = (launched_resources.get_cost(duration) * launched_nodes)
319
- return cost
376
+ cost = (launched_resources.get_cost(duration) * launched_nodes)
377
+ return cost
320
378
 
321
- for cluster_report in cluster_reports:
322
- cluster_report['total_cost'] = get_total_cost(cluster_report)
323
- cluster_report['cloud'] = str(cluster_report['resources'].cloud)
324
- cluster_report['accelerators'] = cluster_report[
325
- 'resources'].accelerators
379
+ try:
380
+ report['total_cost'] = get_total_cost(report)
381
+ except Exception as e: # pylint: disable=broad-except
382
+ # Ok to skip the total cost as this is just for display purposes.
383
+ logger.warning(f'Failed to get total cost for cluster '
384
+ f'{report["name"]}: {str(e)}')
385
+ report['total_cost'] = 0.0
386
+
387
+ return report
388
+
389
+ # Process clusters in parallel
390
+ if not cluster_reports:
391
+ return []
392
+
393
+ if not abbreviate_response:
394
+ cluster_reports = subprocess_utils.run_in_parallel(
395
+ _process_cluster_report, cluster_reports)
396
+
397
+ def _update_record_with_resources(record: Dict[str, Any]) -> None:
398
+ """Add resource fields for dashboard compatibility."""
399
+ if record is None:
400
+ return
401
+ resources = record.get('resources')
402
+ if resources is None:
403
+ return
404
+ if not dashboard_summary_response:
405
+ fields = ['cloud', 'region', 'cpus', 'memory', 'accelerators']
406
+ else:
407
+ fields = ['cloud']
408
+ for field in fields:
409
+ try:
410
+ record[field] = str(getattr(resources, field))
411
+ except Exception as e: # pylint: disable=broad-except
412
+ # Ok to skip the fields as this is just for display
413
+ # purposes.
414
+ logger.debug(f'Failed to get resources.{field} for cluster '
415
+ f'{record["name"]}: {str(e)}')
416
+ record[field] = None
417
+
418
+ # Add resources_str and resources_str_full for dashboard
419
+ # compatibility
420
+ num_nodes = record.get('num_nodes', 1)
421
+ try:
422
+ resource_str_simple, resource_str_full = (
423
+ resources_utils.format_resource(resources,
424
+ simplified_only=False))
425
+ record['resources_str'] = f'{num_nodes}x{resource_str_simple}'
426
+ record['resources_str_full'] = f'{num_nodes}x{resource_str_full}'
427
+ except Exception as e: # pylint: disable=broad-except
428
+ logger.debug(f'Failed to get resources_str for cluster '
429
+ f'{record["name"]}: {str(e)}')
430
+ for field in fields:
431
+ record[field] = None
432
+ record['resources_str'] = '-'
433
+ record['resources_str_full'] = '-'
434
+
435
+ for report in cluster_reports:
436
+ _update_record_with_resources(report)
437
+ if dashboard_summary_response:
438
+ report.pop('usage_intervals')
439
+ report.pop('user_hash')
440
+ report.pop('resources')
326
441
 
327
442
  return cluster_reports
328
443
 
@@ -330,6 +445,8 @@ def cost_report() -> List[Dict[str, Any]]:
330
445
  def _start(
331
446
  cluster_name: str,
332
447
  idle_minutes_to_autostop: Optional[int] = None,
448
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = (
449
+ autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR),
333
450
  retry_until_up: bool = False,
334
451
  down: bool = False, # pylint: disable=redefined-outer-name
335
452
  force: bool = False,
@@ -369,9 +486,44 @@ def _start(
369
486
  'supported when starting SkyPilot controllers. To '
370
487
  f'fix: omit the {arguments_str} to use the '
371
488
  f'default autostop settings from config.')
372
- idle_minutes_to_autostop, down = (
373
- controller_utils.get_controller_autostop_config(
374
- controller=controller))
489
+
490
+ # Get the controller resources, from which we extract the correct autostop
491
+ # config.
492
+ controller_resources = controller_utils.get_controller_resources(
493
+ controller, [])
494
+ # All resources should have the same autostop config.
495
+ controller_autostop_config = list(
496
+ controller_resources)[0].autostop_config
497
+ if (controller_autostop_config is not None and
498
+ controller_autostop_config.enabled):
499
+ idle_minutes_to_autostop = controller_autostop_config.idle_minutes
500
+ down = controller_autostop_config.down
501
+ else:
502
+ # For non-controller clusters, restore autostop configuration from
503
+ # database if not explicitly provided.
504
+ if idle_minutes_to_autostop is None:
505
+ cluster_record = global_user_state.get_cluster_from_name(
506
+ cluster_name, include_user_info=False, summary_response=True)
507
+ if cluster_record is not None:
508
+ stored_autostop = cluster_record.get('autostop', -1)
509
+ stored_to_down = cluster_record.get('to_down', False)
510
+ # Restore autostop if it was previously set (autostop > 0)
511
+ if stored_autostop > 0:
512
+ logger.warning(f'Restoring cluster {cluster_name!r} with '
513
+ f'autostop set to {stored_autostop} minutes'
514
+ f'. To turn off autostop, run: '
515
+ f'`sky autostop {cluster_name} --cancel`')
516
+ idle_minutes_to_autostop = stored_autostop
517
+ # Only restore 'down' if it was explicitly set and we're
518
+ # restoring autostop
519
+ if stored_to_down:
520
+ down = stored_to_down
521
+ elif stored_autostop == 0:
522
+ logger.warning(
523
+ f'Autostop was previously set to 0 minutes '
524
+ f'for cluster {cluster_name!r} so it will '
525
+ 'not be restored. To turn on autostop, run: '
526
+ f'`sky autostop {cluster_name} -i <minutes>`')
375
527
 
376
528
  usage_lib.record_cluster_name_for_current_operation(cluster_name)
377
529
 
@@ -391,7 +543,7 @@ def _start(
391
543
  all_file_mounts=None,
392
544
  storage_mounts=storage_mounts)
393
545
  if idle_minutes_to_autostop is not None:
394
- backend.set_autostop(handle, idle_minutes_to_autostop, down=down)
546
+ backend.set_autostop(handle, idle_minutes_to_autostop, wait_for, down)
395
547
  return handle
396
548
 
397
549
 
@@ -399,6 +551,8 @@ def _start(
399
551
  def start(
400
552
  cluster_name: str,
401
553
  idle_minutes_to_autostop: Optional[int] = None,
554
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = (
555
+ autostop_lib.DEFAULT_AUTOSTOP_WAIT_FOR),
402
556
  retry_until_up: bool = False,
403
557
  down: bool = False, # pylint: disable=redefined-outer-name
404
558
  force: bool = False,
@@ -453,6 +607,7 @@ def start(
453
607
  '`idle_minutes_to_autostop` must be set if `down` is True.')
454
608
  return _start(cluster_name,
455
609
  idle_minutes_to_autostop,
610
+ wait_for,
456
611
  retry_until_up,
457
612
  down,
458
613
  force=force)
@@ -463,7 +618,10 @@ def _stop_not_supported_message(resources: 'resources_lib.Resources') -> str:
463
618
  message = ('Stopping spot instances is currently not supported on '
464
619
  f'{resources.cloud}')
465
620
  else:
466
- message = f'Stopping is currently not supported for {resources}'
621
+ cloud_name = resources.cloud.display_name(
622
+ ) if resources.cloud else resources.cloud
623
+ message = ('Stopping is currently not supported for '
624
+ f'{cloud_name}')
467
625
  return message
468
626
 
469
627
 
@@ -539,6 +697,11 @@ def stop(cluster_name: str, purge: bool = False) -> None:
539
697
  raise exceptions.ClusterDoesNotExist(
540
698
  f'Cluster {cluster_name!r} does not exist.')
541
699
 
700
+ global_user_state.add_cluster_event(
701
+ cluster_name, status_lib.ClusterStatus.STOPPED,
702
+ 'Cluster was stopped by user.',
703
+ global_user_state.ClusterEventType.STATUS_CHANGE)
704
+
542
705
  backend = backend_utils.get_backend_from_handle(handle)
543
706
 
544
707
  if isinstance(backend, backends.CloudVmRayBackend):
@@ -566,6 +729,8 @@ def stop(cluster_name: str, purge: bool = False) -> None:
566
729
  def autostop(
567
730
  cluster_name: str,
568
731
  idle_minutes: int,
732
+ wait_for: Optional[autostop_lib.AutostopWaitFor] = autostop_lib.
733
+ DEFAULT_AUTOSTOP_WAIT_FOR,
569
734
  down: bool = False, # pylint: disable=redefined-outer-name
570
735
  ) -> None:
571
736
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
@@ -627,29 +792,26 @@ def autostop(
627
792
  )
628
793
  backend = backend_utils.get_backend_from_handle(handle)
629
794
 
795
+ resources = handle.launched_resources.assert_launchable()
630
796
  # Check cloud supports stopping spot instances
631
- cloud = handle.launched_resources.cloud
632
- assert cloud is not None, handle
797
+ cloud = resources.cloud
633
798
 
634
799
  if not isinstance(backend, backends.CloudVmRayBackend):
635
800
  raise exceptions.NotSupportedError(
636
801
  f'{operation} cluster {cluster_name!r} with backend '
637
802
  f'{backend.__class__.__name__!r} is not supported.')
638
- cloud = handle.launched_resources.cloud
803
+
639
804
  # Check if autostop/autodown is required and supported
640
805
  if not is_cancel:
641
806
  try:
642
807
  if down:
643
808
  cloud.check_features_are_supported(
644
- handle.launched_resources,
645
- {clouds.CloudImplementationFeatures.AUTODOWN})
809
+ resources, {clouds.CloudImplementationFeatures.AUTODOWN})
646
810
  else:
647
811
  cloud.check_features_are_supported(
648
- handle.launched_resources,
649
- {clouds.CloudImplementationFeatures.STOP})
812
+ resources, {clouds.CloudImplementationFeatures.STOP})
650
813
  cloud.check_features_are_supported(
651
- handle.launched_resources,
652
- {clouds.CloudImplementationFeatures.AUTOSTOP})
814
+ resources, {clouds.CloudImplementationFeatures.AUTOSTOP})
653
815
  except exceptions.NotSupportedError as e:
654
816
  raise exceptions.NotSupportedError(
655
817
  f'{colorama.Fore.YELLOW}{operation} on cluster '
@@ -658,7 +820,7 @@ def autostop(
658
820
  f'see reason above.') from e
659
821
 
660
822
  usage_lib.record_cluster_name_for_current_operation(cluster_name)
661
- backend.set_autostop(handle, idle_minutes, down)
823
+ backend.set_autostop(handle, idle_minutes, wait_for, down)
662
824
 
663
825
 
664
826
  # ==================
@@ -669,7 +831,7 @@ def autostop(
669
831
  @usage_lib.entrypoint
670
832
  def queue(cluster_name: str,
671
833
  skip_finished: bool = False,
672
- all_users: bool = False) -> List[dict]:
834
+ all_users: bool = False) -> List[responses.ClusterJobRecord]:
673
835
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
674
836
  """Gets the job queue of a cluster.
675
837
 
@@ -703,10 +865,10 @@ def queue(cluster_name: str,
703
865
  exceptions.CommandError: if failed to get the job queue with ssh.
704
866
  """
705
867
  all_jobs = not skip_finished
706
- user_hash: Optional[str] = common_utils.get_user_hash()
707
868
  if all_users:
708
869
  user_hash = None
709
- code = job_lib.JobLibCodeGen.get_job_queue(user_hash, all_jobs)
870
+ else:
871
+ user_hash = common_utils.get_current_user().id
710
872
 
711
873
  handle = backend_utils.check_cluster_available(
712
874
  cluster_name,
@@ -714,18 +876,49 @@ def queue(cluster_name: str,
714
876
  )
715
877
  backend = backend_utils.get_backend_from_handle(handle)
716
878
 
717
- returncode, jobs_payload, stderr = backend.run_on_head(handle,
718
- code,
719
- require_outputs=True,
720
- separate_stderr=True)
721
- subprocess_utils.handle_returncode(
722
- returncode,
723
- command=code,
724
- error_msg=f'Failed to get job queue on cluster {cluster_name}.',
725
- stderr=f'{jobs_payload + stderr}',
726
- stream_logs=True)
727
- jobs = job_lib.load_job_queue(jobs_payload)
728
- return jobs
879
+ use_legacy = not handle.is_grpc_enabled_with_flag
880
+
881
+ if not use_legacy:
882
+ try:
883
+ request = jobsv1_pb2.GetJobQueueRequest(user_hash=user_hash,
884
+ all_jobs=all_jobs)
885
+ response = backend_utils.invoke_skylet_with_retries(
886
+ lambda: cloud_vm_ray_backend.SkyletClient(
887
+ handle.get_grpc_channel()).get_job_queue(request))
888
+ jobs = []
889
+ for job_info in response.jobs:
890
+ job_dict = {
891
+ 'job_id': job_info.job_id,
892
+ 'job_name': job_info.job_name,
893
+ 'submitted_at': job_info.submitted_at,
894
+ 'status': job_lib.JobStatus.from_protobuf(job_info.status),
895
+ 'run_timestamp': job_info.run_timestamp,
896
+ 'start_at': job_info.start_at
897
+ if job_info.HasField('start_at') else None,
898
+ 'end_at': job_info.end_at
899
+ if job_info.HasField('end_at') else None,
900
+ 'resources': job_info.resources,
901
+ 'log_path': job_info.log_path,
902
+ 'user_hash': job_info.username,
903
+ }
904
+ # Copied from job_lib.load_job_queue.
905
+ user = global_user_state.get_user(job_dict['user_hash'])
906
+ job_dict['username'] = user.name if user is not None else None
907
+ jobs.append(job_dict)
908
+ except exceptions.SkyletMethodNotImplementedError:
909
+ use_legacy = True
910
+ if use_legacy:
911
+ code = job_lib.JobLibCodeGen.get_job_queue(user_hash, all_jobs)
912
+ returncode, jobs_payload, stderr = backend.run_on_head(
913
+ handle, code, require_outputs=True, separate_stderr=True)
914
+ subprocess_utils.handle_returncode(
915
+ returncode,
916
+ command=code,
917
+ error_msg=f'Failed to get job queue on cluster {cluster_name}.',
918
+ stderr=f'{jobs_payload + stderr}',
919
+ stream_logs=True)
920
+ jobs = job_lib.load_job_queue(jobs_payload)
921
+ return [responses.ClusterJobRecord.model_validate(job) for job in jobs]
729
922
 
730
923
 
731
924
  @usage_lib.entrypoint
@@ -795,8 +988,10 @@ def cancel(
795
988
  f'handle for cluster {cluster_name!r} should not be None')
796
989
 
797
990
  backend = backend_utils.get_backend_from_handle(handle)
991
+ user_hash: Optional[str] = common_utils.get_current_user().id
798
992
 
799
993
  if all_users:
994
+ user_hash = None
800
995
  sky_logging.print(
801
996
  f'{colorama.Fore.YELLOW}'
802
997
  f'Cancelling all users\' jobs on cluster {cluster_name!r}...'
@@ -821,7 +1016,7 @@ def cancel(
821
1016
  backend.cancel_jobs(handle,
822
1017
  job_ids,
823
1018
  cancel_all=all or all_users,
824
- user_hash=common_utils.get_user_hash())
1019
+ user_hash=user_hash)
825
1020
 
826
1021
 
827
1022
  @usage_lib.entrypoint
@@ -859,7 +1054,12 @@ def tail_logs(cluster_name: str,
859
1054
  backend = backend_utils.get_backend_from_handle(handle)
860
1055
 
861
1056
  usage_lib.record_cluster_name_for_current_operation(cluster_name)
862
- return backend.tail_logs(handle, job_id, follow=follow, tail=tail)
1057
+ # Although tail_logs returns an int when require_outputs=False (default),
1058
+ # we need to check returnval as an int to avoid type checking errors.
1059
+ returnval = backend.tail_logs(handle, job_id, follow=follow, tail=tail)
1060
+ assert isinstance(returnval,
1061
+ int), (f'returnval must be an int, but got {returnval}')
1062
+ return returnval
863
1063
 
864
1064
 
865
1065
  @usage_lib.entrypoint
@@ -958,25 +1158,25 @@ def job_status(cluster_name: str,
958
1158
  # = Storage Management =
959
1159
  # ======================
960
1160
  @usage_lib.entrypoint
961
- def storage_ls() -> List[Dict[str, Any]]:
1161
+ def storage_ls() -> List[responses.StorageRecord]:
962
1162
  # NOTE(dev): Keep the docstring consistent between the Python API and CLI.
963
1163
  """Gets the storages.
964
1164
 
965
1165
  Returns:
966
- [
967
- {
968
- 'name': str,
969
- 'launched_at': int timestamp of creation,
970
- 'store': List[sky.StoreType],
971
- 'last_use': int timestamp of last use,
972
- 'status': sky.StorageStatus,
973
- }
974
- ]
1166
+ List[responses.StorageRecord]: A list of storage records.
975
1167
  """
976
1168
  storages = global_user_state.get_storage()
1169
+ storage_records = []
977
1170
  for storage in storages:
978
- storage['store'] = list(storage.pop('handle').sky_stores.keys())
979
- return storages
1171
+ storage_records.append(
1172
+ responses.StorageRecord(
1173
+ name=storage['name'],
1174
+ launched_at=storage['launched_at'],
1175
+ store=list(storage.pop('handle').sky_stores.keys()),
1176
+ last_use=storage['last_use'],
1177
+ status=storage['status'],
1178
+ ))
1179
+ return storage_records
980
1180
 
981
1181
 
982
1182
  @usage_lib.entrypoint
@@ -992,9 +1192,7 @@ def storage_delete(name: str) -> None:
992
1192
  if handle is None:
993
1193
  raise ValueError(f'Storage name {name!r} not found.')
994
1194
  else:
995
- storage_object = data.Storage(name=handle.storage_name,
996
- source=handle.source,
997
- sync_on_reconstruction=False)
1195
+ storage_object = data.Storage.from_handle(handle)
998
1196
  storage_object.delete()
999
1197
 
1000
1198
 
@@ -1002,20 +1200,49 @@ def storage_delete(name: str) -> None:
1002
1200
  # = Catalog Observe =
1003
1201
  # ===================
1004
1202
  @usage_lib.entrypoint
1005
- def enabled_clouds() -> List[clouds.Cloud]:
1006
- return global_user_state.get_cached_enabled_clouds(
1007
- sky_cloud.CloudCapability.COMPUTE)
1203
+ def enabled_clouds(workspace: Optional[str] = None,
1204
+ expand: bool = False) -> List[str]:
1205
+ if workspace is None:
1206
+ workspace = skypilot_config.get_active_workspace()
1207
+ cached_clouds = global_user_state.get_cached_enabled_clouds(
1208
+ sky_cloud.CloudCapability.COMPUTE, workspace=workspace)
1209
+ with skypilot_config.local_active_workspace_ctx(workspace):
1210
+ if not expand:
1211
+ return [cloud.canonical_name() for cloud in cached_clouds]
1212
+ enabled_ssh_infras = []
1213
+ enabled_k8s_infras = []
1214
+ enabled_cloud_infras = []
1215
+ for cloud in cached_clouds:
1216
+ cloud_infra = cloud.expand_infras()
1217
+ if isinstance(cloud, clouds.SSH):
1218
+ enabled_ssh_infras.extend(cloud_infra)
1219
+ elif isinstance(cloud, clouds.Kubernetes):
1220
+ enabled_k8s_infras.extend(cloud_infra)
1221
+ else:
1222
+ enabled_cloud_infras.extend(cloud_infra)
1223
+ all_infras = sorted(enabled_ssh_infras) + sorted(
1224
+ enabled_k8s_infras) + sorted(enabled_cloud_infras)
1225
+ return all_infras
1008
1226
 
1009
1227
 
1010
1228
  @usage_lib.entrypoint
1011
1229
  def realtime_kubernetes_gpu_availability(
1012
1230
  context: Optional[str] = None,
1013
1231
  name_filter: Optional[str] = None,
1014
- quantity_filter: Optional[int] = None
1232
+ quantity_filter: Optional[int] = None,
1233
+ is_ssh: Optional[bool] = None
1015
1234
  ) -> List[Tuple[str, List[models.RealtimeGpuAvailability]]]:
1016
1235
 
1017
1236
  if context is None:
1018
- context_list = clouds.Kubernetes.existing_allowed_contexts()
1237
+ # Include contexts from both Kubernetes and SSH clouds
1238
+ kubernetes_contexts = clouds.Kubernetes.existing_allowed_contexts()
1239
+ ssh_contexts = clouds.SSH.existing_allowed_contexts()
1240
+ if is_ssh is None:
1241
+ context_list = kubernetes_contexts + ssh_contexts
1242
+ elif is_ssh:
1243
+ context_list = ssh_contexts
1244
+ else:
1245
+ context_list = kubernetes_contexts
1019
1246
  else:
1020
1247
  context_list = [context]
1021
1248
 
@@ -1024,9 +1251,9 @@ def realtime_kubernetes_gpu_availability(
1024
1251
  name_filter: Optional[str] = None,
1025
1252
  quantity_filter: Optional[int] = None
1026
1253
  ) -> List[models.RealtimeGpuAvailability]:
1027
- counts, capacity, available = service_catalog.list_accelerator_realtime(
1254
+ counts, capacity, available = catalog.list_accelerator_realtime(
1028
1255
  gpus_only=True,
1029
- clouds='kubernetes',
1256
+ clouds='ssh' if is_ssh else 'kubernetes',
1030
1257
  name_filter=name_filter,
1031
1258
  region_filter=context,
1032
1259
  quantity_filter=quantity_filter,
@@ -1058,16 +1285,19 @@ def realtime_kubernetes_gpu_availability(
1058
1285
  name_filter=name_filter,
1059
1286
  quantity_filter=quantity_filter), context_list)
1060
1287
 
1288
+ cloud_identity = 'ssh' if is_ssh else 'kubernetes'
1289
+ cloud_identity_capital = 'SSH' if is_ssh else 'Kubernetes'
1290
+
1061
1291
  for ctx, queried in zip(context_list, parallel_queried):
1062
1292
  cumulative_count += len(queried)
1063
1293
  if len(queried) == 0:
1064
1294
  # don't add gpu results for clusters that don't have any
1065
- logger.debug(f'No gpus found in k8s cluster {ctx}')
1295
+ logger.debug(f'No gpus found in {cloud_identity} cluster {ctx}')
1066
1296
  continue
1067
1297
  availability_lists.append((ctx, queried))
1068
1298
 
1069
1299
  if cumulative_count == 0:
1070
- err_msg = 'No GPUs found in any Kubernetes clusters. '
1300
+ err_msg = f'No GPUs found in any {cloud_identity_capital} clusters. '
1071
1301
  debug_msg = 'To further debug, run: sky check '
1072
1302
  if name_filter is not None:
1073
1303
  gpu_info_msg = f' {name_filter!r}'
@@ -1075,9 +1305,9 @@ def realtime_kubernetes_gpu_availability(
1075
1305
  gpu_info_msg += (' with requested quantity'
1076
1306
  f' {quantity_filter}')
1077
1307
  err_msg = (f'Resources{gpu_info_msg} not found '
1078
- 'in Kubernetes clusters. ')
1079
- debug_msg = ('To show available accelerators on kubernetes,'
1080
- ' run: sky show-gpus --cloud kubernetes ')
1308
+ f'in {cloud_identity_capital} clusters. ')
1309
+ debug_msg = (f'To show available accelerators on {cloud_identity}, '
1310
+ f' run: sky show-gpus --cloud {cloud_identity} ')
1081
1311
  full_err_msg = (err_msg + kubernetes_constants.NO_GPU_HELP_MESSAGE +
1082
1312
  debug_msg)
1083
1313
  raise ValueError(full_err_msg)
@@ -1089,89 +1319,61 @@ def realtime_kubernetes_gpu_availability(
1089
1319
  # =================
1090
1320
  @usage_lib.entrypoint
1091
1321
  def local_up(gpus: bool,
1092
- ips: Optional[List[str]],
1093
- ssh_user: Optional[str],
1094
- ssh_key: Optional[str],
1095
- cleanup: bool,
1096
- context_name: Optional[str] = None,
1097
- password: Optional[str] = None) -> None:
1098
- """Creates a local or remote cluster."""
1099
-
1100
- def _validate_args(ips, ssh_user, ssh_key, cleanup):
1101
- # If any of --ips, --ssh-user, or --ssh-key-path is specified,
1102
- # all must be specified
1103
- if bool(ips) or bool(ssh_user) or bool(ssh_key):
1104
- if not (ips and ssh_user and ssh_key):
1105
- with ux_utils.print_exception_no_traceback():
1106
- raise ValueError(
1107
- 'All ips, ssh_user, and ssh_key must be specified '
1108
- 'together.')
1109
-
1110
- # --cleanup can only be used if --ips, --ssh-user and --ssh-key-path
1111
- # are all provided
1112
- if cleanup and not (ips and ssh_user and ssh_key):
1113
- with ux_utils.print_exception_no_traceback():
1114
- raise ValueError(
1115
- 'cleanup can only be used with ips, ssh_user and ssh_key.')
1116
-
1117
- _validate_args(ips, ssh_user, ssh_key, cleanup)
1118
-
1119
- # If remote deployment arguments are specified, run remote up script
1120
- if ips:
1121
- assert ssh_user is not None and ssh_key is not None
1122
- kubernetes_deploy_utils.deploy_remote_cluster(ips, ssh_user, ssh_key,
1123
- cleanup, context_name,
1124
- password)
1125
- else:
1126
- # Run local deployment (kind) if no remote args are specified
1127
- kubernetes_deploy_utils.deploy_local_cluster(gpus)
1322
+ name: Optional[str] = None,
1323
+ port_start: Optional[int] = None) -> None:
1324
+ """Creates a local cluster."""
1325
+ kubernetes_deploy_utils.deploy_local_cluster(name, port_start, gpus)
1128
1326
 
1129
1327
 
1130
- def local_down() -> None:
1328
+ def local_down(name: Optional[str] = None) -> None:
1131
1329
  """Tears down the Kubernetes cluster started by local_up."""
1132
- cluster_removed = False
1330
+ kubernetes_deploy_utils.teardown_local_cluster(name)
1133
1331
 
1134
- path_to_package = os.path.dirname(__file__)
1135
- down_script_path = os.path.join(path_to_package, 'utils/kubernetes',
1136
- 'delete_cluster.sh')
1137
1332
 
1138
- cwd = os.path.dirname(os.path.abspath(down_script_path))
1139
- run_command = shlex.split(down_script_path)
1333
+ @usage_lib.entrypoint
1334
+ def ssh_up(infra: Optional[str] = None, cleanup: bool = False) -> None:
1335
+ """Deploys or tears down a Kubernetes cluster on SSH targets.
1140
1336
 
1141
- # Setup logging paths
1142
- run_timestamp = sky_logging.get_run_timestamp()
1143
- log_path = os.path.join(constants.SKY_LOGS_DIRECTORY, run_timestamp,
1144
- 'local_down.log')
1337
+ Args:
1338
+ infra: Name of the cluster configuration in ssh_node_pools.yaml.
1339
+ If None, the first cluster in the file is used.
1340
+ cleanup: If True, clean up the cluster instead of deploying.
1341
+ """
1342
+ kubernetes_deploy_utils.deploy_ssh_cluster(
1343
+ cleanup=cleanup,
1344
+ infra=infra,
1345
+ )
1145
1346
 
1146
- with rich_utils.safe_status(
1147
- ux_utils.spinner_message('Removing local cluster',
1148
- log_path=log_path,
1149
- is_local=True)):
1150
-
1151
- returncode, stdout, stderr = log_lib.run_with_log(cmd=run_command,
1152
- log_path=log_path,
1153
- require_outputs=True,
1154
- stream_logs=False,
1155
- cwd=cwd)
1156
- stderr = stderr.replace('No kind clusters found.\n', '')
1157
-
1158
- if returncode == 0:
1159
- cluster_removed = True
1160
- elif returncode == 100:
1161
- logger.info(ux_utils.error_message('Local cluster does not exist.'))
1162
- else:
1163
- with ux_utils.print_exception_no_traceback():
1164
- raise RuntimeError('Failed to create local cluster. '
1165
- f'Stdout: {stdout}'
1166
- f'\nError: {stderr}')
1167
- if cluster_removed:
1168
- # Run sky check
1169
- with rich_utils.safe_status(
1170
- ux_utils.spinner_message('Running sky check...')):
1171
- sky_check.check_capability(sky_cloud.CloudCapability.COMPUTE,
1172
- clouds=['kubernetes'],
1173
- quiet=True)
1174
- logger.info(
1175
- ux_utils.finishing_message('Local cluster removed.',
1176
- log_path=log_path,
1177
- is_local=True))
1347
+
1348
+ @usage_lib.entrypoint
1349
+ def ssh_status(context_name: str) -> Tuple[bool, str]:
1350
+ """Check the status of an SSH Node Pool context.
1351
+
1352
+ Args:
1353
+ context_name: The SSH context name (e.g., 'ssh-my-cluster')
1354
+
1355
+ Returns:
1356
+ Tuple[bool, str]: (is_ready, reason)
1357
+ - is_ready: True if the SSH Node Pool is ready, False otherwise
1358
+ - reason: Explanation of the status
1359
+ """
1360
+ try:
1361
+ is_ready, reason = clouds.SSH.check_single_context(context_name)
1362
+ return is_ready, reason
1363
+ except Exception as e: # pylint: disable=broad-except
1364
+ return False, ('Failed to check SSH context: '
1365
+ f'{common_utils.format_exception(e)}')
1366
+
1367
+
1368
+ def get_all_contexts() -> List[str]:
1369
+ """Get all available contexts from Kubernetes and SSH clouds.
1370
+
1371
+ Returns:
1372
+ List[str]: A list of all available context names.
1373
+ """
1374
+ kube_contexts = clouds.Kubernetes.existing_allowed_contexts()
1375
+ ssh_contexts = clouds.SSH.get_ssh_node_pool_contexts()
1376
+ # Ensure ssh_contexts are prefixed appropriately if not already
1377
+ # For now, assuming get_ssh_node_pool_contexts already returns them
1378
+ # in the desired format (e.g., 'ssh-my-cluster')
1379
+ return sorted(list(set(kube_contexts + ssh_contexts)))